In [2]:
import numpy as np
import pandas as pd
from scipy import stats

In [3]:
    # Silence every warning category for the rest of the session.
    # NOTE(review): this also hides pandas / scikit-learn diagnostics
    # (e.g. chained-assignment or convergence warnings) — confirm that is intended.
    import warnings
    warnings.filterwarnings('ignore')

In [7]:
# Load the frame pickled at ./data_stitch_2.pkl.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_stitch_2 = pd.read_pickle("./data_stitch_2.pkl")

In [8]:
# Load the frame pickled at ./data_stitch_1.pkl.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_stitch_1 = pd.read_pickle("./data_stitch_1.pkl")

In [9]:
df_stitch_2.shape

(14449, 6)

In [10]:
df_stitch_1.shape

(1789, 6)

In [11]:
df_stitch_2.head(2)

Unnamed: 0,t,Um,Ur,Ud,temperature,C_target
0,108,0.70172,0.70287,0.77656,10,0.0
1,113,0.70175,0.70305,0.77649,10,0.0


In [12]:
df_stitch_2[['Um','Ur','Ud']].corr()

Unnamed: 0,Um,Ur,Ud
Um,1.0,0.999773,0.999752
Ur,0.999773,1.0,0.999222
Ud,0.999752,0.999222,1.0


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [14]:
def set_features(dataframe):
    """Build the cubic polynomial design matrix from the ``Ur`` and ``Ud`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'Ur'`` and ``'Ud'``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``dataframe``, with the columns
        ``Ur, Ud, Ur^2, Ur^3, Ud^2, Ud^3`` in exactly that order (callers
        multiply it positionally with a coefficient vector).

    Raises
    ------
    KeyError
        If ``'Ur'`` or ``'Ud'`` is missing from ``dataframe``.
    """
    # Take an explicit copy: adding columns to a bare column selection is a
    # chained assignment (SettingWithCopyWarning) whose write-through
    # behaviour is not guaranteed.
    features = dataframe[['Ur', 'Ud']].copy()
    # Outer loop over the base column keeps the Ur powers ahead of the Ud powers.
    for column in ('Ur', 'Ud'):
        for degree in (2, 3):
            features[f'{column}^{degree}'] = features[column].pow(degree)
    return features

In [15]:
# Polynomial Ur/Ud design matrix for df_stitch_2; head(1) is only a quick
# look at the resulting column layout.
stitch_2_features = set_features(df_stitch_2)
stitch_2_features.head(1)

Unnamed: 0,Ur,Ud,Ur^2,Ur^3,Ud^2,Ud^3
0,0.70287,0.77656,0.494026,0.347236,0.603045,0.468301


In [16]:
# Regression target for the Ur/Ud fit: the Um column of the same frame.
stitch_2_target = df_stitch_2['Um']

In [17]:
# Unregularised baseline: ordinary least squares of Um on the polynomial
# Ur/Ud features. The R2 printed here is computed on the same rows the model
# was fitted on, so it is a training score.
lin_reg = LinearRegression().fit(stitch_2_features, stitch_2_target)
print(f'coefs = {lin_reg.coef_}')
print(f'bias = {lin_reg.intercept_}')
print(f'R2 score = {lin_reg.score(stitch_2_features, stitch_2_target)}')

coefs = [ 1.52741178e-01 -1.19291251e+02 -8.68007242e-02  3.14086141e-01
  1.82826412e+02 -9.02884032e+01]
bias = 25.19465914959857
R2 score = 0.9999796603701566


In [18]:
from sklearn.linear_model import Lasso

# Coarse search for the Lasso penalty: try alpha = 1e-1, 1e-2, ... and keep
# the last alpha for which BOTH conditions hold:
#   * the training R2 did not drop compared with the previous (larger) alpha,
#   * the fitted intercept lies strictly between 0 and 1.
# NOTE(review): the R2 used here is measured on the fitting data, not on a
# validation split — confirm this selection criterion is intended.
MAX_ALPHA_EXPONENT = 15  # hard stop so the search always terminates

# Fallback when even alpha = 1e-1 is rejected.
optimal_alpha = 1.0
prev_R2 = 0.
for exponent in range(1, MAX_ALPHA_EXPONENT + 1):
    # Compute each candidate directly as a power of ten; repeatedly
    # multiplying by 0.1 accumulates rounding error (1.0000000000000006e-06).
    alpha = 10.0 ** -exponent
    lasso_reg_calibrate = Lasso(alpha=alpha).fit(stitch_2_features, stitch_2_target)
    bias = lasso_reg_calibrate.intercept_
    R2_score = lasso_reg_calibrate.score(stitch_2_features, stitch_2_target)
    if not ((R2_score >= prev_R2) and (bias < 1) and (bias > 0)):
        # This candidate broke a condition: keep the previously accepted alpha.
        break
    optimal_alpha = alpha
    prev_R2 = R2_score

print(f'optimal alpha = {optimal_alpha}')

optimal alpha = 1.0000000000000006e-06


In [19]:
# Fit Lasso with optimal_alpha on the df_stitch_2 features. As with the OLS
# baseline, the R2 printed here is measured on the fitting data.
lasso_reg = Lasso(alpha=optimal_alpha).fit(stitch_2_features, stitch_2_target)
print(f'coefs = {lasso_reg.coef_}')
print(f'bias = {lasso_reg.intercept_}')
print(f'R2 score = {lasso_reg.score(stitch_2_features, stitch_2_target)}')

coefs = [ 0.76226938  0.         -0.00940744 -0.0342366   0.          0.        ]
bias = 0.18325837139463358
R2 score = 0.9998399724747122


In [20]:
# Same polynomial feature construction, applied to df_stitch_1 for scoring.
stitch_1_features = set_features(df_stitch_1)

In [21]:
# Score the Lasso model (fitted on df_stitch_2) against df_stitch_1.
# NOTE(review): the first rows of df_stitch_1 displayed later in this notebook
# carry the same Ur values as the first rows of df_stitch_2 — confirm the two
# frames are disjoint before treating this as a held-out score.
df_stitch_1_target = df_stitch_1['Um']
print(f'R2 score on test = {lasso_reg.score(stitch_1_features, df_stitch_1_target)}')

R2 score on test = 0.9998498222172715


In [22]:
# Coefficients of the Lasso fit; their order matches the column order
# produced by set_features.
correction_vector = lasso_reg.coef_


def Um_correction(dataframe, coefficients=None):
    """Subtract the fitted Ur/Ud contribution from ``Um``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``'Um'`` plus the columns required by ``set_features``.
    coefficients : array-like, optional
        One weight per column returned by ``set_features``, in the same
        order. Defaults to the module-level ``correction_vector``, looked up
        at call time.

    Returns
    -------
    pandas.Series
        ``dataframe['Um']`` minus the weighted sum of the features, indexed
        like ``dataframe``. Only the coefficients are used; the model
        intercept is not subtracted.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of features.
    """
    if coefficients is None:
        coefficients = correction_vector
    features = set_features(dataframe)
    weights = np.asarray(coefficients, dtype=float).ravel()
    if weights.shape[0] != features.shape[1]:
        raise ValueError(
            f'expected {features.shape[1]} coefficients, got {weights.shape[0]}'
        )
    # (n, k) @ (k,) gives the per-row contribution as a flat array directly.
    return dataframe['Um'] - features.to_numpy() @ weights

In [23]:
# Side-by-side view of Um before and after the correction, on df_stitch_2.
outlook = pd.DataFrame({'Um original': df_stitch_2['Um'],'Um_corrected': Um_correction(df_stitch_2)},
                       columns = ['Um original','Um_corrected'])
outlook

Unnamed: 0,Um original,Um_corrected
0,0.70172,0.182479
1,0.70175,0.182384
2,0.70177,0.182446
3,0.70173,0.182196
4,0.70214,0.182746
...,...,...
14444,0.70094,0.182817
14445,0.70106,0.182846
14446,0.70090,0.182735
14447,0.70130,0.183177


In [24]:
outlook.describe()

Unnamed: 0,Um original,Um_corrected
count,14449.0,14449.0
mean,0.655691,0.183258
std,0.066761,0.000845
min,0.50123,0.180434
25%,0.62165,0.182842
50%,0.67885,0.183121
75%,0.69626,0.183821
max,0.83903,0.187534


In [25]:
# Load the two gas frames from their pickles.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_gases_1 = pd.read_pickle("./data_gases_1_filtered.pkl")
df_gases_2 = pd.read_pickle("./data_gases_2_filtered.pkl")

In [26]:
# Row counts of the two gas frames.
print(f'df_gases_1 size = {len(df_gases_1)}')
print(f'df_gases_2 size = {len(df_gases_2)}')

df_gases_1 size = 1012
df_gases_2 size = 2036


In [27]:
df_gases_1.head(1)

Unnamed: 0,t,C,Um,Ur,Ud,percentage,C_target
0,1592,84.065575,0.70109,0.70087,0.77666,0.0,0.0


In [28]:
df_gases_2.columns

Index(['t', 'C', 'Um', 'Ur', 'Ud', 'percentage', 'C_target'], dtype='object')

In [29]:
# Hold out 40% of df_gases_2 as the test set; all of df_gases_1 plus the
# remaining 60% of df_gases_2 form the training set.
df_gases_2_train, gases_test = train_test_split(df_gases_2, test_size=0.4, random_state=42)
# ignore_index=True: each input keeps its own row labels, so a plain concat
# can produce repeated labels and make later label-based lookups ambiguous.
gases_train = pd.concat([df_gases_1, df_gases_2_train], ignore_index=True)
gases_train.shape

(2233, 7)

In [30]:
def create_nonelinear_features(feature, name: str):
    """Expand one signal into four regression features.

    Parameters
    ----------
    feature : array-like or pandas.Series
        The base values; a Series keeps its index in the result.
    name : str
        Column name for the base values; the derived columns are named
        ``{name}^2``, ``{name}^3`` and ``ln({name})``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the base values, their square, their cube and
        their natural logarithm.
    """
    base = pd.DataFrame({name: feature})
    values = base[name]
    derived = {
        f'{name}^2': values ** 2,
        f'{name}^3': values ** 3,
        f'ln({name})': np.log(values),
    }
    # assign appends the keyword columns in the order given.
    return base.assign(**derived)

In [31]:
# Corrected Um for the training rows, kept as a Series so the feature frame
# can be built from it.
Um_corr = Um_correction(gases_train)
gas_features = create_nonelinear_features(Um_corr, 'Um_corr')
# From here on Um_corr is rebound to an (n, 1) ndarray: the single-column
# input used for the plain linear model.
Um_corr = Um_corr.to_numpy().reshape(-1, 1)
gas_features.columns

Index(['Um_corr', 'Um_corr^2', 'Um_corr^3', 'ln(Um_corr)'], dtype='object')

In [32]:
# Regression target for the gas models, taken from the training rows.
C_target = gases_train['C_target']

In [33]:
# Two unregularised fits of C_target on the training rows: one on the
# corrected Um alone, one on the four non-linear features derived from it.
# Both R2 values printed here are training scores.
lin_reg_Um = LinearRegression().fit(Um_corr, C_target)
print('linear model on Um_corr only')
print(f'coefs = {lin_reg_Um.coef_}')
print(f'R2 score = {lin_reg_Um.score(Um_corr, C_target)}')
print('\n')
lin_reg_features = LinearRegression().fit(gas_features, C_target)
print('Model on nonelinear Um_corr features')
print(f'coefs = {lin_reg_features.coef_}')
print(f'R2 score = {lin_reg_features.score(gas_features, C_target)}')


linear model on Um_corr only
coefs = [-1428958.1310235]
R2 score = 0.9594593385344147


Model on nonelinear Um_corr features
coefs = [ 2.63469379e+09 -7.61613088e+09  9.81190461e+09 -1.52527366e+08]
R2 score = 0.9984688428218556


In [34]:
from sklearn.linear_model import LassoCV
# Candidate penalties 1, 0.1, ..., 1e-6; LassoCV picks one by cross-validation
# on the training rows.
# NOTE(review): the features are not standardised before the L1 penalty is
# applied — confirm this is intended.
alphas = np.array([10**(-degree) for degree in range(0,7)])
lasso_reg_features = LassoCV(alphas=alphas, random_state=42).fit(gas_features, C_target)
print(f'selected alpha = {lasso_reg_features.alpha_}')
print(f'coefs = {lasso_reg_features.coef_}')
# Training-set R2 of the refitted model.
print(f'R2 score = {lasso_reg_features.score(gas_features, C_target)}')

selected alpha = 0.001
coefs = [  842322.05212184  6407086.18175488 23365643.21733014 -1103183.73587719]
R2 score = 0.9971029277296064


In [35]:
# Rebuild the same inputs for the held-out rows: the corrected Um as a Series
# for the feature frame, then rebound to an (n, 1) ndarray for the
# single-feature model.
C_target_test = gases_test['C_target']
Um_corr_test = Um_correction(gases_test)
gas_features_test = create_nonelinear_features(Um_corr_test, 'Um_corr')
Um_corr_test = Um_corr_test.to_numpy().reshape(-1, 1)

# Predictions are kept for the comparison table and RMSE computed later.
lin_reg_Um_prediction = lin_reg_Um.predict(Um_corr_test)
print(f'only Um lineal model R2 score on test = {lin_reg_Um.score(Um_corr_test, C_target_test)}')

lasso_reg_features_prediction = lasso_reg_features.predict(gas_features_test)
print(f'nonelineal features model R2 score on test = {lasso_reg_features.score(gas_features_test, C_target_test)}')

only Um lineal model R2 score on test = 0.9611945185442383
nonelineal features model R2 score on test = 0.9980241617938441


In [36]:
gases_test.columns

Index(['t', 'C', 'Um', 'Ur', 'Ud', 'percentage', 'C_target'], dtype='object')

In [37]:
# Per-row comparison on the held-out rows: the C column as stored, both model
# predictions and the target. The prediction arrays are positional, so they
# line up with the gases_test rows that supply the index.
overview_table = pd.DataFrame({'C_original': gases_test['C'], 'C_pred_only_UM': lin_reg_Um_prediction,
                               'C_pred_nonelinear': lasso_reg_features_prediction, 'C_target': C_target_test},
                             columns=['C_original', 'C_pred_only_UM', 'C_pred_nonelinear', 'C_target'])
overview_table.head(20)

Unnamed: 0,C_original,C_pred_only_UM,C_pred_nonelinear,C_target
238,-217.619904,-1909.311191,-126.052751,0.0
128,8.319479,-1268.214012,56.741115,0.0
532,-48.212215,-1488.560134,-7.700781,0.0
629,-219.135361,-1959.890014,-139.864771,0.0
1717,29089.167969,33025.922406,31304.428871,30000.0
1931,50245.789063,43166.709883,48978.986309,50000.0
1707,28909.958984,32911.317746,31127.510518,30000.0
1341,19662.826172,25804.853405,21133.216409,20000.0
1049,2082.421875,3103.269156,1686.210442,2000.0
1456,19809.912109,25953.962937,21323.342036,20000.0


In [38]:
# Mean of C over the df_gases_2 rows whose percentage equals 0.0.
zero_percentage_mask = df_gases_2['percentage'] == 0.0
error_bias = df_gases_2.loc[zero_percentage_mask, 'C'].mean()
error_bias

-110.4721044798729

In [39]:
# error_bias (mean of C over the percentage == 0.0 rows of df_gases_2) is
# already defined by the preceding cell; recomputing the identical value here
# was redundant.
# NOTE(review): df_gases_2 includes the rows that were split off as
# gases_test, so this offset is partly estimated on the evaluation rows.
C_unbiased = gases_test['C'] - error_bias

In [40]:
from sklearn.metrics import mean_squared_error

# RMSE of each estimate against the target on the held-out rows.
# mean_squared_error is documented as (y_true, y_pred); the ground truth is
# passed first so the calls match the API.
y_true = gases_test['C_target']
sensor_original_RMSE = np.sqrt(mean_squared_error(y_true, gases_test['C']))
C_unbiased_RMSE = np.sqrt(mean_squared_error(y_true, C_unbiased))
lin_reg_Um_RMSE = np.sqrt(mean_squared_error(y_true, lin_reg_Um_prediction))
lasso_reg_features_RMSE = np.sqrt(mean_squared_error(y_true, lasso_reg_features_prediction))

print(f'sensor_original_RMSE = {sensor_original_RMSE}')
print(f'C_unbiased_RMSE = {C_unbiased_RMSE}')
print(f'lin_reg_Um_RMSE = {lin_reg_Um_RMSE}')
print(f'lasso_reg_features_RMSE = {lasso_reg_features_RMSE}')

sensor_original_RMSE = 544.6193245422411
C_unbiased_RMSE = 500.9429312760324
lin_reg_Um_RMSE = 3431.2535468634483
lasso_reg_features_RMSE = 774.2513338055619


In [41]:
df_stitch_1.columns

Index(['t', 'Um', 'Ur', 'Ud', 'temperature', 'C_target'], dtype='object')

In [42]:
# Apply the gas model to df_stitch_1: correct Um, expand it into the
# non-linear features, then predict.
Um_corr_stitch = Um_correction(df_stitch_1)
stitch_features = create_nonelinear_features(Um_corr_stitch, 'Um_corr')

lasso_reg_stitch_prediction = lasso_reg_features.predict(stitch_features)
# RMSE against the target; ground truth first, as mean_squared_error documents (y_true, y_pred).
print(np.sqrt(mean_squared_error(df_stitch_1['C_target'], lasso_reg_stitch_prediction)))

579.584670242544


In [43]:
# Predictions on df_stitch_1 next to the target and Ur, so the prediction
# error can be inspected against Ur.
overview_stitch = pd.DataFrame({'C_pred_nonelinear': lasso_reg_stitch_prediction, 'C_target': df_stitch_1['C_target'], 'Ur': df_stitch_1['Ur']},
                             columns=[ 'C_pred_nonelinear', 'C_target', 'Ur'])
overview_stitch

Unnamed: 0,C_pred_nonelinear,C_target,Ur
0,774.281829,0.0,0.70287
1,825.396827,0.0,0.70305
2,792.259139,0.0,0.70299
3,927.496552,0.0,0.70329
4,635.442025,0.0,0.70309
...,...,...,...
1784,1625.943425,0.0,0.42593
1785,1537.537349,0.0,0.42612
1786,1525.762575,0.0,0.42604
1787,1411.577949,0.0,0.42621


In [63]:
overview_stitch.loc[500:510]

Unnamed: 0,C_pred_nonelinear,C_target,Ur
500,9.989452,0.0,0.88737
501,53.258966,0.0,0.88742
502,123.015082,0.0,0.88771
503,107.037567,0.0,0.88782
504,68.360522,0.0,0.88794
505,111.004837,0.0,0.88821
506,3.652203,0.0,0.88831
507,99.199711,0.0,0.88853
508,80.432026,0.0,0.88872
509,15.730561,0.0,0.88873


In [62]:
overview_stitch.loc[1000:1010]

Unnamed: 0,C_pred_nonelinear,C_target,Ur
1000,166.777458,0.0,0.58846
1001,24.418211,0.0,0.58755
1002,31.244989,0.0,0.587
1003,47.690382,0.0,0.58644
1004,104.596239,0.0,0.5859
1005,104.652775,0.0,0.58516
1006,-37.166153,0.0,0.58459
1007,87.707907,0.0,0.58396
1008,63.805587,0.0,0.5831
1009,188.271504,0.0,0.58269


In [64]:
overview_stitch.describe()

Unnamed: 0,C_pred_nonelinear,C_target,Ur
count,1789.0,1789.0,1789.0
mean,164.349149,0.0,0.651778
std,555.950101,0.0,0.176402
min,-999.350352,0.0,0.42588
25%,-88.936692,0.0,0.46009
50%,139.031848,0.0,0.70327
75%,557.841186,0.0,0.83149
max,1800.583522,0.0,0.90182
