In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import pickle

In [4]:
df = pd.read_csv('ai4i2020.csv')

In [5]:
df.info

<bound method DataFrame.info of         UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0         1     M14860    M                298.1                    308.6   
1         2     L47181    L                298.2                    308.7   
2         3     L47182    L                298.1                    308.5   
3         4     L47183    L                298.2                    308.6   
4         5     L47184    L                298.2                    308.7   
...     ...        ...  ...                  ...                      ...   
9995   9996     M24855    M                298.8                    308.4   
9996   9997     H39410    H                298.9                    308.4   
9997   9998     M24857    M                299.0                    308.6   
9998   9999     H39412    H                299.0                    308.7   
9999  10000     M24859    M                299.0                    308.7   

      Rotational speed [rpm]  Torque [Nm]  

In [8]:
df.isna().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [12]:
df = df.drop(columns=['UDI','Product ID','Type','Machine failure'])

In [13]:
df

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,0,0,0,0
9996,298.9,308.4,1632,31.8,17,0,0,0,0,0
9997,299.0,308.6,1645,33.4,22,0,0,0,0,0
9998,299.0,308.7,1408,48.5,25,0,0,0,0,0


In [14]:
df = df.rename(columns={"Air temperature [K]":'Air_temp','Process temperature [K]':'process_temp','Rotational speed [rpm]':'Rotational_speed','Torque [Nm]':'Torque','Tool wear [min]':'Tool_wear'})

In [15]:
df

Unnamed: 0,Air_temp,process_temp,Rotational_speed,Torque,Tool_wear,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,0,0,0,0
9996,298.9,308.4,1632,31.8,17,0,0,0,0,0
9997,299.0,308.6,1645,33.4,22,0,0,0,0,0
9998,299.0,308.7,1408,48.5,25,0,0,0,0,0


In [17]:
import statsmodels.formula.api as smf
l = smf.ols(formula = 'Air_temp ~ process_temp + Rotational_speed+Torque+Tool_wear+TWF+HDF+PWF+OSF+RNF', data=df).fit()
l.conf_int()
l.summary()

0,1,2,3
Dep. Variable:,Air_temp,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.775
Method:,Least Squares,F-statistic:,3838.0
Date:,"Mon, 27 Dec 2021",Prob (F-statistic):,0.0
Time:,18:59:24,Log-Likelihood:,-13649.0
No. Observations:,10000,AIC:,27320.0
Df Residuals:,9990,BIC:,27390.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-64.1882,1.999,-32.105,0.000,-68.107,-60.269
process_temp,1.1737,0.006,183.291,0.000,1.161,1.186
Rotational_speed,0.0002,0.000,1.556,0.120,-4.85e-05,0.000
Torque,0.0002,0.002,0.098,0.922,-0.004,0.004
Tool_wear,8.399e-05,0.000,0.553,0.580,-0.000,0.000
TWF,0.1184,0.141,0.838,0.402,-0.158,0.395
HDF,1.6952,0.090,18.835,0.000,1.519,1.872
PWF,0.0661,0.108,0.614,0.539,-0.145,0.277
OSF,-0.1129,0.100,-1.128,0.259,-0.309,0.083

0,1,2,3
Omnibus:,648.362,Durbin-Watson:,0.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,241.436
Skew:,-0.091,Prob(JB):,3.74e-53
Kurtosis:,2.261,Cond. No.,334000.0


In [19]:
df.head()

Unnamed: 0,Air_temp,process_temp,Rotational_speed,Torque,Tool_wear,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0


In [20]:
x = df.drop(columns=['Air_temp'])

In [21]:
x

Unnamed: 0,process_temp,Rotational_speed,Torque,Tool_wear,TWF,HDF,PWF,OSF,RNF
0,308.6,1551,42.8,0,0,0,0,0,0
1,308.7,1408,46.3,3,0,0,0,0,0
2,308.5,1498,49.4,5,0,0,0,0,0
3,308.6,1433,39.5,7,0,0,0,0,0
4,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9995,308.4,1604,29.5,14,0,0,0,0,0
9996,308.4,1632,31.8,17,0,0,0,0,0
9997,308.6,1645,33.4,22,0,0,0,0,0
9998,308.7,1408,48.5,25,0,0,0,0,0


In [22]:
y = df['Air_temp']
y

0       298.1
1       298.2
2       298.1
3       298.2
4       298.2
        ...  
9995    298.8
9996    298.9
9997    299.0
9998    299.0
9999    299.0
Name: Air_temp, Length: 10000, dtype: float64

In [23]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [24]:
x_scaled

array([[-0.94735989,  0.06818514,  0.28219976, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.63330802, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-1.01476077, -0.22744984,  0.94428963, ..., -0.09793424,
        -0.09948362, -0.04363046],
       ...,
       [-0.94735989,  0.59251888, -0.66077672, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.85400464, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.2162938 ,  0.02137647, ..., -0.09793424,
        -0.09948362, -0.04363046]])

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

variables = x_scaled
vif = pd.DataFrame()
vif['Vif'] = [variance_inflation_factor(variables, i) for i in range(variables.shape[1])]
vif['Features'] = x.columns
vif

Unnamed: 0,Vif,Features
0,1.004777,process_temp
1,5.154221,Rotational_speed
2,5.222899,Torque
3,1.039904,Tool_wear
4,1.015584,TWF
5,1.024976,HDF
6,1.212152,PWF
7,1.082597,OSF
8,1.002015,RNF


In [28]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled,y, test_size=0.25, random_state=355)

In [29]:
x_train

array([[ 0.33325694,  1.80852691, -1.78432317, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [ 0.26585605, -0.29438606,  0.63330802, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [ 0.40065782,  0.73196934, -1.09213831, ..., -0.09793424,
        -0.09948362, -0.04363046],
       ...,
       [ 1.20946845, -1.13108884,  1.84713945, ..., -0.09793424,
        10.0519061 , -0.04363046],
       [-0.34075192,  1.05549441, -1.40311991, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [ 0.80506314, -0.58444302,  0.56308637, ..., -0.09793424,
        -0.09948362, -0.04363046]])

In [30]:
log_r = LinearRegression()
log_r.fit(x_train,y_train)

LinearRegression()

In [31]:
log_r.score(x_train,y_train)

0.7806812508144696

In [32]:
def adj_r2(x,y):
    r2 = log_r.score(x,y)
    n=x.shape[0]
    p=x.shape[1]
    adjusted_r2=1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [33]:
adj_r2(x_train,y_train)

0.7804177169369436

In [34]:
pickle.dump(log_r,open('UCI_linreg_model.pickle','wb'))

In [35]:
df.head()

Unnamed: 0,Air_temp,process_temp,Rotational_speed,Torque,Tool_wear,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0


In [37]:
log_r.predict([[308.6,1551,42.8,0,0,0,0,0,0]])

array([911.54878066])

In [38]:
test1 = scaler.transform([[308.6,1551,42.8,0,0,0,0,0,0]])
test2 = scaler.transform([[308.7,1408,46.3,3,0,0,0,0,0]])
test3 = scaler.transform([[308.5,1498,49.4,5,0,0,0,0,0]])
test4 = scaler.transform([[308.6,1433,39.5,7,0,0,0,0,0]])

In [39]:
log_r.predict(test3)

array([298.20931646])