# Machine Model Training

In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

%matplotlib inline 

# ML training and tuning 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score




# Loading and saving models
import pickle



In [182]:
df = pd.read_csv('garments_worker_productivity.csv')

### Exploratory Data Analysis

In [183]:
df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [184]:
df.describe()

Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
count,1197.0,1197.0,1197.0,691.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0
mean,6.426901,0.729632,15.062172,1190.465991,4567.460317,38.210526,0.730159,0.369256,0.150376,34.609858,0.735091
std,3.463963,0.097891,10.943219,1837.455001,3348.823563,160.182643,12.709757,3.268987,0.427848,22.197687,0.174488
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,0.233705
25%,3.0,0.7,3.94,774.5,1440.0,0.0,0.0,0.0,0.0,9.0,0.650307
50%,6.0,0.75,15.26,1039.0,3960.0,0.0,0.0,0.0,0.0,34.0,0.773333
75%,9.0,0.8,24.26,1252.5,6960.0,50.0,0.0,0.0,0.0,57.0,0.850253
max,12.0,0.8,54.56,23122.0,25920.0,3600.0,300.0,45.0,2.0,89.0,1.120437


In [185]:
# Selecting categorical columns
df.describe(exclude = 'number')

Unnamed: 0,date,quarter,department,day
count,1197,1197,1197,1197
unique,59,5,3,6
top,3/11/2015,Quarter1,sweing,Wednesday
freq,24,360,691,208


In [186]:
# Check types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

In [187]:
# Find the percentage of missing values
round (df.isna().sum()/df.shape[0], 2)

date                     0.00
quarter                  0.00
department               0.00
day                      0.00
team                     0.00
targeted_productivity    0.00
smv                      0.00
wip                      0.42
over_time                0.00
incentive                0.00
idle_time                0.00
idle_men                 0.00
no_of_style_change       0.00
no_of_workers            0.00
actual_productivity      0.00
dtype: float64

In [188]:
df["wip"].fillna(df["wip"].mean(),inplace=True)
# zero_imputer = SimpleImputer( strategy='constant', fill_value=0)
# df["wip"] = pd.Series(zero_imputer.fit_transform(np.array(df["wip"]).reshape(1, -1)).reshape(-1))

In [189]:
df["quarter"].unique()

array(['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5'],
      dtype=object)

In [190]:
# df["quarter"]=df["quarter"].replace(['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5'],[1,2,3,4,5])

In [191]:
df["department"].unique()

array(['sweing', 'finishing ', 'finishing'], dtype=object)

In [192]:
df["department"]=df["department"].str.strip()
df["department"]=df["department"].replace('sweing','sewing')

In [195]:
df["department"].unique()

array(['sewing', 'finishing'], dtype=object)

In [198]:
# df["date"]=df["date"].str.strip()
df["date"] = pd.to_datetime(df["date"])
df['date']=df['date'].map(dt.datetime.toordinal)


In [199]:
df['date']

0       719163
1       719163
2       719163
3       719163
4       719163
         ...  
1192    719163
1193    719163
1194    719163
1195    719163
1196    719163
Name: date, Length: 1197, dtype: int64

In [200]:
df["day"].unique()

array(['Thursday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype=object)

In [201]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   int64  
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    1197 non-null   float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

### Featue Engineering 

In [202]:
one_hot = OneHotEncoder()

col_names = ["department", "day","quarter"]

one_hot_df = one_hot.fit_transform(df[col_names]).toarray()


In [203]:
one_hot.categories_

[array(['finishing', 'sewing'], dtype=object),
 array(['Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
       dtype=object),
 array(['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5'],
       dtype=object)]

In [168]:
one_hot_df

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [169]:
def col_names(one_hot_model):
        
    column_names = []
    
    col_names = ["department", "day","quarter"]

    for y in range(len(one_hot_model.categories_)):
        for z in range(len(one_hot_model.categories_[y])):
            column_names.append(col_names[y]+"_"+one_hot_model.categories_[y][z])
            
    return column_names

column_names = col_names(one_hot)

In [170]:
oh_df = pd.DataFrame(
    one_hot_df,
    index=df.index,
    columns = column_names
)


print(oh_df.shape)


(1197, 13)


In [171]:
oh_df.head()

Unnamed: 0,department_finishing,department_sewing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,quarter_Quarter1,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [172]:
# df_new = df.merge(oh_df,left_index=True, right_index=True).drop(
#     ["department", "day","quarter"]
#     ,axis=1)
# print(df_new.shape)


(1197, 25)


In [173]:
# df_new

Unnamed: 0,date,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,quarter_Quarter1,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5
0,735599,8,0.80,26.16,1108.000000,7080,98,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,735599,1,0.75,3.94,1190.465991,960,0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,735599,11,0.80,11.41,968.000000,3660,50,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,735599,12,0.80,11.41,968.000000,3660,50,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,735599,6,0.80,25.90,1170.000000,1920,50,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,735668,10,0.75,2.90,1190.465991,960,0,0.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1193,735668,8,0.70,3.90,1190.465991,960,0,0.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1194,735668,7,0.65,3.90,1190.465991,960,0,0.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1195,735668,9,0.75,2.90,1190.465991,1800,0,0.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [174]:
# df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   int64  
 1   team                   1197 non-null   int64  
 2   targeted_productivity  1197 non-null   float64
 3   smv                    1197 non-null   float64
 4   wip                    1197 non-null   float64
 5   over_time              1197 non-null   int64  
 6   incentive              1197 non-null   int64  
 7   idle_time              1197 non-null   float64
 8   idle_men               1197 non-null   int64  
 9   no_of_style_change     1197 non-null   int64  
 10  no_of_workers          1197 non-null   float64
 11  actual_productivity    1197 non-null   float64
 12  department_finishing   1197 non-null   float64
 13  department_sewing      1197 non-null   float64
 14  day_Monday             1197 non-null   float64
 15  day_

In [175]:
# df_new=df_new.astype(float)

In [176]:
# df_new

Unnamed: 0,date,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,quarter_Quarter1,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5
0,735599.0,8.0,0.80,26.16,1108.000000,7080.0,98.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,735599.0,1.0,0.75,3.94,1190.465991,960.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,735599.0,11.0,0.80,11.41,968.000000,3660.0,50.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,735599.0,12.0,0.80,11.41,968.000000,3660.0,50.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,735599.0,6.0,0.80,25.90,1170.000000,1920.0,50.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,735668.0,10.0,0.75,2.90,1190.465991,960.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1193,735668.0,8.0,0.70,3.90,1190.465991,960.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1194,735668.0,7.0,0.65,3.90,1190.465991,960.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1195,735668.0,9.0,0.75,2.90,1190.465991,1800.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [179]:
corr=df_new.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,date,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,department_finishing,department_sewing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,quarter_Quarter1,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5
date,1.0,0.008856,-0.098956,0.000955,-0.023143,-0.25459,0.105767,0.007818,0.076985,0.315056,-0.012219,-0.122575,0.007069,-0.007069,0.013083,-0.079704,-0.003951,-0.068964,0.057801,0.07817,-0.063183,0.043694,-0.073793,0.11594,-0.050825
team,0.008856,1.0,0.030274,-0.110011,-0.025384,-0.096737,-0.007674,0.003796,0.026974,-0.011194,-0.075113,-0.148753,-0.031753,0.031753,0.019472,-0.006533,0.000861,-0.011635,0.001416,-0.00369,-0.018775,-0.01183,0.028769,0.00722,0.000277
targeted_productivity,-0.098956,0.030274,1.0,-0.069489,0.049114,-0.088557,0.032768,-0.056181,-0.053818,-0.209294,-0.084288,0.421594,0.067508,-0.067508,-0.001074,0.017842,0.036051,-0.040747,-0.009276,-0.002558,0.096855,0.015654,-0.039795,-0.078973,-0.02286
smv,0.000955,-0.110011,-0.069489,1.0,-0.018322,0.674887,0.032629,0.056863,0.105901,0.315388,0.912176,-0.122089,-0.87423,0.87423,0.000305,-0.016807,0.004015,0.002882,0.009399,-0.000277,-0.003996,-0.013198,0.036639,-0.00651,-0.018811
wip,-0.023143,-0.025384,0.049114,-0.018322,1.0,0.01449,0.021881,-0.026267,-0.04849,-0.069004,0.009791,0.088365,0.0,-0.0,0.18021,-0.032154,-0.032258,-0.04108,-0.033646,-0.040754,0.10733,-0.063206,-0.032647,-0.020539,-0.00055
over_time,-0.25459,-0.096737,-0.088557,0.674887,0.01449,1.0,-0.004793,0.031038,-0.017913,0.05979,0.734164,-0.054206,-0.677519,0.677519,-0.027983,0.016666,0.002245,0.084735,-0.017185,-0.057001,-0.016956,-0.039564,0.045272,0.043339,-0.049138
incentive,0.105767,-0.007674,0.032768,0.032629,0.021881,-0.004793,1.0,-0.012024,-0.02114,-0.026607,0.049222,0.076538,-0.045782,0.045782,0.173065,-0.031046,-0.039635,-0.033605,-0.032179,-0.036281,-0.044075,0.122725,-0.043119,-0.042757,-0.006135
idle_time,0.007818,0.003796,-0.056181,0.056863,-0.026267,0.031038,-0.012024,1.0,0.559146,-0.011598,0.058049,-0.080851,-0.049181,0.049181,-0.025311,0.078508,-0.022906,-0.023456,-0.022388,0.017463,0.081038,-0.035829,-0.01994,-0.028082,-0.011227
idle_men,0.076985,0.026974,-0.053818,0.105901,-0.04849,-0.017913,-0.02114,0.559146,1.0,0.133632,0.106946,-0.181734,-0.096701,0.096701,-0.043594,0.009119,0.010247,0.001042,0.014213,0.008903,0.044081,-0.070447,0.072232,-0.029381,-0.022075
no_of_style_change,0.315056,-0.011194,-0.209294,0.315388,-0.069004,0.05979,-0.026607,-0.011598,0.133632,1.0,0.327787,-0.207366,-0.300889,0.300889,-0.004852,-0.011408,-0.033972,0.021382,0.009273,0.019187,-0.188003,-0.01469,0.074066,0.191381,-0.068688


In [None]:
corr=df.corr()
corr.style.background_gradient(cmap='coolwarm')

### Model Training

In [180]:
# split to X and y
# X=df_new.drop("actual_productivity",axis=1)
# y=df_new["actual_productivity"]

In [None]:
X=df.drop("actual_productivity",axis=1)
y=df["actual_productivity"]

In [129]:
# sc=StandardScaler()
# X=sc.fit_transform(X)
# y=sc.transform(y)

In [130]:
# split to X_train, X_test, y_train, y_test
X_train, X_test, y_train,y_test = train_test_split(
    X,y,
    test_size=0.2,
    train_size = 0.8,
    random_state=42
)

In [131]:
lin_reg=LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression()

In [132]:
y_preds=lin_reg.predict(X_test)


In [133]:
r2_score(y_true=y_test, y_pred=y_preds)

0.14074403230368626

In [134]:
mean_absolute_error(y_true=y_test, y_pred=y_preds)

0.11114686283643235

### Retrain model using grid search

In [135]:
lr = LinearRegression()

In [136]:
# Create a dictiionary of the parameters you want to tune

# Note: more paramaters and values --> increase the run time

p_grid = {'fit_intercept': [True, False],
          'normalize': [True, False]}

# Note: by default the cross validation is set to 5

grid = GridSearchCV(lr, p_grid)

#grid = GridSearchCV(lr, p_grid, cv = 10)

In [137]:
X_train


array([[ 1.67514272,  0.45432264, -0.3028349 , ..., -0.4612656 ,
        -0.51120222, -0.19534922],
       [-0.11910614, -1.56732865,  0.20815097, ..., -0.4612656 ,
        -0.51120222, -0.19534922],
       [-1.47691608,  1.03193729,  0.20815097, ..., -0.4612656 ,
        -0.51120222, -0.19534922],
       ...,
       [ 1.5781563 , -0.41209934, -1.32480663, ..., -0.4612656 ,
        -0.51120222, -0.19534922],
       [ 0.70527847,  0.16551531,  0.20815097, ...,  2.16794834,
        -0.51120222, -0.19534922],
       [ 1.52966308,  1.03193729, -0.3028349 , ..., -0.4612656 ,
        -0.51120222, -0.19534922]])

In [138]:
grid.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

GridSearchCV(estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]})

In [139]:
grid.cv_results_

{'mean_fit_time': array([0.0043767 , 0.0013957 , 0.00130148, 0.00113144]),
 'std_fit_time': array([0.00415628, 0.00010943, 0.00016608, 0.00011675]),
 'mean_score_time': array([0.0004209 , 0.00036454, 0.00039091, 0.00032921]),
 'std_score_time': array([1.35017315e-04, 6.74599039e-05, 1.37761432e-04, 9.73890606e-05]),
 'param_fit_intercept': masked_array(data=[True, True, False, False],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_normalize': masked_array(data=[True, False, True, False],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'fit_intercept': True, 'normalize': True},
  {'fit_intercept': True, 'normalize': False},
  {'fit_intercept': False, 'normalize': True},
  {'fit_intercept': False, 'normalize': False}],
 'split0_test_score': array([-6.42477574e-01, -8.22127565e-02, -4.97558930e+03, -4.97558930e+03]),
 'split1_test_score': array([  0.16760644,   0

In [140]:
grid_df = pd.DataFrame(grid.cv_results_)

grid_df 



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_intercept,param_normalize,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004377,0.004156,0.000421,0.000135,True,True,"{'fit_intercept': True, 'normalize': True}",-0.642478,0.167606,0.368126,0.245521,0.342034,0.096162,0.376143,2
1,0.001396,0.000109,0.000365,6.7e-05,True,False,"{'fit_intercept': True, 'normalize': False}",-0.082213,0.190005,0.358316,0.258688,0.33999,0.212957,0.15938,1
2,0.001301,0.000166,0.000391,0.000138,False,True,"{'fit_intercept': False, 'normalize': True}",-4975.589299,-22.988326,-15.957188,-17.628225,-16.51143,-1009.734894,1982.928772,3
3,0.001131,0.000117,0.000329,9.7e-05,False,False,"{'fit_intercept': False, 'normalize': False}",-4975.589299,-22.988326,-15.957188,-17.628225,-16.51143,-1009.734894,1982.928772,3


In [141]:
parameters = ['param_fit_intercept', 'param_normalize', 'mean_test_score', 'rank_test_score']

grid_df[parameters]

Unnamed: 0,param_fit_intercept,param_normalize,mean_test_score,rank_test_score
0,True,True,0.096162,2
1,True,False,0.212957,1
2,False,True,-1009.734894,3
3,False,False,-1009.734894,3


In [142]:
# Note: if you have more than one combination "possibility" that give you the best score
# then the first one in the list will be chosen

# exmple option_1 ---> score: 0.87
# exmple option_2 ---> score: 0.89
# exmple option_3 ---> score: 0.80
# exmple option_4 ---> score: 0.89

# grid.best_params_ will give you option 2
grid.best_params_

{'fit_intercept': True, 'normalize': False}

In [143]:
grid.best_score_

0.2129571701985357

In [144]:
# From there, I still need to remake the model...
lr = LinearRegression(fit_intercept=True,
                      normalize=True)

In [145]:
lr.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




LinearRegression(normalize=True)

In [146]:
preds = lr.predict(X_test)

In [147]:
r2_score(y_true=y_test, y_pred=preds)

0.17166330427041288

In [148]:
mean_absolute_error(y_true=y_test, y_pred=preds)

0.10850397241948444

### Save your model and other files if needed (encoder, scaler)

In [31]:
# use pickle