# Regressor Pre-Defined functions with Linear Regression Example

## Defining the functions

In [1]:
import pandas as pd
import numpy as np

# Empty results table: one row of test-set metrics per fitted regressor.
# `df_model_performance` is a second name bound to the very same frame object.
df_model_performance = df_model = pd.DataFrame(
    columns=['Model', 'MAE', 'RMSE', 'R2 Score', 'Adjusted R2 Score']
)

# Empty comparison table: train vs. test adjusted R2 per regressor, inspected once
# all models are built. The same layout could be reused for other scores (MSE, RMSE, ...).
# `df_model_r2` is a second name bound to the very same frame object.
df_model_r2 = df_model_test_train_r2 = pd.DataFrame(
    columns=['Model', 'Train Adjusted R2 Score', 'Test Adjusted R2 Score']
)

In [2]:
from sklearn.model_selection import GridSearchCV 
def get_best_hyperparameters(model, params, cv_value, X_train, y_train, scoring=None):
    """Run an exhaustive grid search and return the best estimator found.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV`` as ``estimator``.
    params : dict or list of dict
        Hyperparameter grid handed to ``GridSearchCV`` as ``param_grid``.
    cv_value : int or CV splitter
        Forwarded to ``GridSearchCV`` as ``cv``.
    X_train, y_train : array-like
        Data the search is fitted on.
    scoring : str, callable or None, optional
        Forwarded to ``GridSearchCV`` as ``scoring``. ``None`` (the default)
        keeps the estimator's own ``score`` method, which is R2 for
        scikit-learn regressors -- not an accuracy.

    Returns
    -------
    estimator
        ``search.best_estimator_`` from the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
        cv=cv_value,
    )
    search.fit(X_train, y_train)
    # `best_score_` is the mean cross-validated score of the winning candidate
    # under `scoring`; label it generically instead of as "accuracy".
    print("Best CV Score    :",  search.best_score_)
    print("Best Parameters  : ", search.best_params_)
    print("Best Estimators : ",  search.best_estimator_)
    return search.best_estimator_

In [3]:
def get_regressor_predictions(regressor, X_train, y_train, X_test):
    """Fit ``regressor`` on the training data and predict for both splits.

    The estimator is fitted in place via ``regressor.fit(X_train, y_train)``;
    predictions are then produced for ``X_train`` first and ``X_test`` second.

    Returns
    -------
    tuple
        ``(train_predictions, test_predictions)``.
    """
    regressor.fit(X_train, y_train)
    train_predictions, test_predictions = (
        regressor.predict(features) for features in (X_train, X_test)
    )
    return train_predictions, test_predictions

In [4]:
# For regressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def _regression_metrics(y_true, y_pred, n_features):
    """Return ``(MAE, RMSE, R2, adjusted R2)`` for one set of targets and predictions.

    ``n_features`` is the number of predictors used in the adjusted R2 correction
    ``1 - (1 - R2) * (n - 1) / (n - p - 1)``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    n_samples = len(y_true)
    residual_dof = n_samples - n_features - 1
    if residual_dof > 0:
        adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / residual_dof
    else:
        # The correction divides by n - p - 1; with too few samples it would
        # divide by zero or flip sign, so report "undefined" instead.
        adjusted_r2 = float('nan')
    return mae, rmse, r2, adjusted_r2


def print_regressor_scores(regressor, X_train, X_test, y_train, y_test, y_pred_train, y_pred_test, algorithm):
    """Print train/test regression metrics and return them as two result rows.

    Parameters
    ----------
    regressor : object
        Not used by this function; kept so existing calls keep working.
    X_train, X_test : 2-D array-like with ``.shape``
        Only ``shape[1]`` (the feature count) is read, for adjusted R2.
    y_train, y_test : array-like
        True target values for each split.
    y_pred_train, y_pred_test : array-like
        Predictions matching ``y_train`` / ``y_test``.
    algorithm : str
        Label stored under the ``'Model'`` key of both returned dicts.

    Returns
    -------
    tuple of dict
        ``(r2_comparison_row, performance_row)``: the first holds train and
        test adjusted R2, the second holds the test-set MAE, RMSE, R2 and
        adjusted R2. Adjusted R2 is ``nan`` when a split has no more samples
        than ``n_features + 1``.
    """
    MAE_train, RMSE_train, r2_score_train, adj_r_sq_train = _regression_metrics(
        y_train, y_pred_train, X_train.shape[1]
    )
    print('MAE for training set is {}'.format(MAE_train))
    print('RMSE for training set is {}'.format(RMSE_train))
    print('R squared score for training set is {}'.format(r2_score_train))
    print('Adjusted R squared score for training set is {}'.format(adj_r_sq_train))

    MAE_test, RMSE_test, r2_score_test, adj_r_sq_test = _regression_metrics(
        y_test, y_pred_test, X_test.shape[1]
    )
    print('MAE for test set is {}'.format(MAE_test))
    print('RMSE for test set is {}'.format(RMSE_test))
    print('R squared score for test set is {}'.format(r2_score_test))
    print('Adjusted R squared score for testing set is {}'.format(adj_r_sq_test))

    # Plain dict rows (not DataFrames) for the caller to append to its result tables.
    r2_comparison_row = {
        'Model': algorithm,
        'Train Adjusted R2 Score': adj_r_sq_train,
        'Test Adjusted R2 Score': adj_r_sq_test,
    }
    performance_row = {
        'Model': algorithm,
        'MAE': MAE_test,
        'RMSE': RMSE_test,
        'R2 Score': r2_score_test,
        'Adjusted R2 Score': adj_r_sq_test,
    }
    return r2_comparison_row, performance_row

## Get the data set and transform it into X (independent) and y (dependent)

In [5]:
# Read the garment-worker productivity CSV straight from the UCI archive URL,
# report its dimensions, and preview the first rows.
df_garment_prod = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00597/garments_worker_productivity.csv'
)
print('Data Shape:', df_garment_prod.shape)
df_garment_prod.head()

Data Shape: (1197, 15)


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [6]:
# Replace +inf, -inf and NaN anywhere in the frame with 0 (the preview above shows a
# missing `wip` value), then reset to a fresh default integer index.
df_garment_prod = df_garment_prod.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

In [7]:
# Drop any rows that still contain missing values.
# NOTE(review): the preceding cell already replaced NaN with 0, so this presumably
# removes nothing -- confirm, and consider deleting one of the two steps.
df_garment_prod=df_garment_prod.dropna()

In [8]:
# Feature matrix: every column except the last two.
# NOTE(review): with the 15-column layout shown in the preview, `:-2` excludes
# `no_of_workers` as well as the target `actual_productivity` -- confirm that
# leaving `no_of_workers` out of the features is intentional (`:-1` would keep it).
X = df_garment_prod.iloc[:,:-2] 

In [9]:
# Target vector: the `actual_productivity` column.
y = df_garment_prod['actual_productivity']

In [10]:
# One-hot encode the listed columns (including the numeric-looking `team` and
# `no_of_style_change`); drop_first=True omits the first level of each one.
X = pd.get_dummies(X, columns=['quarter' ,'department' , 'team', 'no_of_style_change'],  drop_first=True)

In [11]:
# Remove the `date` and `day` columns from the feature matrix, rebinding X
# to the reduced frame rather than mutating it in place.
X = X.drop(columns=['date', 'day'])

## Split the dataset into Train and Test

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# Report the shape of each split: train features/target, then test features/target.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(897, 26)
(897,)
(300, 26)
(300,)


## Linear Regression Model using pre-defined functions

Hyperparameter Tuning - This step is optional; an empty parameter dictionary can also be passed.

In [13]:
from sklearn.linear_model import LinearRegression
# Grid of LinearRegression options to try, searched with 5-fold cross-validation.
parameters = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
}
lr_best_grid = get_best_hyperparameters(
    model=LinearRegression(),
    params=parameters,
    cv_value=5,
    X_train=X_train,
    y_train=y_train,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Accuracy    : 0.23458278669584232
Best Parameters  :  {'copy_X': True, 'fit_intercept': True}
Best Estimators :  LinearRegression()


Fit and predict using Best Grid obtained from hyperparameter tuning

In [14]:
# Fit the selected estimator on the training split and predict for both splits.
y_pred_train, y_pred_test = get_regressor_predictions(
    regressor=lr_best_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
)

Calculate and print the performance metrics

In [15]:
# Print train/test metrics and keep the two returned result rows (plain dicts).
df_model_test_train_r2_1, df_model_performance1 = print_regressor_scores(
    regressor=lr_best_grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    y_pred_train=y_pred_train,
    y_pred_test=y_pred_test,
    algorithm='Linear Regression',
)

MAE for training set is 0.10613632985055337
RMSE for training set is 0.14603251981172016
R squared score for training set is 0.3088477292725489
Adjusted R squared score for training set is 0.2881926039404653
MAE for test set is 0.1066209355896
RMSE for test set is 0.14358671503672177
R squared score for test set is 0.2912632191346879
Adjusted R squared score for testing set is 0.22376447809989575


Append the performance metric result to the dataframe to compare the results

In [16]:
# DataFrame.append was removed in pandas 2.0, so build a one-row frame from each
# result dict and concatenate instead. While the accumulator is still empty the new
# row simply becomes the table, which avoids concatenating with an empty frame.
performance_row = pd.DataFrame([df_model_performance1], columns=df_model.columns)
df_model = (
    performance_row
    if df_model.empty
    else pd.concat([df_model, performance_row], ignore_index=True)
)

r2_row = pd.DataFrame([df_model_test_train_r2_1], columns=df_model_r2.columns)
df_model_r2 = (
    r2_row
    if df_model_r2.empty
    else pd.concat([df_model_r2, r2_row], ignore_index=True)
)

In [17]:
# Display the accumulated test-set metrics, one row per model appended so far.
df_model

Unnamed: 0,Model,MAE,RMSE,R2 Score,Adjusted R2 Score
0,Linear Regression,0.106621,0.143587,0.291263,0.223764


In [18]:
# Display train vs. test adjusted R2, one row per model appended so far.
df_model_r2

Unnamed: 0,Model,Train Adjusted R2 Score,Test Adjusted R2 Score
0,Linear Regression,0.288193,0.223764
