In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from feature_selector_regression import RegressionFeatureSelector

In [2]:
dataset_path = 'garments_worker_productivity.csv'
df = pd.read_csv(dataset_path)

In [3]:
df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

In [5]:
df = df.drop('targeted_productivity', axis=1)
df = df.drop('date', axis=1)

In [6]:
df = df.fillna(0)

In [7]:
numeric_features=['team', 'smv', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers', 'actual_productivity']
categorical_features=['quarter', 'department', 'day']

In [8]:
df_test = df
df_test = df_test[numeric_features + categorical_features]
df_test = pd.concat([df_test[numeric_features], pd.get_dummies(df_test[categorical_features])],
                     axis=1)

In [9]:
df_test.head()

Unnamed: 0,team,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,...,quarter_Quarter5,department_finishing,department_finishing.1,department_sweing,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725,...,0,0,0,1,0,0,0,1,0,0
1,1,3.94,0.0,960,0,0.0,0,0,8.0,0.8865,...,0,0,1,0,0,0,0,1,0,0
2,11,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,...,0,0,0,1,0,0,0,1,0,0
3,12,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,...,0,0,0,1,0,0,0,1,0,0
4,6,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382,...,0,0,0,1,0,0,0,1,0,0


In [10]:
df_test.shape

(1197, 24)

In [11]:
df.shape

(1197, 13)

In [12]:
selector = RegressionFeatureSelector(n_jobs=-1)

In [13]:
selector.select_features(data_frame=df.copy(),
                             number_of_features=df_test.shape[1] - 1,
                             target_name='actual_productivity',
                             numeric_features=['team', 'smv', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers', 'actual_productivity'],
                             categorical_features=['quarter', 'department', 'day'])

Calculating pearson
Calculating mutual_info
Calculating rfe
Calculating lin-reg
Calculating rf
Calculating lgbm


  return reduction(axis=axis, out=out, **passkwargs)


In [14]:
selector.X_scaled_.shape[1]

23

In [15]:
mse = 1
number_of_features = None
best_features = None
estimator = None

In [16]:
for i in range(1, selector.X_scaled_.shape[1]):
    print(f'calculating for number_of_features: {i}')
    best_features = selector.best_features_[:i]
    X = selector.X_scaled_[best_features]
    y = selector.y_
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=50, criterion='mse', random_state=42)
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse_ = mean_squared_error(y_test, y_pred)
    if mse_ < mse:
        print(f'mse: {mse_}')
        print(mse - mse_)
        mse = mse_
        number_of_features = i
        best_features = best_features
        estimator = model

calculating for number_of_features: 1
mse: 0.018656525357983678
0.9813434746420163
calculating for number_of_features: 2
calculating for number_of_features: 3
mse: 0.018129238537186385
0.0005272868207972931
calculating for number_of_features: 4
mse: 0.015550216721595945
0.0025790218155904397
calculating for number_of_features: 5
calculating for number_of_features: 6
calculating for number_of_features: 7
calculating for number_of_features: 8
calculating for number_of_features: 9
calculating for number_of_features: 10
mse: 0.015150965860031784
0.0003992508615641606
calculating for number_of_features: 11
mse: 0.014265212644834107
0.0008857532151976769
calculating for number_of_features: 12
calculating for number_of_features: 13
calculating for number_of_features: 14
calculating for number_of_features: 15
calculating for number_of_features: 16
calculating for number_of_features: 17
calculating for number_of_features: 18
calculating for number_of_features: 19
calculating for number_of_featu

  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)
  model = model.fit(X_train, y_train)


In [17]:
best_features = best_features[:number_of_features]

In [18]:
print(f'mse: {mse}')
print(f'best number of features: {number_of_features}')
print(f'best_features: {best_features}')

mse: 0.014265212644834107
best number of features: 11
best_features: ['smv', 'over_time', 'no_of_workers', 'incentive', 'team', 'wip', 'no_of_style_change', 'idle_men', 'quarter_Quarter5', 'quarter_Quarter4', 'quarter_Quarter3']


In [19]:
max_depth = np.max([e.get_depth() for e in estimator.estimators_])
max_depth

26

In [20]:
n_estimators = [50, 100, 150, 300, 500, 1000]
max_depths = [None] + [num for num in np.linspace(5, max_depth, 4)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
max_features = ['auto', 'sqrt', 'log2']
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators,
    'criterion': ['mse'],
    'max_depth': max_depths,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'bootstrap': bootstrap,
    'random_state': [42]
}

In [21]:
grid_model = RandomForestRegressor()
grid = GridSearchCV(grid_model, random_grid, n_jobs=-1)

In [22]:
X = selector.X_scaled_[best_features]
y = selector.y_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
X.head()

Unnamed: 0,smv,over_time,no_of_workers,incentive,team,wip,no_of_style_change,idle_men,quarter_Quarter5,quarter_Quarter4,quarter_Quarter3
0,0.450252,0.273148,0.655172,0.027222,0.636364,0.04792,0.0,0.0,0.0,0.0,0.0
1,0.020132,0.037037,0.068966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.164731,0.141204,0.327586,0.013889,0.909091,0.041865,0.0,0.0,0.0,0.0,0.0
3,0.164731,0.141204,0.327586,0.013889,1.0,0.041865,0.0,0.0,0.0,0.0,0.0
4,0.445219,0.074074,0.62069,0.013889,0.454545,0.050601,0.0,0.0,0.0,0.0,0.0


In [24]:
grid.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'criterion': ['mse'],
                         'max_depth': [None, 5.0, 12.0, 19.0, 26.0],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150, 300, 500, 1000],
                         'random_state': [42]})

In [25]:
grid.best_params_

{'bootstrap': False,
 'criterion': 'mse',
 'max_depth': 12.0,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 100,
 'random_state': 42}

In [26]:
best_estimator = grid.best_estimator_

In [27]:
y_pred_best = best_estimator.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
mse_best

0.014462858333881312

In [28]:
percentage = (1 - mse_best/mse) * 100
f'Improvement of {percentage.round(2)}%'



'Improvement of -1.39%'

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_