# The New Approach (Merging Rows Approch)

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

# Reading the Data

In [2]:
train_data = pd.read_csv("train_data.csv", na_values=[" ","?","#"])
test_data = pd.read_csv("test_data.csv", na_values=[" ", "?", "#"])
assembly_line_data = pd.read_csv("assembly_line_info.csv", na_values=[" ", "?", "#"])
car_variant_data = pd.read_csv("car_variant_data.csv", na_values=[" ", "?", "#"])
issue_info_data = pd.read_csv("issue_info.csv", na_values=[" ", "?", "#"])
log_report_type_data = pd.read_csv("log_report_type_data.csv", na_values=[" ", "?", "#"])

In [3]:
print("The shape of train data is: ",train_data.shape)
print("The shape of test data is: ",test_data.shape)
print("The shape of assembly line data is: ",assembly_line_data.shape)
print("The shape of car variant data is: ",car_variant_data.shape)
print("The shape of issue info data is: ",issue_info_data.shape)
print("The shape of the log report data is: ",log_report_type_data.shape)

The shape of train data is:  (5904, 3)
The shape of test data is:  (1477, 2)
The shape of assembly line data is:  (21076, 2)
The shape of car variant data is:  (31170, 2)
The shape of issue info data is:  (18552, 2)
The shape of the log report data is:  (58671, 3)


In [4]:
train_data.head() 

Unnamed: 0,id,factory_number,downtime_duration
0,13366,factory_number_415,1
1,6783,factory_number_474,0
2,9519,factory_number_931,1
3,10202,factory_number_700,1
4,4555,factory_number_600,2


In [5]:
train_data.downtime_duration.value_counts() 

0    3827
1    1496
2     581
Name: downtime_duration, dtype: int64

In [6]:
test_data.head() 

Unnamed: 0,id,factory_number
0,3340,factory_number_344
1,14067,factory_number_933
2,1134,factory_number_16
3,27,factory_number_793
4,9793,factory_number_344


In [7]:
print(assembly_line_data.shape) 
print(assembly_line_data.id.nunique())
assembly_line_data.head() 

(21076, 2)
18552


Unnamed: 0,id,assembly_line_type
0,6597,assembly_line_type_8
1,8011,assembly_line_type_8
2,2597,assembly_line_type_8
3,5022,assembly_line_type_8
4,6852,assembly_line_type_8


In [99]:
print(issue_info_data.shape)
print(issue_info_data.id.nunique())
print(issue_info_data.issue_type.nunique())
issue_info_data.head() 

(18552, 2)
18552
5


Unnamed: 0,id,issue_type
10448,1,1
8351,2,2
10957,3,1
11106,4,4
7736,5,2


In [103]:
print(car_variant_data.shape)
print(car_variant_data.id.nunique())
print(car_variant_data.car_variant.nunique())
car_variant_data.head() 

(31170, 2)
18552
53


Unnamed: 0,id,car_variant
18299,1,11
18300,1,13
14536,2,34
14535,2,35
19226,3,11


In [104]:
print(log_report_type_data.shape)
print(log_report_type_data.id.nunique())
print(log_report_type_data.log_report_type.nunique()) 
log_report_type_data.head() 

(58671, 2)
18552
386


Unnamed: 0,id,log_report_type
36197,1,68
36199,1,179
36198,1,345
27670,2,233
27669,2,235


In [8]:
# renaming thr column values, sorting and changing the type to int
assembly_line_data.assembly_line_type = assembly_line_data["assembly_line_type"].str.replace("assembly_line_type_","")
assembly_line_data.assembly_line_type = assembly_line_data.assembly_line_type.astype("int")
assembly_line_data = assembly_line_data.sort_values(by=["id","assembly_line_type"])
assembly_line_data.head()

Unnamed: 0,id,assembly_line_type
11285,1,6
11284,1,8
9069,2,2
11953,3,8
12102,4,2


In [9]:
# This function merges rows into one cell based on id
def merge_rows(data,id_col,merge_col):
    unique_id = data[id_col].unique() # get unique names
    new_data = []                 # data list for output dataframe
    for u in unique_id:          # for each unique name
        subdf = data[data[id_col] == u] # get rows with this unique name
        s = ""
        for i in subdf[merge_col]:
            s += ""+str(i)+","      # join all info cells for that name
        new_data.append([u, s[:-1]])

    newdf = pd.DataFrame(data=new_data, columns=[id_col,merge_col])
    return newdf

In [10]:
# Merging the assembly data based on id
assembly_line_data_merged =merge_rows(assembly_line_data,id_col="id",merge_col="assembly_line_type")

In [11]:
assembly_line_data_merged.head() 

Unnamed: 0,id,assembly_line_type
0,1,68
1,2,2
2,3,8
3,4,2
4,5,2


In [12]:
# renaming thr column values, sorting and changing the type to int
car_variant_data.car_variant = car_variant_data["car_variant"].str.replace("car_variant_","")
car_variant_data.car_variant = car_variant_data.car_variant.astype("int")
car_variant_data = car_variant_data.sort_values(by=["id","car_variant"])
car_variant_data.head()

Unnamed: 0,id,car_variant
18299,1,11
18300,1,13
14536,2,34
14535,2,35
19226,3,11


In [13]:
# Merging the car_variant data based on id
car_variant_data_merged = merge_rows(car_variant_data, id_col="id", merge_col="car_variant")

In [14]:
car_variant_data_merged.head() 

Unnamed: 0,id,car_variant
0,1,1113
1,2,3435
2,3,11
3,4,47
4,5,3435


In [15]:
# renaming thr column values, sorting, changing the type to int and aggregating the volume column
log_report_type_data.log_report_type = log_report_type_data["log_report_type"].str.replace("log_report_type_","")
log_report_type_data.log_report_type = log_report_type_data.log_report_type.astype("int")
log_report_type_data = log_report_type_data.sort_values(by=["id","log_report_type"])
log_report_type_data_aggregated = log_report_type_data.groupby("id",as_index=False).agg({"volume":"sum"})
log_report_type_data.head() 

Unnamed: 0,id,log_report_type,volume
36197,1,68,2
36199,1,179,1
36198,1,345,2
27670,2,233,1
27669,2,235,1


In [16]:
# Dropping the Volume Column
log_report_type_data.drop("volume", axis =1,inplace=True) 

In [17]:
# merging the log_report_type based on id
log_report_type_data_merged = merge_rows(log_report_type_data,id_col="id", merge_col="log_report_type")

In [18]:
log_report_type_data_merged.head() 

Unnamed: 0,id,log_report_type
0,1,68179345
1,2,233235312313315
2,3,171
3,4,370
4,5,232312


In [19]:
#Combining the merged log_report_type and aggregated Volume data
log_report_type_data_aggregated_merged = pd.merge(log_report_type_data_merged,
                                                  log_report_type_data_aggregated, on="id",
                                                 how="left")

In [20]:
log_report_type_data_aggregated_merged.head() 

Unnamed: 0,id,log_report_type,volume
0,1,68179345,5
1,2,233235312313315,5
2,3,171,2
3,4,370,3
4,5,232312,17


In [24]:
# renaming thr column values, sorting and changing the type to int
issue_info_data.issue_type = issue_info_data["issue_type"].str.replace("issue_type_","")
issue_info_data.issue_type = issue_info_data.issue_type.astype("int")
issue_info_data = issue_info_data.sort_values(by=["id","issue_type"])
issue_info_data.head()

Unnamed: 0,id,issue_type
10448,1,1
8351,2,2
10957,3,1
11106,4,4
7736,5,2


# Merging The Data

In [25]:
# Merging issue_info and assembly_line_merged_data based on id
df_issue_assembly = pd.merge(issue_info_data, assembly_line_data_merged, on="id", how="inner")
print(df_issue_assembly.shape)
print(df_issue_assembly.id.nunique())
df_issue_assembly.head() 

(18552, 3)
18552


Unnamed: 0,id,issue_type,assembly_line_type
0,1,1,68
1,2,2,2
2,3,1,8
3,4,4,2
4,5,2,2


In [100]:
# merging the issue_assembly_data with car_variant_merged_data based on id
df_issue_assembly_car = pd.merge(df_issue_assembly, car_variant_data_merged, on="id",how="inner")
print(df_issue_assembly_car.shape)
print(df_issue_assembly_car.id.nunique())
df_issue_assembly_car.head() 

(18552, 4)
18552


Unnamed: 0,id,issue_type,assembly_line_type,car_variant
0,1,1,68,1113
1,2,2,2,3435
2,3,1,8,11
3,4,4,2,47
4,5,2,2,3435


In [27]:
# Merging issue_assembly_car data with log_report_type_data_aggregated_merged data
df_issue_assembly_car_log = pd.merge(df_issue_assembly_car, log_report_type_data_aggregated_merged, 
                                     on="id",
                                     how="inner")
print(df_issue_assembly_car_log.shape)
print(df_issue_assembly_car_log.id.nunique())
df_issue_assembly_car_log.head() 

(18552, 6)
18552


Unnamed: 0,id,issue_type,assembly_line_type,car_variant,log_report_type,volume
0,1,1,68,1113,68179345,5
1,2,2,2,3435,233235312313315,5
2,3,1,8,11,171,2
3,4,4,2,47,370,3
4,5,2,2,3435,232312,17


In [28]:
# Performing Label Encoding
encode_columns = ["assembly_line_type","car_variant","log_report_type"]
for i in range(0,len(df_issue_assembly_car_log.columns)):
    j=df_issue_assembly_car_log.columns[i]
    if(j in encode_columns):
        label_encoder = LabelEncoder()                  # converts the character array to numeric array. Assigns levels to unique labels.
        label_encoder.fit(df_issue_assembly_car_log[j])
        df_issue_assembly_car_log[j]  = label_encoder.transform(df_issue_assembly_car_log[j])
        df_issue_assembly_car_log[j]  = df_issue_assembly_car_log[j].astype('category') 

In [29]:
# Creating the Train Data_set
df_train = pd.merge(train_data, df_issue_assembly_car_log, on="id",how="inner")
print(df_train.shape)
df_train.head() 

(5904, 8)


Unnamed: 0,id,factory_number,downtime_duration,issue_type,assembly_line_type,car_variant,log_report_type,volume
0,13366,factory_number_415,1,4,8,245,1073,1
1,6783,factory_number_474,0,2,8,244,801,9
2,9519,factory_number_931,1,2,51,161,2151,11
3,10202,factory_number_700,1,1,51,32,1393,1
4,4555,factory_number_600,2,2,51,161,2175,44


In [30]:
# Creating the Test Data set
df_test = pd.merge(test_data, df_issue_assembly_car_log,on="id",how="inner")
print(df_test.shape) 
df_test.head() 

(1477, 7)


Unnamed: 0,id,factory_number,issue_type,assembly_line_type,car_variant,log_report_type,volume
0,3340,factory_number_344,4,8,245,1073,2
1,14067,factory_number_933,1,8,180,71,1
2,1134,factory_number_16,1,51,32,254,16
3,27,factory_number_793,1,51,32,2035,5
4,9793,factory_number_344,2,8,244,849,24


In [31]:
# Printing the nunique values
print(df_train.log_report_type.nunique())
print(df_train.assembly_line_type.nunique())
print(df_train.car_variant.nunique())


1108
43
197


In [32]:
# Printing the nunique values
print(df_test.log_report_type.nunique())
print(df_test.assembly_line_type.nunique())
print(df_test.car_variant.nunique())

447
33
103


In [33]:
# Type casting the column back to int type
for col in encode_columns:
    df_train[col]= df_train[col].astype("int")
    df_test[col] = df_test[col].astype("int")

In [34]:
df_train.dtypes

id                     int64
factory_number        object
downtime_duration      int64
issue_type             int64
assembly_line_type     int64
car_variant            int64
log_report_type        int64
volume                 int64
dtype: object

In [35]:
df_test.dtypes

id                     int64
factory_number        object
issue_type             int64
assembly_line_type     int64
car_variant            int64
log_report_type        int64
volume                 int64
dtype: object

In [36]:
# renaming thr column values, sorting and changing the type to int both on train and test data
df_train.factory_number = df_train["factory_number"].str.replace("factory_number_","")
df_train.factory_number = df_train.factory_number.astype("int")
df_test.factory_number = df_test["factory_number"].str.replace("factory_number_","")
df_test.factory_number = df_test.factory_number.astype("int")

In [37]:
# chamging the data type of target variable to category
df_train.downtime_duration = df_train.downtime_duration.astype("category")

In [38]:
df_train.dtypes

id                       int64
factory_number           int64
downtime_duration     category
issue_type               int64
assembly_line_type       int64
car_variant              int64
log_report_type          int64
volume                   int64
dtype: object

In [39]:
df_test.dtypes

id                    int64
factory_number        int64
issue_type            int64
assembly_line_type    int64
car_variant           int64
log_report_type       int64
volume                int64
dtype: object

In [40]:
# Creating x_train
x_train = df_train.drop("downtime_duration",axis=1)

In [41]:
# Creating y_train
y_train = df_train.downtime_duration

In [42]:
# Building Decison Tree Classifier with CV
paramgrid = {"criterion": ["gini", "entropy"], 
               "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1)),
             "min_samples_split": [1,2,5,10]}
dt_gridsearch = GridSearchCV(DecisionTreeClassifier(class_weight="balanced", random_state=1234),
                             param_grid=paramgrid,n_jobs=-1,cv=10)
dt_gridsearch.fit(x_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=1234,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'c

In [43]:
y_train_predict_dt_cv = dt_gridsearch.best_estimator_.predict(x_train)
y_test_predict_dt_cv = dt_gridsearch.best_estimator_.predict(df_test)

In [44]:
def classifcation_report_train_test(y_train, y_train_pred):

    print('''
            =========================================
               CLASSIFICATION REPORT FOR TRAIN DATA
            =========================================
            ''')
    print(classification_report(y_train, y_train_pred))

In [45]:
classifcation_report_train_test(y_train,y_train_predict_dt_cv)


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.84      0.53      0.65      3827
           1       0.37      0.51      0.43      1496
           2       0.34      0.85      0.49       581

    accuracy                           0.56      5904
   macro avg       0.52      0.63      0.52      5904
weighted avg       0.67      0.56      0.58      5904



In [46]:
df_test_dt_cv = df_test.copy()
df_test_dt_cv["downtime_duration"] = y_test_predict_dt_cv
df_test_dt_cv.head() 

Unnamed: 0,id,factory_number,issue_type,assembly_line_type,car_variant,log_report_type,volume,downtime_duration
0,3340,344,4,8,245,1073,2,0
1,14067,933,1,8,180,71,1,0
2,1134,16,1,51,32,254,16,1
3,27,793,1,51,32,2035,5,1
4,9793,344,2,8,244,849,24,0


In [47]:
df_test_dt_cv.drop(["factory_number", "issue_type", "assembly_line_type", "car_variant",
                   "log_report_type", "volume"], axis=1, inplace = True)

In [48]:
df_test_dt_cv.head() 

Unnamed: 0,id,downtime_duration
0,3340,0
1,14067,0
2,1134,1
3,27,1
4,9793,0


In [51]:
# Creating the csv file
df_test_dt_cv.to_csv("df_test_dt_cv.csv", sep=',',index=False)

In [52]:
# Building a Random Forest
param_grid = {"n_estimators" : [25,50,75],
           "max_depth" : [2,3,4,5],
           "min_samples_leaf" : [2,3,4,5],'max_features':[3,5,7],
           "class_weight" : ['balanced','balanced_subsample']}
rf_grid = GridSearchCV(RandomForestClassifier(n_jobs=-1,random_state=1),param_grid=param_grid,
                      cv= 10)
rf_grid.fit(x_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False, random_state=1,
                                    

In [53]:
y_train_predict_rf_cv = rf_grid.best_estimator_.predict(x_train)
y_test_predict_rf_cv = rf_grid.best_estimator_.predict(df_test)

In [54]:
classifcation_report_train_test(y_train, y_train_predict_rf_cv)


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      3827
           1       0.50      0.61      0.55      1496
           2       0.38      0.87      0.53       581

    accuracy                           0.66      5904
   macro avg       0.60      0.71      0.61      5904
weighted avg       0.75      0.66      0.68      5904



In [55]:
df_test_rf_cv = df_test.copy()
df_test_rf_cv["downtime_duration"] = y_test_predict_rf_cv

In [56]:
df_test_rf_cv.drop(["factory_number", "issue_type", "assembly_line_type", "car_variant",
                   "log_report_type", "volume"], axis=1, inplace = True)
df_test_rf_cv.head() 

Unnamed: 0,id,downtime_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0


In [57]:
# Creating the csv file
df_test_rf_cv.to_csv("df_test_rf_cv.csv",sep=",", index=False)

In [59]:
# Bulinding LightGBM classifier with CV
lgbm = LGBMClassifier(random_state=1234)
gridParams = {
    'learning_rate': [0.005, 0.01,0.1,0.3],# shrinkage rate at which model learns
    'n_estimators': [50,100,150],# Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy 
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],# Random number seed.
    'colsample_bytree' : [0.64, 0.65, 0.66],# Subsample ratio of columns when constructing each tree.
    'subsample' : [0.7,0.75,0.8],# Subsample ratio of the training instance.
    'reg_alpha' : [1,1.2],# L1 regularization term on weights
    'reg_lambda' : [1,1.2,1.4],# L2 regularization term on weights
    }
grid_lgbm = GridSearchCV(lgbm, gridParams, verbose=10, cv=5, n_jobs=-1)
grid_lgbm.fit(x_train, y_train) 

Fitting 5 folds for each of 10368 candidates, totalling 51840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 9105 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 9240 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 9377 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 9514 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 9653 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 9933 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done 10074 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 10217 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 10360 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 10505 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 10650 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 10797 tasks      | elapsed: 24.9min
[Parallel(n_jobs=-1)]: Done 10944 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 11093 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done 11242 tasks      | 

[Parallel(n_jobs=-1)]: Done 34577 tasks      | elapsed: 114.4min
[Parallel(n_jobs=-1)]: Done 34840 tasks      | elapsed: 114.8min
[Parallel(n_jobs=-1)]: Done 35105 tasks      | elapsed: 115.5min
[Parallel(n_jobs=-1)]: Done 35370 tasks      | elapsed: 116.9min
[Parallel(n_jobs=-1)]: Done 35637 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 35904 tasks      | elapsed: 119.4min
[Parallel(n_jobs=-1)]: Done 36173 tasks      | elapsed: 120.2min
[Parallel(n_jobs=-1)]: Done 36442 tasks      | elapsed: 121.4min
[Parallel(n_jobs=-1)]: Done 36713 tasks      | elapsed: 123.4min
[Parallel(n_jobs=-1)]: Done 36984 tasks      | elapsed: 123.8min
[Parallel(n_jobs=-1)]: Done 37257 tasks      | elapsed: 124.6min
[Parallel(n_jobs=-1)]: Done 37530 tasks      | elapsed: 125.9min
[Parallel(n_jobs=-1)]: Done 37805 tasks      | elapsed: 128.0min
[Parallel(n_jobs=-1)]: Done 38080 tasks      | elapsed: 128.5min
[Parallel(n_jobs=-1)]: Done 38357 tasks      | elapsed: 129.3min
[Parallel(n_jobs=-1)]: Do

GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=1234, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=Tru...
             param_grid={'boosting_type': ['gbdt', 'dart'],
                         'colsample_bytree': [0.64, 0.65, 0.66],
                         'learning_rate': [0.005, 0.01, 0.1, 0.3],
                         'max_bin': [255, 510], 'n_estimators': [50, 100, 150],
 

In [60]:
y_train_predict_lgbm_cv = grid_lgbm.best_estimator_.predict(x_train)
y_test_predict_lgbm_cv = grid_lgbm.best_estimator_.predict(df_test) 

In [61]:
classifcation_report_train_test(y_train, y_train_predict_lgbm_cv)


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      3827
           1       0.75      0.63      0.68      1496
           2       0.73      0.74      0.73       581

    accuracy                           0.83      5904
   macro avg       0.78      0.76      0.77      5904
weighted avg       0.82      0.83      0.82      5904



In [62]:
df_test_lgbm_cv = df_test.copy() 
df_test_lgbm_cv["downtime_duration"] = y_test_predict_lgbm_cv

In [63]:
df_test_lgbm_cv.drop(["factory_number", "issue_type", "assembly_line_type", "car_variant",
                   "log_report_type", "volume"], axis=1, inplace = True) 

In [64]:
df_test_lgbm_cv.head() 

Unnamed: 0,id,downtime_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0


In [65]:
# Creating the csv file
df_test_lgbm_cv.to_csv("df_test_lgbm_cv.csv", sep=",", index=False)

In [96]:
# Building Xgboost Classifier

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
model = XGBClassifier(n_estimators=200,learning_rate=0.02, random_state=1234) 

In [98]:
gd_xgb = GridSearchCV(model, param_grid=params,n_jobs=-1,verbose=10,cv=10)
gd_xgb.fit(x_train, y_train)

Fitting 10 folds for each of 405 candidates, totalling 4050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1

GridSearchCV(cv=10, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.02, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_esti...
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             

In [101]:
y_train_predict_xgb_cv = gd_xgb.best_estimator_.predict(x_train)
y_test_predict_xgb_cv = gd_xgb.best_estimator_.predict(df_test)

In [102]:
classifcation_report_train_test(y_train, y_train_predict_xgb_cv) 


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.80      0.91      0.85      3827
           1       0.68      0.46      0.55      1496
           2       0.64      0.59      0.61       581

    accuracy                           0.77      5904
   macro avg       0.71      0.65      0.67      5904
weighted avg       0.75      0.77      0.75      5904



In [106]:
# XgBoost with more parameters
model = XGBClassifier(random_state=1234)
gridParams = {
    'learning_rate': [0.005, 0.01,0.1,0.3],# shrinkage rate at which model learns
    'n_estimators': [50,100,150],# Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy 
    'objective' : ['multi:softmax'],# mentioning objective as multiclass classification Model
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],# Random number seed.
    'colsample_bytree' : [0.64, 0.65, 0.66],# Subsample ratio of columns when constructing each tree.
    'subsample' : [0.7,0.75,0.8],# Subsample ratio of the training instance.
    'reg_alpha' : [1,1.2],# L1 regularization term on weights
    'reg_lambda' : [1,1.2,1.4],# L2 regularization term on weights
    'num_class' : [3],
    }
grid_xgb = GridSearchCV(model, gridParams, verbose=10, cv=5, n_jobs=-1)
grid_xgb.fit(x_train, y_train)

Fitting 5 folds for each of 10368 candidates, totalling 51840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 9105 tasks      | elapsed: 263.9min
[Parallel(n_jobs=-1)]: Done 9240 tasks      | elapsed: 265.1min
[Parallel(n_jobs=-1)]: Done 9377 tasks      | elapsed: 266.4min
[Parallel(n_jobs=-1)]: Done 9514 tasks      | elapsed: 268.8min
[Parallel(n_jobs=-1)]: Done 9653 tasks      | elapsed: 270.8min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 272.0min
[Parallel(n_jobs=-1)]: Done 9933 tasks      | elapsed: 272.9min
[Parallel(n_jobs=-1)]: Done 10074 tasks      | elapsed: 273.5min
[Parallel(n_jobs=-1)]: Done 10217 tasks      | elapsed: 274.8min
[Parallel(n_jobs=-1)]: Done 10360 tasks      | elapsed: 276.1min
[Parallel(n_jobs=-1)]: Done 10505 tasks      | elapsed: 277.5min
[Parallel(n_jobs=-1)]: Done 10650 tasks      | elapsed: 279.5min
[Parallel(n_jobs=-1)]: Done 10797 tasks      | elapsed: 281.6min
[Parallel(n_jobs=-1)]: Done 10944 tasks      | elapsed: 282.3min
[Parallel(n_jobs=-1)]: Done 11093 tasks      | elapsed: 283.0min
[Parallel(n_jobs=-1)]: Done 1124

[Parallel(n_jobs=-1)]: Done 34314 tasks      | elapsed: 481.7min
[Parallel(n_jobs=-1)]: Done 34577 tasks      | elapsed: 484.8min
[Parallel(n_jobs=-1)]: Done 34840 tasks      | elapsed: 486.0min
[Parallel(n_jobs=-1)]: Done 35105 tasks      | elapsed: 487.8min
[Parallel(n_jobs=-1)]: Done 35370 tasks      | elapsed: 490.1min
[Parallel(n_jobs=-1)]: Done 35637 tasks      | elapsed: 493.3min
[Parallel(n_jobs=-1)]: Done 35904 tasks      | elapsed: 494.4min
[Parallel(n_jobs=-1)]: Done 36173 tasks      | elapsed: 496.1min
[Parallel(n_jobs=-1)]: Done 36442 tasks      | elapsed: 498.5min
[Parallel(n_jobs=-1)]: Done 36713 tasks      | elapsed: 501.4min
[Parallel(n_jobs=-1)]: Done 36984 tasks      | elapsed: 502.5min
[Parallel(n_jobs=-1)]: Done 37257 tasks      | elapsed: 504.2min
[Parallel(n_jobs=-1)]: Done 37530 tasks      | elapsed: 506.8min
[Parallel(n_jobs=-1)]: Done 37805 tasks      | elapsed: 510.2min
[Parallel(n_jobs=-1)]: Done 38080 tasks      | elapsed: 511.4min
[Parallel(n_jobs=-1)]: Do

Parameters: { boosting_type, num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                         'colsample_bytree': [0.64, 0.65, 0.66],
                         'learning_rate': [0.005, 0.01, 0.1, 0.3],
                         'max_bin': [255, 510], 'n_estimators': [50, 100, 150],
                         'num_class': [3], 'num_leaves': [6, 8, 12, 16]

In [107]:
y_train_xgb_pm = grid_xgb.best_estimator_.predict(x_train)
y_test_xgb_pm = grid_xgb.best_estimator_.predict(df_test)

In [108]:
classifcation_report_train_test(y_train, y_train_xgb_pm)


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      3827
           1       0.83      0.74      0.78      1496
           2       0.81      0.78      0.80       581

    accuracy                           0.88      5904
   macro avg       0.85      0.82      0.83      5904
weighted avg       0.87      0.88      0.87      5904



In [109]:
y_test_xgb_cv_pm = df_test.copy()
y_test_xgb_cv_pm["downtime_duration"] = y_test_xgb_pm

In [110]:
y_test_xgb_cv_pm.drop(["factory_number", "issue_type", "assembly_line_type", "car_variant",
                   "log_report_type", "volume"], axis=1, inplace = True)

In [111]:
y_test_xgb_cv_pm.head() 

Unnamed: 0,id,downtime_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0


In [112]:
y_test_xgb_cv_pm.to_csv("y_test_xgb_cv_pm.csv", sep=",", index=False)

# Further Suggestion

In This Data set we have an indirect mapping between id, assembly_line_type, car_variant and log_report_type. if we had a direct mapping, our data would have been more granular and the models would have made better predictions . For Example we don't have a mapping like what all car_variant are there for a particular assembly_line_type or what all log_report_type are there for a car_variant

if we had such a data with direct mapping, we can improve the model predictions

With time constraint at hand, I wasn't able to improve it further, but Another approach that can help n improving the model further is by adding counts of assembly_line_type, Car_variant, log_report_type with respect to each id. Adding these features, I believe will improve the model further.