In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import random 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [None]:
# Read the training CSV from the mounted Drive folder and preview the first rows.
train_csv_path = '/content/gdrive/My Drive/Piramal DS Hiring Challenge/Train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Read the test CSV from the mounted Drive folder and preview the first rows.
test_csv_path = '/content/gdrive/My Drive/Piramal DS Hiring Challenge/Test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Summary of the training frame: column dtypes and non-null counts.
train.info()

In [None]:
# Summary of the test frame: column dtypes and non-null counts.
test.info()

In [None]:
# Number of missing values per column in the training data.
train.isnull().sum()

In [None]:
# Percentage of missing values per column in the training data.
train_null_counts = train.isnull().sum()
100 * train_null_counts / len(train)

In [None]:
# Number of missing values per column in the test data.
test.isnull().sum()

In [None]:
# Percentage of missing values per column in the test data.
test_null_counts = test.isnull().sum()
100 * test_null_counts / len(test)

In [None]:
# Count of distinct values in each training column.
train.nunique()

In [None]:
# Count of distinct values in each test column.
test.nunique()

## Feature Engineering
\

### Converting date columns from object dtype to datetime

In [None]:
import datetime

In [None]:
# Parse the three date columns of the training frame into pandas datetimes.
for date_col in ('Date of Creation',
                 'Estimated Date of Completion',
                 'Actual Date of Completion'):
    train[date_col] = pd.to_datetime(train[date_col])

In [None]:
# Distinct-value counts again, now that the date columns have been parsed.
train.nunique()

In [None]:
# Replace each training date column with its day of week (Monday=0 ... Sunday=6).
# NOTE: the columns are overwritten in place, so run this cell once per kernel session.
for date_col in ('Date of Creation',
                 'Estimated Date of Completion',
                 'Actual Date of Completion'):
    train[date_col] = pd.to_datetime(train[date_col]).dt.dayofweek

In [None]:
# Distinct-value counts after reducing the date columns to day of week.
train.nunique()

In [None]:
# Parse the three date columns of the test frame into pandas datetimes.
for date_col in ('Date of Creation',
                 'Estimated Date of Completion',
                 'Actual Date of Completion'):
    test[date_col] = pd.to_datetime(test[date_col])

In [None]:
# Replace each test date column with its day of week (Monday=0 ... Sunday=6).
# NOTE: the columns are overwritten in place, so run this cell once per kernel session.
for date_col in ('Date of Creation',
                 'Estimated Date of Completion',
                 'Actual Date of Completion'):
    test[date_col] = pd.to_datetime(test[date_col]).dt.dayofweek

In [None]:
# Distinct-value counts of the test frame after the date columns were reduced to day of week.
test.nunique()

In [None]:
# Separate the target column from the predictors; the L_Id column is dropped
# from both feature frames.
train_y = train['Problem Category']
train_x = train.drop(columns=['L_Id', 'Problem Category'])
test_x = test.drop(columns=['L_Id'])


In [None]:
# Display the target series ('Problem Category').
train_y

In [None]:
# Correlation coefficient matrix for the training dataset.
# Only numeric/boolean columns are correlated: recent pandas versions raise on
# non-numeric columns in DataFrame.corr() instead of silently skipping them,
# so the selection is made explicit to behave the same on old and new pandas.
corr = train.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr, cmap='YlGnBu', annot=True, linewidths=0.5);

In [None]:
# Bar chart of how many training rows fall into each 'Problem Category' class.
sns.countplot(data=train, x='Problem Category', palette='GnBu_d');

### 1. Dealing with Missing values

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# KNN imputer: each missing value is filled from the 5 nearest rows
# (uniformly weighted), using a NaN-aware Euclidean distance.
imputer = KNNImputer(
    n_neighbors=5,
    weights='uniform',
    metric='nan_euclidean',
)

In [None]:
# Training features: fit the imputer on them, then fill their missing values.
# The result of transform() is a NumPy array, which replaces the DataFrame in train_x.
train_x = imputer.fit(train_x).transform(train_x)

In [None]:
# Wrap the imputed training array back into a DataFrame.
# No column names are passed, so the columns get default integer labels.
train_x = pd.DataFrame(train_x)

In [None]:
# For test dataset
# Reuse the imputer that was fitted on the training features instead of
# refitting it on the test set: train and test must be filled by the same
# fitted model so both end up in the same feature space, and the test rows
# must not influence any fitted preprocessing step.
test_x = imputer.transform(test_x)

In [None]:
# Wrap the imputed test array back into a DataFrame.
# No column names are passed, so the columns get default integer labels.
test_x = pd.DataFrame(test_x)

In [None]:
# Work on independent copies so later steps do not touch train_x / train_y.
X, y = train_x.copy(), train_y.copy()

In [None]:
# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=101,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the classes of the TRAINING split with synthetic minority samples.
oversampler = SMOTE(random_state=10)
# fit_resample is the supported method name; the old fit_sample alias no
# longer exists in current imbalanced-learn releases.
smote_train, smote_train_target = oversampler.fit_resample(X_train, y_train)

# The hold-out split is deliberately NOT oversampled: scoring on synthetic rows
# would not reflect performance on real data. The smote_test* names are kept
# so the downstream cells keep working unchanged.
smote_test, smote_test_target = X_test, y_test

In [None]:
# Standardise features: the scaler learns mean/variance from the resampled
# training split only, and the same fitted scaler is applied everywhere else.
sc = StandardScaler()
X_train = sc.fit_transform(smote_train)   # fit + scale the training split
X_test = sc.transform(smote_test)         # validation split
X_whole = sc.transform(X)                 # all labelled rows (transform does not modify X)
test_v = sc.transform(test_x)             # unlabelled test set

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Baseline random forest with library-default hyperparameters, trained on the
# resampled (unscaled) training split. A fixed seed makes the fitted forest,
# and therefore the scores printed below, reproducible across runs.
# (The variable is called `ada` for historical reasons; it is a random forest.)
ada = RandomForestClassifier(random_state=10)
ada_fit = ada.fit(smote_train, smote_train_target)

In [None]:
# Baseline predictions on the validation split and on the training split.
y_ada_pred_test, y_ada_pred_train = (
    ada_fit.predict(smote_test),
    ada_fit.predict(smote_train),
)

In [None]:
# Macro-averaged F1 (not accuracy): validation split first, then training split.
test_macro_f1 = metrics.f1_score(smote_test_target, y_ada_pred_test, average='macro')
print(test_macro_f1)
train_macro_f1 = metrics.f1_score(smote_train_target, y_ada_pred_train, average='macro')
print(train_macro_f1)

In [None]:
# Importing GridSearch and RandomSearch

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
# Hyperparameter grid sampled by the randomized search.
# - "No limit" / "all features" must be the Python object None; the string
#   'none' is rejected by RandomForestClassifier.
# - An integer min_samples_split must be at least 2, so 1 is not offered.
params = {
    'max_depth': [5, 8, 15, 25, 30, None],
    'n_estimators': [120, 300, 500, 800, 1200],
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['log2', 'sqrt', None],
}

In [None]:
# Seeded random forest created for use as the base estimator of the hyperparameter search.
ada = RandomForestClassifier(random_state=10 )

In [None]:
# Randomized hyperparameter search: `param_comb` sampled settings, each scored
# with `folds`-fold stratified cross-validation.
folds = 20
param_comb = 10

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1231)

random_search = RandomizedSearchCV(
    ada,
    param_distributions=params,
    n_iter=param_comb,
    # Plain 'recall' is binary-only and fails on a multiclass target;
    # 'recall_macro' averages per-class recall, matching the macro averaging
    # used for the F1 scores elsewhere in this notebook.
    scoring='recall_macro',
    n_jobs=4,
    # Pass the splitter itself, not skf.split(...): the latter is a one-shot
    # generator that is exhausted after the first fit. With a fixed
    # random_state the folds produced are the same.
    cv=skf,
    verbose=3,
    random_state=1231,
)

In [None]:
# Run the search on the scaled, resampled training split.
random_search.fit( X_train, smote_train_target)

In [None]:
# Mean cross-validated score of the best sampled parameter setting.
random_search.best_score_

In [None]:
# Parameter setting that achieved the best cross-validated score.
random_search.best_params_

In [None]:
# Estimator chosen by the search (available because the search refits by default).
random_search.best_estimator_

In [None]:
# Final random forest, trained on the scaled, resampled training split.
# Only keyword arguments that RandomForestClassifier accepts are passed:
# algorithm / base_estimator / learning_rate are AdaBoostClassifier options
# and make this constructor raise a TypeError.
# To train with the tuned settings instead, pass **random_search.best_params_.
ada = RandomForestClassifier(n_estimators=500, random_state=1110)

ada_fit = ada.fit(X_train, smote_train_target)

In [None]:
# Final-model predictions on the scaled validation split and scaled training split.
y_ada_pred_test, y_ada_pred_train = (
    ada_fit.predict(X_test),
    ada_fit.predict(X_train),
)

In [None]:
# Macro-averaged recall (not accuracy) of the final model.
# average='macro' is required for a multiclass target (the default 'binary'
# raises there) and matches the macro averaging used for the F1 scores above.
print(metrics.recall_score(smote_test_target, y_ada_pred_test, average='macro'))  # validation recall
print(metrics.recall_score(smote_train_target, y_ada_pred_train, average='macro'))  # training recall