# Modelling

In [2]:
import pandas as pd
X_train= pd.read_csv('X_train.csv')
X_test= pd.read_csv('X_test.csv')
y_train= pd.read_csv('y_train.csv')
y_test= pd.read_csv('y_test.csv')

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

**Logistic Regression**

In [3]:
# Modelling
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

# Predict & Evaluate on train data
y_pred_train = lr.predict(X_train) # default thresold value is 0.5
print("Train Accuracy :", accuracy_score(y_train, y_pred_train))

# Cross Validation on train data
from sklearn.model_selection import cross_val_score
print('CV Score :', cross_val_score(lr,X_train,y_train,cv=5,scoring="accuracy").mean())

# Predict on test data
y_pred_test = lr.predict(X_test)
print("Test Accuracy :", accuracy_score(y_test, y_pred_test))

Train Accuracy : 0.9709491335983379
CV Score : 0.9664449664779629
Test Accuracy : 0.9783242087455062


**KNN**

In [None]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Define the estimator and parameter grid
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': list(range(1, 50)), 'p': [1, 2]}

# Perform Grid Search
knn_grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
knn_grid.fit(X_train, y_train)

# KNN Model with Best Parameters
knn_grid.best_estimator_

In [None]:
# Modelling
knn = knn_grid.best_estimator_
knn.fit(X_train,y_train)

# Predicting
y_pred = knn.predict(X_train)
print ("KNN Model Accuracy:", accuracy_score(y_train, y_pred))

#Cross Validation on Train data
print ("KNN Model Cross Validation Accuracy:", cross_val_score(knn, X_train, y_train, cv=5, scoring="accuracy").mean())

# Predicting on Test data
y_pred_test = knn.predict(X_test)
print ("KNN Model Test Accuracy:", accuracy_score(y_test, y_pred_test))

**SVM**

In [4]:
# Hyperparameter Tuning
from sklearn.svm import SVC

# Define the estimator and parameter grid
estimator = SVC()
param_grid = {'C': [0.01,0.1,1], 'kernel': ['linear', 'rbf','sigmoid', 'poly']}

# Perform Grid Search
svm_grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
svm_grid.fit(X_train, y_train)

# SVM Model with Best Parameters
svm_grid.best_estimator_

: 

: 

In [None]:
# Modelling
svm = svm_grid.best_estimator_
svm.fit(X_train,y_train)

# Predicting
y_pred = svm.predict(X_train)
print ("SVM Model Accuracy:", accuracy_score(y_train, y_pred))

#Cross Validation on Train data
print ("SVM Model Cross Validation Accuracy:", cross_val_score(svm, X_train, y_train, cv=5, scoring="accuracy").mean())

# Predicting on Test data
y_pred_test = svm.predict(X_test)
print ("SVM Model Test Accuracy:", accuracy_score(y_test, y_pred_test))

**Decision Tree**

In [4]:
model = DecisionTreeClassifier(random_state = True)
model.fit(X_train,y_train)

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(20,20),dpi=300)
plot_tree(model,filled=True,
          feature_names=X_train.columns.tolist(),
          class_names=['0','1'])
plt.show()

In [5]:
# Hpper Parameter tuning
estimator = DecisionTreeClassifier(random_state=True)

param_grid = {'criterion':['gini','entropy'],
              'max_depth':list(range(1,16))}

dt_grid = GridSearchCV(estimator,param_grid,scoring='accuracy',cv=5)
dt_grid.fit(X_train,y_train)

dt = dt_grid.best_estimator_
dt

In [6]:
# Important Features 
feats_ab = pd.DataFrame(data=dt.feature_importances_,
                        index=X.columns,
                        columns=['Importance'])

imporatant_features_dt = feats_ab[feats_ab['Importance']>0].index.tolist()
imporatant_features_dt

NameError: name 'X' is not defined

**Creating Decision tree model with important parameters and important features**

In [None]:
# Selection train & Test Data
X_train_dt = X_train[imporatant_features_dt]
X_test_dt = X_test[imporatant_features_dt]

# Modelling 
dt = dt_grid.best_estimator_
dt.fit(X_train_dt,y_train)

# Evaluaion
ypred_train = dt.predict(X_train_dt)
ypred_test = dt.predict(X_test_dt)

print ("Train Accuracy :",accuracy_score(y_train,ypred_train))
print ("CV Score :",cross_val_score(dt,X_train_dt,y_train,cv=5,scoring='accuracy').mean())
print ("Test Accuracy :",accuracy_score(y_test,ypred_test))

**Random Classifier**

In [None]:
# Hyper parameter tunin
estimator = RandomForestClassifier(random_state=True)

param_grid = {'n_estimators':list(range(1,51))}

rf_grid = GridSearchCV(estimator,param_grid, scoring="accuracy",cv=5)
rf_grid.fit(X_train,y_train)

rf = rf_grid.best_estimator_
rf

In [None]:
# Important features
feats_rf = pd.DataFrame(data = rf.feature_importances_,
                        index=X.columns,
                        columns = ['Importance'])

imporatant_features_rf = feats_rf[feats_rf['Importance']>0].index.tolist()
imporatant_features_rf

In [None]:
X_train_rf = X_train[imporatant_features_rf]
X_test_rf = X_test[imporatant_features_rf]

#Modelling
rf = rf_grid.best_estimator_
rf.fit(X_train_rf,y_train)

# Evaluation
ypred_train = rf.predict(X_train_rf)
ypred_test = rf.predict(X_test_rf)

print ("Train Accuracy :",accuracy_score(y_train,ypred_train))
print ("CV Score :",cross_val_score(rf,X_train_rf,y_train,cv=5,scoring='accuracy').mean())
print ("Test Accuracy :",accuracy_score(y_test,ypred_test))