* dummy conversion

In [1]:
import os
import pandas as pd
from collections import defaultdict

from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import r2_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [2]:
BASE_DIR = os.getcwd()
file_path = '%s/../Data_EDA.csv'%BASE_DIR

In [3]:
df = pd.read_csv(file_path, encoding="latin-1")

X_cols = [col_name for col_name in df.columns if (df[col_name].dtype == 'int64' and col_name != 'ORDER_STATUS')]
X = df[X_cols].values
y = df.ORDER_STATUS.values

In [4]:
print('-------------------------- APPLYING BASIC DATA MODELING --------------------------\n')
print('1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2')

splits = 5
random_no = 0

train_X, test_X, train_y, test_y = train_test_split(X, y, train_size = (
        splits - 1)/float(splits), random_state = random_no, stratify = y)

models = {
            'Naive Bayes'          : GaussianNB(),
            'BernoulliNB'          : BernoulliNB(),
            'K Nearest Neighbours' : KNeighborsClassifier(),
            'Linear SVC'           : LinearSVC(random_state = random_no),
            'MLP Classifier'       : MLPClassifier(random_state = random_no),
            'Logistic Regression'  : LogisticRegression(random_state = random_no),
            'Decision Trees'       : DecisionTreeClassifier(random_state = random_no),
            'Random Forest'        : RandomForestClassifier(random_state = random_no),
            'Gradient Boosting'    : GradientBoostingClassifier(random_state = random_no)
}

print('2. Iterating through multiple classification models with random_state = 0')
print('3. Applying cross validation with number of splits = 5 on train dataset')

model_cs = defaultdict(float)

for model_name, clf in models.items():
    scores = cross_val_score(clf, train_X, train_y, cv = splits, scoring='accuracy')
    model_cs[model_name] = scores.mean()

print('4. Finding mean cross validation score for each model')

for model_name, score in model_cs.items():
    print("\t{0:25} {1}".format(model_name, str(score)))

best_model = max(model_cs, key=model_cs.get)

print('\n5. Applying best model i.e. %s on the test dataset'%best_model)
model = models[best_model]
model.fit(train_X, train_y)
pred_y = model.predict(test_X)

print('6. Finding accuracy using accuracy_score/clf.score')
print('\tTest accuracy score: %f\n'%model.score(test_X, test_y))

print('7. Populating the confusion matrix')
print(confusion_matrix(test_y, pred_y))

-------------------------- APPLYING BASIC DATA MODELING --------------------------

1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2
2. Iterating through multiple classification models with random_state = 0
3. Applying cross validation with number of splits = 5 on train dataset
4. Finding mean cross validation score for each model
	Linear SVC                0.483167912533
	K Nearest Neighbours      0.610999858848
	Decision Trees            0.58909572917
	Gradient Boosting         0.621142106861
	MLP Classifier            0.541519228836
	Random Forest             0.611259384476
	BernoulliNB               0.576036278097
	Naive Bayes               0.575236926853
	Logistic Regression       0.580043372729

5. Applying best model i.e. Gradient Boosting on the test dataset
6. Finding accuracy using accuracy_score/clf.score
	Test accuracy score: 0.621060

7. Populating the confusion matrix
[[ 3393  7082]
 [ 2044 11564]]




## Only run this

In [None]:
print('------- APPLYING DATA MODELING WITH GRID SEARCH CV FOR PARAMETER SELECTION -------\n')
print('1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2')

splits = 5
random_no = 0

train_X, test_X, train_y, test_y = train_test_split(X, y, train_size = (
        splits - 1)/float(splits), random_state = random_no, stratify = y)

models = {
            'K Nearest Neighbours'   : KNeighborsClassifier(),
            'SVC'                    : SVC(random_state = random_no),
            'MLP Classifier'         : MLPClassifier(random_state = random_no),
            'Logistic Regression'    : LogisticRegression(random_state = random_no),
            'Decision Trees'         : DecisionTreeClassifier(random_state = random_no),
            'Random Forest'          : RandomForestClassifier(random_state = random_no),
            'Gradient Boosting'      : GradientBoostingClassifier(random_state = random_no)
}

params_obj = {
            'K Nearest Neighbours'   : {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']},
            'SVC'                    : {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                                        'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                                        'kernel': ['linear', 'rbf']},
            'MLP Classifier'         : {'activation' : ['logistic', 'tanh', 'relu'],
                                        'solver' : ['lbfgs', 'sgd', 'adam'],
                                        'hidden_layer_sizes' : [[10], [100], [10, 100]],
                                        'alpha' : [0.0001, 0.001, 0.01, 0.1, 1]},
            'Logistic Regression'    : {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
            'Decision Trees'         : {'criterion': ['gini', 'entropy'], 'class_weight': ['balanced', None]},
            'Random Forest'          : {'criterion': ['gini', 'entropy'], 'n_estimators': [10, 100, 1000]},
            'Gradient Boosting'      : {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 1000]}
}

print('2. Iterating through multiple classification models having random_state = 0')
print('3. Applying cross validation with number of splits = 5 on train dataset')

model_cs = defaultdict(list)

for model_name, clf in models.items():
    grid = GridSearchCV(clf, param_grid = params_obj[model_name], cv = splits, n_jobs = 3, pre_dispatch = '2*n_jobs')
    grid.fit(train_X, train_y)
    model_cs[model_name] = [grid.best_score_, grid.best_params_]

print('4. Finding mean cross validation score for each model')

for model_name, insight in model_cs.items():
    print("\t{0:25} {1:20} {2}".format(model_name, str(insight[0]), str(insight[1])))
    
best_model = [model_name for model_name, insight in model_cs.items() if insight[0] == max([x[0] for x in model_cs.values()])][0]
print('\n5. Applying best model i.e. %s on the test dataset'%best_model)

------- APPLYING DATA MODELING WITH GRID SEARCH CV FOR PARAMETER SELECTION -------

1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2
2. Iterating through multiple classification models having random_state = 0
3. Applying cross validation with number of splits = 5 on train dataset


In [None]:
# model = models[best_model]

In [None]:
# model.fit(train_X, train_y)
# pred_y = model.predict(test_X)

# print('6. Finding accuracy using accuracy_score/clf.score')
# print('\tTest accuracy score: %f\n'%model.score(test_X, test_y))

# print('7. Populating the confusion matrix')
# print(confusion_matrix(test_y, pred_y))

In [5]:
for component in range(1,5):
    pca = PCA(n_components=component)
    pca.fit(X)
    print(component, pca.explained_variance_ratio_)

1 [ 0.65346604]
2 [ 0.65346604  0.28585104]
3 [ 0.65346604  0.28585104  0.041134  ]
4 [ 0.65346604  0.28585104  0.041134    0.01890879]


In [6]:
print('---------------- APPLYING BASIC DATA MODELING ON PCA APPLIED DATA ----------------\n')
print('1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2')

splits = 5
random_no = 0

pca = PCA(n_components = 2, random_state=random_no)
X_p = pca.fit(X).transform(X)

train_X, test_X, train_y, test_y = train_test_split(X_p, y, train_size = (
        splits - 1)/float(splits), random_state = random_no, stratify = y)

models = {
            'Naive Bayes'          : GaussianNB(),
            'BernoulliNB'          : BernoulliNB(),
            'K Nearest Neighbours' : KNeighborsClassifier(),
            'MLP Classifier'       : MLPClassifier(random_state = random_no),
            'Linear SVC'           : LinearSVC(random_state = random_no),
            'Logistic Regression'  : LogisticRegression(random_state = random_no),
            'Decision Trees'       : DecisionTreeClassifier(random_state = random_no),
            'Random Forest'        : RandomForestClassifier(random_state = random_no),
            'Gradient Boosting'    : GradientBoostingClassifier(random_state = random_no)
}

print('2. Iterating through multiple classification models with random_state = 0')
print('3. Applying cross validation with number of splits = 5 on train dataset')

model_cs = defaultdict(float)

for model_name, clf in models.items():
    scores = cross_val_score(clf, train_X, train_y, cv = splits, scoring='accuracy')
    model_cs[model_name] = scores.mean()

print('4. Finding mean cross validation score for each model')

for model_name, score in model_cs.items():
    print("\t{0:25} {1}".format(model_name, str(score)))

best_model = max(model_cs, key=model_cs.get)

print('\n5. Applying best model i.e. %s on the test dataset'%best_model)
model = models[best_model]
model.fit(train_X, train_y)
pred_y = model.predict(test_X)

print('6. Finding accuracy using accuracy_score/clf.score')
print('\tTest accuracy score: %f\n'%model.score(test_X, test_y))

print('7. Populating the confusion matrix')
print(confusion_matrix(test_y, pred_y))

---------------- APPLYING BASIC DATA MODELING ON PCA APPLIED DATA ----------------

1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2
2. Iterating through multiple classification models with random_state = 0
3. Applying cross validation with number of splits = 5 on train dataset
4. Finding mean cross validation score for each model
	Linear SVC                0.513977989893
	K Nearest Neighbours      0.607906243755
	Decision Trees            0.583801358279
	Gradient Boosting         0.608093110592
	MLP Classifier            0.534097016353
	Random Forest             0.591742906533
	BernoulliNB               0.57572484648
	Naive Bayes               0.57605705461
	Logistic Regression       0.57482167895

5. Applying best model i.e. Gradient Boosting on the test dataset
6. Finding accuracy using accuracy_score/clf.score
	Test accuracy score: 0.610721

7. Populating the confusion matrix
[[ 2809  7666]
 [ 1709 11899]]




In [None]:
print('--------------- APPLYING BASIC DATA MODELING ON MIN-MAX SCALED DATA --------------\n')
print('1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2')

splits = 5
random_no = 0

X_scaled = MinMaxScaler().fit(X).transform(X)

train_X, test_X, train_y, test_y = train_test_split(X_scaled, y, train_size = (
        splits - 1)/float(splits), random_state = random_no, stratify = y)

models = {
            'Naive Bayes'          : GaussianNB(),
            'BernoulliNB'          : BernoulliNB(),
            'K Nearest Neighbours' : KNeighborsClassifier(),
            'MLP Classifier'       : MLPClassifier(random_state = random_no),
            'Linear SVC'           : LinearSVC(random_state = random_no),
            'Logistic Regression'  : LogisticRegression(random_state = random_no),
            'Decision Trees'       : DecisionTreeClassifier(random_state = random_no),
            'Random Forest'        : RandomForestClassifier(random_state = random_no),
            'Gradient Boosting'    : GradientBoostingClassifier(random_state = random_no)
}

print('2. Iterating through multiple classification models with random_state = 0')
print('3. Applying cross validation with number of splits = 5 on train dataset')

model_cs = defaultdict(float)

for model_name, clf in models.items():
    scores = cross_val_score(clf, train_X, train_y, cv = splits, scoring='accuracy')
    model_cs[model_name] = scores.mean()

print('4. Finding mean cross validation score for each model')

for model_name, score in model_cs.items():
    print("\t{0:25} {1}".format(model_name, str(score)))

best_model = max(model_cs, key=model_cs.get)

print('\n5. Applying best model i.e. %s on the test dataset'%best_model)
model = models[best_model]
model.fit(train_X, train_y)
pred_y = model.predict(test_X)

print('6. Finding accuracy using accuracy_score/clf.score')
print('\tTest accuracy score: %f\n'%model.score(test_X, test_y))

print('7. Populating the confusion matrix')
print(confusion_matrix(test_y, pred_y))

--------------- APPLYING BASIC DATA MODELING ON MIN-MAX SCALED DATA --------------

1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2
2. Iterating through multiple classification models with random_state = 0

In [7]:
for component in range(1,5):
    pca = PCA(n_components=component)
    pca.fit(X_scaled)
    print(component, pca.explained_variance_ratio_)

NameError: name 'X_scaled' is not defined

In [13]:
pca = PCA(n_components=0.95, random_state=random_no)
pca.fit(X_scaled)
print(pca.explained_variance_ratio_)

[ 0.21370274  0.19434621  0.10001686  0.0922707   0.06556503  0.05720512
  0.05282751  0.04830007  0.04729018  0.03825636  0.03633813  0.01770744]


In [14]:
print('------ APPLYING BASIC DATA MODELING ON MIN-MAX SCALED AND PCA APPLIED DATA ------\n')
print('1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2')

splits = 5
random_no = 0

X_scaled = MinMaxScaler().fit(X).transform(X)
pca = PCA(n_components = 0.95, random_state=random_no)
X_p = pca.fit(X_scaled).transform(X)

train_X, test_X, train_y, test_y = train_test_split(X_p, y, train_size = (
        splits - 1)/float(splits), random_state = random_no, stratify = y)

models = {
            'Naive Bayes'          : GaussianNB(),
            'BernoulliNB'          : BernoulliNB(),
            'K Nearest Neighbours' : KNeighborsClassifier(),
            'MLP Classifier'       : MLPClassifier(random_state = random_no),
            'Linear SVC'           : LinearSVC(random_state = random_no),
            'Logistic Regression'  : LogisticRegression(random_state = random_no),
            'Decision Trees'       : DecisionTreeClassifier(random_state = random_no),
            'Random Forest'        : RandomForestClassifier(random_state = random_no),
            'Gradient Boosting'    : GradientBoostingClassifier(random_state = random_no)
}

print('2. Iterating through multiple classification models with random_state = 0')
print('3. Applying cross validation with number of splits = 5 on train dataset')

model_cs = defaultdict(float)

for model_name, clf in models.items():
    scores = cross_val_score(clf, train_X, train_y, cv = splits, scoring='accuracy')
    model_cs[model_name] = scores.mean()

print('4. Finding mean cross validation score for each model')

for model_name, score in model_cs.items():
    print("\t{0:25} {1}".format(model_name, str(score)))

best_model = max(model_cs, key=model_cs.get)

print('\n5. Applying best model i.e. %s on the test dataset'%best_model)
model = models[best_model]
model.fit(train_X, train_y)
pred_y = model.predict(test_X)

print('6. Finding accuracy using accuracy_score/clf.score')
print('\tTest accuracy score: %f\n'%model.score(test_X, test_y))

print('7. Populating the confusion matrix')
print(confusion_matrix(test_y, pred_y))

------ APPLYING BASIC DATA MODELING ON MIN-MAX SCALED AND PCA APPLIED DATA ------

1. Applying stratified split b/w train and test. train/test split ratio is 0.8/0.2
2. Iterating through multiple classification models with random_state = 0
3. Applying cross validation with number of splits = 5 on train dataset
4. Finding mean cross validation score for each model
	Linear SVC                0.497265286144
	Naive Bayes               0.575226555032
	Gradient Boosting         0.615401382301
	Decision Trees            0.591825973788
	Logistic Regression       0.579129825294
	Random Forest             0.613937659522
	K Nearest Neighbours      0.611165914147

5. Applying best model i.e. Gradient Boosting on the test dataset
6. Finding accuracy using accuracy_score/clf.score
	Test accuracy score: 0.613254

7. Populating the confusion matrix
[[ 3152  7323]
 [ 1991 11617]]
