In [0]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score


class entryPoint():
  """Data-preparation helpers and scikit-learn classifier runners for the credit-card experiment.

  Each classifier method fits on the training split, predicts the test split,
  prints a report through ``printaccuracy`` and (where noted) returns the test
  F1 score. The object keeps no state; everything is passed as arguments.
  """

  def printaccuracy(self,y_test,predict,model):
    """Print the confusion matrix and classification report for one model.

    Args:
      y_test: true labels of the test split.
      predict: labels predicted for the test split.
      model: display name used in the report header.
    """
    print(model," report")
    print("-------------------------------------")
    print(" ")
    print(" Confusion Matrix " ,confusion_matrix(y_test,predict))
    print(classification_report(y_test,predict))
    print(" ")
    print("-------------------------------------")
    print(" ")

  def normalizedata(self,X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler`` and return the result.

    The scaler is fitted on exactly the data passed in, so callers decide
    which rows contribute to the statistics.
    """
    SS = StandardScaler()
    X = SS.fit_transform(X)
    print("Normalization done")
    return X

  def removeoutliers(self,data,inplace=False):
    """Drop every row whose absolute z-score is 3 or more in any column.

    Args:
      data: numeric table (a DataFrame is required when ``inplace`` is True).
      inplace: when True the outlier rows are dropped from ``data`` itself and
        ``data`` is returned; when False ``data`` is left untouched and a
        filtered copy is returned.

    Returns:
      The table without the outlier rows.
    """
    prev_rows = len(data)
    z_score = np.abs(np.asarray(stats.zscore(np.asarray(data))))
    # A constant column has zero variance and yields NaN z-scores. NaN compares
    # False against the threshold, so such a column never flags a row (the old
    # "z < 3 for all columns" test dropped every row in that case).
    is_outlier = (z_score >= 3).any(axis=1)
    if inplace:
      # Label-based drop: assumes index labels are unique (true for a default
      # RangeIndex or any slice of one).
      data.drop(index=data.index[is_outlier], inplace=True)
      cleaned = data
    else:
      cleaned = data[~is_outlier].copy()
    print("Before removing outliers , rows - ", prev_rows)
    print("After removing outliers , rows -", len(cleaned))
    print("Number of records deleted - ", (prev_rows - len(cleaned)))
    return cleaned

  def train_split(self,X,y,test_size=0.2,random_state=0):
    """Split ``X``/``y`` into train and test parts.

    ``test_size`` and ``random_state`` are forwarded to ``train_test_split``
    so the split is reproducible and honours the caller's proportions.

    Returns:
      Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train,X_test,y_train,y_test=train_test_split(
        X,y,test_size=test_size,random_state=random_state)
    return X_train,X_test,y_train,y_test

  def knn(self,X_train,y_train,X_test,y_test):
    """Plot the test error rate of k-NN for k = 2..9.

    Returns:
      List of error rates, one per k, in the order plotted.
    """
    print("Knn")
    k_values = range(2,10)
    y_true = np.ravel(y_test)
    knn_error = []
    for k in k_values:
      knn = KNeighborsClassifier(n_neighbors=k)
      knn.fit(X_train,y_train)
      knn_predict= knn.predict(X_test)
      knn_error.append(np.mean(y_true!=knn_predict))
    # x and y must have the same length, so plot against the k values actually tried.
    plt.plot(k_values,knn_error)
    plt.xlabel("K value")
    plt.ylabel("Error")
    return knn_error

  def knn_grid_search(self,X_train,y_train,X_test,y_test,crosVali):
    """Grid-search ``n_neighbors`` over 10, 40, 70 with ``crosVali``-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("Knn Grid Search Starting...")
    neighbors={'n_neighbors':np.array(range(10,100,30))}
    knn_grid=GridSearchCV(KNeighborsClassifier(),neighbors,verbose=False,refit=True,cv=crosVali)
    knn_grid.fit(X_train,y_train)
    knn_predict = knn_grid.predict(X_test)
    self.printaccuracy(y_test,knn_predict,"KNN")
    print("Best Hyperparameters " + str(knn_grid.best_params_) + " Best Score: " + str(knn_grid.best_score_))
    flScore = f1_score(y_test,knn_predict)
    return flScore

  def logisticRegression(self,X_train,y_train,X_test,y_test):
    """Try every penalty / C / max_iter combination and print the best test accuracy.

    Labels may be pandas objects or NumPy arrays; they are flattened with
    ``np.ravel``. Nothing is returned.
    """
    print("Logistic Regression classification Starting...")
    Co_reg= np.logspace(-4, 4, 20)
    penalty_reg = ['l1','l2']
    max_iteration = [10,100,1000]
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)
    score = []
    for pen in penalty_reg:
      for i in Co_reg:
        for it in max_iteration:
          clf = LogisticRegression(random_state=0, solver='liblinear', penalty=pen , C=i, max_iter=it).fit(X_train, y_train_flat)
          # NOTE(review): the score is measured on the test split, so the
          # reported maximum is an optimistic estimate.
          score.append(clf.score(X_test, y_test_flat))

    print("Best Score : " + str(max(score)))

  def svm_model(self,X_train,y_train,X_test,y_test,inp_params,croVali):
    """Grid-search an RBF-kernel SVC over ``inp_params`` with ``croVali``-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("SVM Classification Starting...")
    svm = SVC(kernel='rbf',random_state=0)
    params = inp_params
    # croVali was previously accepted but never handed to the search.
    svm_grid = GridSearchCV(svm, params, verbose=1, cv=croVali, return_train_score=True)
    svm_grid.fit(X_train,np.ravel(y_train))
    svm_predict = svm_grid.predict(X_test)
    self.printaccuracy(y_test,svm_predict,"SVM")
    print("Best Hyperparameters " + str(svm_grid.best_params_) + " Best Score: " + str(svm_grid.best_score_))
    return f1_score(y_test,svm_predict)

  def decisionTreeClassifier(self,X_train,y_train,X_test,y_test,inp_params):
    """Grid-search a decision tree over ``inp_params`` with 3-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("Decisiontree Classifier Starting...")
    params = inp_params
    decisionTree_grid = GridSearchCV(DecisionTreeClassifier(), params, verbose=1, cv=3,return_train_score=True)
    decisionTree_grid.fit(X_train,np.ravel(y_train))
    decisionTree_predict = decisionTree_grid.predict(X_test)
    self.printaccuracy(y_test,decisionTree_predict,"DecisionTree")
    # best_params_ / best_score_ live on the search object, not on the predictions.
    print("Best Hyperparameters " + str(decisionTree_grid.best_params_) + " Best Score: " + str(decisionTree_grid.best_score_))
    return f1_score(y_test,decisionTree_predict)

  def randomForest(self,X_train,y_train,X_test,y_test,inp_params):
    """Grid-search a random forest over ``inp_params`` with 3-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("randomForest Classifier Starting...")
    rf = RandomForestClassifier()
    params = inp_params
    rf_grid = GridSearchCV(rf, params, verbose=1, cv=3)
    rf_grid.fit(X_train,np.ravel(y_train))
    rf_predict = rf_grid.predict(X_test)
    self.printaccuracy(y_test,rf_predict,"RandomForest")
    print("Best Hyperparameters " + str(rf_grid.best_params_) + " Best Score: " + str(rf_grid.best_score_))
    return f1_score(y_test,rf_predict)

  def adaBoost(self,X_train,y_train,X_test,y_test,inp_params):
    """Grid-search AdaBoost over ``inp_params`` with 3-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("AdaBoost Classifier Starting...")
    ab = AdaBoostClassifier()
    params = inp_params
    ab_grid = GridSearchCV(ab, params, verbose=1, cv=3)
    ab_grid.fit(X_train,y_train)
    ab_predict = ab_grid.predict(X_test)
    self.printaccuracy(y_test,ab_predict,"AdaBoost")
    print("Best Hyperparameters " + str(ab_grid.best_params_) + " Best Score: " + str(ab_grid.best_score_))
    return f1_score(y_test,ab_predict)

  def gaussianNaiveBaive(self,X_train,y_train,X_test,y_test):
    """Fit Gaussian naive Bayes and return its F1 score on the test split."""
    print("GaussianNaiveBaive Classifier Starting... ")
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    gnb_predict = gnb.predict(X_test)
    self.printaccuracy(y_test,gnb_predict,"Naive Bayes")
    return f1_score(y_test,gnb_predict)

  def neuralNetworks(self,X_train,y_train,X_test,y_test,inp_params):
    """Grid-search an SGD-trained MLP over ``inp_params`` with 3-fold CV.

    Returns:
      F1 score of the refitted best estimator on the test split.
    """
    print("NeuralNetworks Classifier Starting...")
    nn = MLPClassifier(solver='sgd',random_state=0)
    params = inp_params
    nn_grid = GridSearchCV(nn, params, cv=3)
    nn_grid.fit(X_train,y_train)
    nn_predict = nn_grid.predict(X_test)
    self.printaccuracy(y_test,nn_predict,"Neural Networks")
    print("Best Hyperparameters " + str(nn_grid.best_params_) + " Best Score: " + str(nn_grid.best_score_))
    return f1_score(y_test,nn_predict)

  def train_models(self,X_train,y_train,X_test,y_test,svm_params,decisiontree_params,random_forest,adaboost_params,nn_params):
    """Run the enabled classifiers and collect their test F1 scores.

    Only the SVM grid search (2-fold CV) is wired in at the moment; the
    decision-tree, random-forest, AdaBoost and neural-network grids are
    accepted so the signature stays stable but are not used here.

    Returns:
      List of F1 scores, one per model that was run.
    """
    f1scores = []
    f1scores.append(self.svm_model(X_train,y_train,X_test,y_test,svm_params,2))
    return f1scores

  def creditCardDataset(self):
    """Load ``credit.csv``, clean it, split it and print the F1 scores from ``train_models``.

    The first column and the first remaining row are discarded, rows with
    missing values and z-score outliers are removed, the first 23 columns are
    used as features and the 24th as the label.
    """
    #For credit card Defaulters
    df = pd.read_csv("credit.csv")
    df = df.drop(df.columns[0], axis=1)
    df = df.dropna(axis=0)
    # presumably the first row is a second header line - TODO confirm against the CSV
    df = df.iloc[1:]
    df = df.astype(float)
    df = self.removeoutliers(df,inplace=True)
    X = df.iloc[:,:23]
    y = df.iloc[:,23:24]
    # Use this instance rather than a module-level variable that may not exist.
    # NOTE(review): scaling before the split lets test rows influence the
    # scaler statistics; consider fitting the scaler on the training rows only.
    X = self.normalizedata(X)
    X_train,X_test,y_train,y_test = self.train_split(X,y)
    svm_params = { 'C' : np.logspace(0, 3, 4), 'gamma' : np.logspace(-2, 1, 4)}
    # max_depth and n_estimators must be integers, so build those grids with
    # integer ranges instead of float-valued linspace.
    decisiontree_params = {'max_depth' : np.arange(1, 11),'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)}
    random_forest_params = {'n_estimators' : np.arange(10, 101, 10),'max_depth' : np.array([1, 6])}
    adaBoost_params = {'n_estimators' : np.arange(10, 101, 10)}
    nn_params = {'hidden_layer_sizes': np.arange(30,150,20),'learning_rate': ['constant','invscaling','adaptive'],'max_iter': np.arange(20,200,50)}
    flscores = self.train_models(X_train,y_train.values.ravel(),X_test,y_test.values.ravel(),svm_params,decisiontree_params,random_forest_params,
                                 adaBoost_params,nn_params)
    print(flscores)
	



In [9]:
# Build the helper object and run the credit-card experiment: it reads
# credit.csv, trains the models wired into train_models and prints their F1 scores.
entrypoint = entryPoint()
entrypoint.creditCardDataset()

Before removing outliers , rows -  30000
After removing outliers , rows - 26429
Number of records deleted -  3571
Normalization done
81.3% training accuracy for C=1.0 gamma=0.01
81.4% training accuracy for C=1.0 gamma=0.10
78.7% training accuracy for C=1.0 gamma=1.00
77.5% training accuracy for C=1.0 gamma=10.00
81.5% training accuracy for C=10.0 gamma=0.01
79.5% training accuracy for C=10.0 gamma=0.10
76.3% training accuracy for C=10.0 gamma=1.00
76.9% training accuracy for C=10.0 gamma=10.00
81.5% training accuracy for C=100.0 gamma=0.01
76.6% training accuracy for C=100.0 gamma=0.10
74.7% training accuracy for C=100.0 gamma=1.00
76.8% training accuracy for C=100.0 gamma=10.00
80.4% training accuracy for C=1000.0 gamma=0.01
73.3% training accuracy for C=1000.0 gamma=0.10
74.2% training accuracy for C=1000.0 gamma=1.00


KeyboardInterrupt: ignored