In [23]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score


class entryPoint():

    def printaccuracy(self,y_test,predict,model):
        print(model," report")
        print("-------------------------------------")
        print(" ")
        print(" Confusion Matrix " ,confusion_matrix(y_test,predict))
        print(classification_report(y_test,predict))
        print(" ")
        print("-------------------------------------")
        print(" ")
    
    def normalizedata(self,X):
        SS = StandardScaler()
        X = SS.fit_transform(X)
        print("Normalization done")
        return X

    def removeoutliers(self,data,inplace=False):
        prev_rows = len(data)
        data_copy = data.copy()
        z_score = np.abs(stats.zscore(data_copy))
        data_copy = data_copy[(z_score < 3).all(axis=1)]
        if inplace:
          data=data_copy
        print("Before removing outliers , rows - ", prev_rows)
        print("After removing outliers , rows -", len(data_copy))
        print("Number of records deleted - ", (prev_rows - len(data_copy)))
        return data_copy

    def train_split(self,X,y,test_size=0.2,random_state=0):
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
        return X_train,X_test,y_train,y_test

    def knn(self,X_train,y_train,X_test,y_test):
        print("Knn")
        knn_error = []
        for i in range(2,10):
          knn = KNeighborsClassifier(n_neighbors=i)
          knn.fit(X_train,y_train)
          knn_predict= knn.predict(X_test)
          print(type(knn_predict))
          print(type(y_test))
          knn_error.append(np.mean(y_test!=knn_predict))
        plt.plot(range(2,50),knn_error)
        plt.xlabel("K value")
        plt.ylabel("Error")
    
    def knn_grid_search(self,X_train,y_train,X_test,y_test):
        print("Knn Grid Search Starting...")
        neighbors={'n_neighbors':np.array(range(2,10))}
        knn_grid=GridSearchCV(KNeighborsClassifier(),neighbors,verbose=False,refit=True,cv=3)
        knn_grid.fit(X_train,y_train.values.ravel())
        knn_predict = knn_grid.predict(X_test)
        self.printaccuracy(y_test,knn_predict,"KNN")
        print("Best Hyperparameters " + str(knn_grid.best_params_) + " Best Score: " + str(knn_grid.best_score_))
        flScore = f1_score(y_test,knn_predict)
        return flScore
    
    def logisticRegression(self,X_train,y_train,X_test,y_test):
        print("Logistic Regression classification Starting...")
        Co_reg= np.logspace(-4, 4, 20)
        penalty_reg = ['l1','l2']
        max_iteration = [10,100,1000]
        score = []
        for pen in penalty_reg:
            for i in Co_reg:
                for it in max_iteration:
                    clf = LogisticRegression(random_state=0, solver='liblinear', penalty=pen , C=i, max_iter=it).fit(X_train, y_train.values.ravel())
                    score.append(clf.score(X_test, y_test.values.ravel()))

        print("Best Score : " + str(max(score)))
        
    def svm_model(self,X_train,y_train,X_test,y_test,inp_params):
        print("SVM Classification Starting...")
        svm = SVC(kernel='rbf',random_state=0)	
        params = inp_params
        svm_grid = GridSearchCV(svm, params, verbose=1, cv=3,return_train_score=True)
        svm_grid.fit(X_train,y_train.ravel())
        svm_predict = svm_grid.predict(X_test)
        self.printaccuracy(y_test,svm_predict,"SVM")
        print("Best Hyperparameters " + str(svm_grid.best_params_) + " Best Score: " + str(svm_grid.best_score_))
        return f1_score(y_test,svm_predict)

    def decisionTreeClassifier(self,X_train,y_train,X_test,y_test,inp_params):
        print("Decisiontree Classifier Starting...")
        params = inp_params
        decisionTree_grid = GridSearchCV(DecisionTreeClassifier(), params, verbose=1, cv=3,return_train_score=True)
        decisionTree_grid.fit(X_train,y_train.ravel())
        decisionTree_predict = decisionTree_grid.predict(X_test)
        self.printaccuracy(y_test,decisionTree_predict,"DecisionTree")
        print("Best Hyperparameters " + str(decisionTree_predict.best_params_) + " Best Score: " + str(decisionTree_predict.best_score_))
        return f1_score(y_test,decisionTree_predict)
    
    def randomForest(self,X_train,y_train,X_test,y_test,inp_params):
        print("randomForest Classifier Starting...")
        rf = RandomForestClassifier()
        params = inp_params
        rf_grid = GridSearchCV(rf, params, verbose=1, cv=3)
        rf_grid.fit(X_train,y_train.ravel())
        rf_predict = rf_grid.predict(X_test)
        self.printaccuracy(y_test,rf_predict,"RandomForest")
        print("Best Hyperparameters " + str(rf_grid.best_params_) + " Best Score: " + str(rf_grid.best_score_))
        return f1_score(y_test,rf_predict)

    def adaBoost(self,X_train,y_train,X_test,y_test,inp_params):
        print("AdaBoost Classifier Starting...")
        ab = AdaBoostClassifier()
        params = inp_params
        ab_grid = GridSearchCV(ab, params, verbose=1, cv=3)
        ab_grid.fit(X_train,y_train)
        ab_predict = ab_grid.predict(X_test)
        self.printaccuracy(y_test,ab_predict,"AdaBoost")
        print("Best Hyperparameters " + str(ab_grid.best_params_) + " Best Score: " + str(ab_grid.best_score_))
        return f1_score(y_test,ab_predict)
    
    def gaussianNaiveBaise(self,X_train,y_train,X_test,y_test):
        print("GaussianNaiveBaive Classifier Starting... ")
        gnb = GaussianNB()
        gnb.fit(X_train,y_train)
        gnb_predict = gnb.predict(X_test)
        self.printaccuracy(y_test,gnb_predict,"Naive Bayes")
        return f1_score(y_test,gnb_predict)

    def neuralNetworks(self,X_train,y_train,X_test,y_test,inp_params):
        print("NeuralNetworks Classifier Starting...")
        nn = MLPClassifier(solver='sgd',random_state=0)
        params = inp_params
        nn_grid = GridSearchCV(nn, params, cv=3)
        nn_grid.fit(X_train,y_train)
        nn_predict = nn_grid.predict(X_test)
        self.printaccuracy(y_test,nn_predict,"Neural Networks")
        print("Best Hyperparameters " + str(nn_grid.best_params_) + " Best Score: " + str(nn_grid.best_score_))
        return f1_score(y_test,nn_predict)

    def train_models(self,X_train,y_train,X_test,y_test,):
        f1scores = []
        f1scores.append(1)
        f1scores.append(self.knn_grid_search(X_train,y_train,X_test,y_test))
        f1scores.append(self.logisticRegression(X_train,y_train,X_test,y_test))
        f1scores.append(self.randomForest(X_train,y_train,X_test,y_test))
        f1scores.append(self.decisionTreeClassifier(X_train,y_train,X_test,y_test))
        f1scores.append(self.adaBoost(X_train,y_train,X_test,y_test))
        f1scores.append(self.gaussianNaiveBaise(X_train,y_train,X_test,y_test))
        f1scores.append(self.neuralNetworks(X_train,y_train,X_test,y_test))
        f1scores.append(self.neuralNetworks(X_train,y_train,X_test,y_test))
        f1scores.append(self.randomForest(X_train,y_train,X_test,y_test))
        f1scores.append(self.svm_model(X_train,y_train,X_test,y_test))     
        return f1scores

    def creditCardDataset(self):
        #For credit card Defaulters 
        df = pd.read_csv("../Datasets/Credit_card/credit.csv")
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(axis=0, inplace=True)
        df = df.iloc[1:]
        df = df.astype(float)
        df = self.removeoutliers(df,inplace=True)
        X = df.iloc[:,:23]
        y = df.iloc[:,23:24]
        X = self.normalizedata(X)
        X_train,X_test,y_train,y_test = self.train_split(X,y)
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        flscores = self.train_models(X_train,y_train,X_test,y_test)
        print(flscores)


    def thoratic(self):
        data = pd.read_csv("../Datasets/9.ThoraticSurgeryData/ThoraricSurgery.arff",delimiter = ',',names=["DGN", "PRE4", "PRE5", "PRE6","PRE7","PRE8","PRE9","PRE10","PRE11","PRE14","PRE17","PRE19","PRE25","PRE30","PRE32","AGE","Risk1Y"])
        data.head()
        #Preprocessing
        X = pd.DataFrame(data,columns=["DGN", "PRE4", "PRE5", "PRE6","PRE7","PRE8","PRE9","PRE10","PRE11","PRE14","PRE17","PRE19","PRE25","PRE30","PRE32","AGE"])
        cat = ["DGN","PRE6","PRE7","PRE8","PRE9","PRE10","PRE11","PRE14","PRE17","PRE19","PRE25","PRE30","PRE32"]
        for i in cat:
            X[i] = pd.Categorical(X[i]).codes
        y = data.iloc[:,16:17]
        y['Risk1Y'] = pd.Categorical(y['Risk1Y']).codes
        X = self.normalizedata(X)
        X_train,X_test,y_train,y_test = self.train_split(X,y)
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        flscores = self.train_models(X_train,y_train,X_test,y_test)
        print(flscores)
        
    def seismicbumps(self):
        data = pd.read_csv("../Datasets/SeismicBumps/seismic-bumps.arff",delimiter = ',',names=["seismic","seismoacoustic","shift","genergy","gpuls","gdenergy","gdpuls","ghazard","nbumps","nbumps2","nbumps3","nbumps4","nbumps5","nbumps6","nbumps7","nbumps89","energy","maxenergy","class"])
        data.head()
        #Preprocessing
        X = pd.DataFrame(data,columns=["seismic","seismoacoustic","shift","genergy","gpuls","gdenergy","gdpuls","ghazard","nbumps","nbumps2","nbumps3","nbumps4","nbumps5","nbumps6","nbumps7","nbumps89","energy","maxenergy"])
        cat = ["seismic","seismoacoustic","shift","ghazard"]
        for i in cat:
            X[i] = pd.Categorical(X[i]).codes
        y = data.iloc[:,18:29]
        y['class'] = pd.Categorical(y['class']).codes
        X = self.normalizedata(X)
        X_train,X_test,y_train,y_test = self.train_split(X,y)
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        flscores = self.train_models(X_train,y_train,X_test,y_test)
        print(flscores)
        
    def steel_plates_faults(self):
        #Multiclass
        long_list= ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
        data = pd.read_csv("../Datasets/SteelPlatesFaults/Faults.NNA",delimiter = '\s+',names=long_list)
        X = pd.DataFrame(data,columns=['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'])
        y = data.iloc[:,27:34]
        #Converting 7 columns into one y 'class' column
        def fun1(x):
            for i in range(len(x)):
                if x[i] == 1:
                    return i
        y1= []        
        for j in range(len(y)):        
            y1.append((fun1(y.iloc[j]))) 
        y2 = pd.DataFrame(y1)
        y2.columns=['Class']
        y=y2
        X = self.normalizedata(X)
        X_train,X_test,y_train,y_test = self.train_split(X,y)
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)
        flscores = self.train_models(X_train,y_train,X_test,y_test)
        print(flscores)

In [24]:
entrypoint = entryPoint()
entrypoint.steel_plates_faults()

Normalization done
(1358, 27)
(583, 27)
(1358, 1)
(583, 1)
Knn Grid Search Starting...
KNN  report
-------------------------------------
 
 Confusion Matrix  [[ 27   1   0   0   1  10  12]
 [  0  53   0   0   0   0   7]
 [  0   0 108   1   0   0   5]
 [  0   0   0  17   0   2   1]
 [  0   0   0   0  13   0   0]
 [  6   6   0   1   3  94  25]
 [  9   5   6   3   1  44 122]]
              precision    recall  f1-score   support

           0       0.64      0.53      0.58        51
           1       0.82      0.88      0.85        60
           2       0.95      0.95      0.95       114
           3       0.77      0.85      0.81        20
           4       0.72      1.00      0.84        13
           5       0.63      0.70      0.66       135
           6       0.71      0.64      0.67       190

    accuracy                           0.74       583
   macro avg       0.75      0.79      0.77       583
weighted avg       0.74      0.74      0.74       583

 
-------------------------

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].