In [66]:
import scipy.io
from scipy.io import arff
from io import StringIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import seaborn as sns

class entryPoint():

  def printaccuracy(self,y_test,predict,model):
    print(model," report")
    print("-------------------------------------")
    print(" ")
    print(" Confusion Matrix " ,confusion_matrix(y_test,predict))
    print(classification_report(y_test,predict))
    print(" ")
    print("-------------------------------------")
    print(" ")
    
  def normalizedata(self,X):
    SS = StandardScaler()
    X = SS.fit_transform(X)
    print("Normalization done")
    return X
	
	
  def removeoutliers(self,data,inplace=False):
    prev_rows = len(data)
    data_copy = data.copy()
    z_score = np.abs(stats.zscore(data_copy))
    data_copy = data_copy[(z_score < 3).all(axis=1)]
    if inplace:
      data=data_copy
    print("Before removing outliers , rows - ", prev_rows)
    print("After removing outliers , rows -", len(data_copy))
    print("Number of records deleted - ", (prev_rows - len(data_copy)))
    return data_copy

  def train_split(self,X,y,test_size=0.2,random_state=0):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
    return X_train,X_test,y_train,y_test

  def knn(self,X_train,y_train,X_test,y_test):
    print("Knn")
    knn_error = []
    for i in range(2,10):
      knn = KNeighborsClassifier(n_neighbors=i)
      knn.fit(X_train,y_train)
      knn_predict= knn.predict(X_test)
      print(type(knn_predict))
      print(type(y_test))
      knn_error.append(np.mean(y_test!=knn_predict))
    plt.plot(range(2,50),knn_error)
    plt.xlabel("K value")
    plt.ylabel("Error")
	
  def knn_grid_search(self,X_train,y_train,X_test,y_test):
    print("Knn Grid Search Starting...")
    neighbors={'n_neighbors':np.array(range(2,10))}
    knn_grid=sklearn.model_selection.GridSearchCV(KNeighborsClassifier(),neighbors,verbose=False,refit=True,cv=3)
    knn_grid.fit(X_train,y_train.values.ravel())
    knn_predict = knn_grid.predict(X_test)
    self.printaccuracy(y_test,knn_predict,"KNN")
    print("Best Hyperparameters " + str(knn_grid.best_params_) + " Best Score: " + str(knn_grid.best_score_))
    flScore = f1_score(y_test,knn_predict)
    return flScore
    
  def logisticRegression(self,X_train,y_train,X_test,y_test):
    print("Logistic Regression classification Starting...")
    Co_reg= np.logspace(-4, 4, 20)
    penalty_reg = ['l1','l2']
    max_iteration = [10,100,1000]
    score = []
    for pen in penalty_reg:
      for i in Co_reg:
        for it in max_iteration:
          clf = LogisticRegression(random_state=0, solver='liblinear', penalty=pen , C=i, max_iter=it).fit(X_train, y_train.values.ravel())
          score.append(clf.score(X_test, y_test.values.ravel()))
    
    print("Best Score : " + str(max(score)))
        
  def svm_model(self,X_train,y_train,X_test,y_test,inp_params):
    print("SVM Classification Starting...")
    svm = SVC(kernel='rbf',random_state=0)	
    params = inp_params
    svm_grid = GridSearchCV(svm, params, verbose=1, cv=3,return_train_score=True)
    svm_grid.fit(X_train,y_train.ravel())
    svm_predict = svm_grid.predict(X_test)
    self.printaccuracy(y_test,svm_predict,"SVM")
    print("Best Hyperparameters " + str(svm_grid.best_params_) + " Best Score: " + str(svm_grid.best_score_))
    return f1_score(y_test,svm_predict)
		
  def decisionTreeClassifier(self,X_train,y_train,X_test,y_test,inp_params):
    print("Decisiontree Classifier Starting...")
    params = inp_params
    decisionTree_grid = GridSearchCV(DecisionTreeClassifier(), params, verbose=1, cv=3,return_train_score=True)
    decisionTree_grid.fit(X_train,y_train.ravel())
    decisionTree_predict = decisionTree_grid.predict(X_test)
    self.printaccuracy(y_test,decisionTree_predict,"DecisionTree")
    print("Best Hyperparameters " + str(decisionTree_predict.best_params_) + " Best Score: " + str(decisionTree_predict.best_score_))
    return f1_score(y_test,decisionTree_predict)
    
  def randomForest(self,X_train,y_train,X_test,y_test,inp_params):
    print("randomForest Classifier Starting...")
    rf = RandomForestClassifier()
    params = inp_params
    rf_grid = GridSearchCV(rf, params, verbose=1, cv=3)
    rf_grid.fit(X_train,y_train.ravel())
    rf_predict = rf_grid.predict(X_test)
    self.printaccuracy(y_test,rf_predict,"RandomForest")
    print("Best Hyperparameters " + str(rf_grid.best_params_) + " Best Score: " + str(rf_grid.best_score_))
    return f1_score(y_test,rf_predict)
    
  def adaBoost(self,X_train,y_train,X_test,y_test,inp_params):
    print("AdaBoost Classifier Starting...")
    ab = AdaBoostClassifier()
    params = inp_params
    ab_grid = GridSearchCV(ab, params, verbose=1, cv=3)
    ab_grid.fit(X_train,y_train)
    ab_predict = ab_grid.predict(X_test)
    self.printaccuracy(y_test,ab_predict,"AdaBoost")
    print("Best Hyperparameters " + str(ab_grid.best_params_) + " Best Score: " + str(ab_grid.best_score_))
    return f1_score(y_test,ab_predict)
    
  def gaussianNaiveBaive(self,X_train,y_train,X_test,y_test):
    print("GaussianNaiveBaive Classifier Starting... ")
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    gnb_predict = gnb.predict(X_test)
    self.printaccuracy(y_test,gnb_predict,"Naive Bayes")
    return f1_score(y_test,gnb_predict)
		
  def neuralNetworks(self,X_train,y_train,X_test,y_test,inp_params):
    print("NeuralNetworks Classifier Starting...")
    nn = MLPClassifier(solver='sgd',random_state=0)
    params = inp_params
    nn_grid = GridSearchCV(nn, params, cv=3)
    nn_grid.fit(X_train,y_train)
    nn_predict = nn_grid.predict(X_test)
    self.printaccuracy(y_test,nn_predict,"Neural Networks")
    print("Best Hyperparameters " + str(nn_grid.best_params_) + " Best Score: " + str(nn_grid.best_score_))
    return f1_score(y_test,nn_predict)

  def train_models(self,X_train,y_train,X_test,y_test,):
    f1scores = []
    #f1scores.append(self.knn(X_train,y_train,X_test,y_test))
    f1scores.append(self.knn_grid_search(X_train,y_train,X_test,y_test))
    return f1scores


In [67]:
long_list= ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

data = pd.read_csv("../Datasets/SteelPlatesFaults/Faults.NNA",delimiter = '\s+',names=long_list)

X = pd.DataFrame(data,columns=['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'])

y = data.iloc[:,27:34]
y[600:601]




#need to make 7 classes 

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
600,0,0,1,0,0,0,0


In [68]:
#Converting 7 columns into one y 'class' column
def fun1(x):
    for i in range(len(x)):
        if x[i] == 1:
            return i
y1= []        
for j in range(len(y)):        
    y1.append((fun1(y.iloc[j])))        

In [69]:
y2 = pd.DataFrame(y1)

In [70]:
y2.columns=['Class']

In [71]:
y =y2
y


Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [73]:
entrypoint = entryPoint()
X = entrypoint.normalizedata(X)
X_train,X_test,y_train,y_test = entrypoint.train_split(X,y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
flscores = entrypoint.train_models(X_train,y_train,X_test,y_test)
print(flscores)                                        

Normalization done
(1358, 27)
(583, 27)
(1358, 1)
(583, 1)
Knn Grid Search Starting...
KNN  report
-------------------------------------
 
 Confusion Matrix  [[ 20   2   0   0   0  11  11]
 [  0  50   0   0   0   2   3]
 [  0   1 112   0   0   1   3]
 [  0   0   0  26   0   0   0]
 [  0   0   0   0  18   0   0]
 [  6   5   0   0   0  76  24]
 [ 13   6   4   2   0  48 139]]
              precision    recall  f1-score   support

           0       0.51      0.45      0.48        44
           1       0.78      0.91      0.84        55
           2       0.97      0.96      0.96       117
           3       0.93      1.00      0.96        26
           4       1.00      1.00      1.00        18
           5       0.55      0.68      0.61       111
           6       0.77      0.66      0.71       212

    accuracy                           0.76       583
   macro avg       0.79      0.81      0.80       583
weighted avg       0.76      0.76      0.76       583

 
-------------------------

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].