In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

### Разделим фичи по типам, отделим таргет и преобразуем его, разделим на тренировочную и тестирующую выборку

In [2]:
data_file = pd.read_csv('data.csv')

target = 'satisfaction'
drop_features = ['Unnamed: 0', 'id', 'Arrival Delay in Minutes']
numerical_features = ['Age', 'Flight Distance', 'Departure Delay in Minutes']
categorical_features_to_onehot = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
categorical_features_other = [feat for feat in list(data_file.columns) if feat != target and feat not in numerical_features and feat not in drop_features and feat not in categorical_features_to_onehot]
 
X = data_file.drop(target,axis=1)
y = data_file[target]

enc = LabelEncoder()
y = enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

### Перпроцессинг данных

In [3]:
TransformColums = ColumnTransformer([
    ('Drop','drop',drop_features),
    ('Encoder', OneHotEncoder(), categorical_features_to_onehot),
    ('MinMax', MinMaxScaler(), [feature for feature in numerical_features if feature != 'Age']),
    ('StandartScaler', StandardScaler(), ['Age'])
], remainder='passthrough')



### Функция подчета метрик

In [4]:
def Metrics(model, X, y):
    y_pred = model.predict(X)
    print('Accuracy = ', accuracy_score(y, y_pred))
    print('Precision = ', precision_score(y, y_pred))
    print('Recall = ', recall_score(y, y_pred))
    print('ROC_AUC_score = ', roc_auc_score(y, y_pred))
    print('Confudion_Matrix =')
    print(confusion_matrix(y, y_pred))

### My KNN

In [5]:
from sklearn.metrics import euclidean_distances

class MyKNN(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors = 5):
        self.n_neighbors = n_neighbors
        
    def fit(self, X, y):
        self.fit_X = X
        self.fit_y = y
        return self
    
    
    def predict(self, X):
        distances = np.empty(X.shape[0])
        
        if len(X.shape) == 1:
            distances = euclidean_distances([X],self.fit_X)[0]
        else:
            distances = euclidean_distances(X,self.fit_X)
        distances_sorted = np.argsort(distances, axis=1)
        top_n = distances_sorted[:,:self.n_neighbors]
        
        answer = np.empty(X.shape[0])
        for i, elem in enumerate(top_n):
            local_ans = dict()
            for num in elem:
                if num in local_ans:
                    local_ans[self.fit_y[num]] += 1
                else:
                    local_ans[self.fit_y[num]] = 1
            max_count = max(local_ans, key = lambda k : local_ans.get(k))
            answer[i] = max_count

        return answer

In [6]:
knn_my_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('knn', MyKNN())
])


knn_my = GridSearchCV(knn_my_pipeline,{'knn__n_neighbors':[1,3,5,7]})
knn_my.fit(X_train, y_train)

In [7]:
Metrics(knn_my, X_test, y_test)
pickle.dump(knn_my,open('models/knn_my.pkl','wb'))

Accuracy =  0.9218998123285693
Precision =  0.9212065603853653
Recall =  0.8956289027653881
ROC_AUC_score =  0.9187363171238268
Confudion_Matrix =
[[11126   687]
 [  936  8032]]


In [8]:
knn_my.best_params_

{'knn__n_neighbors': 1}

### KNN Sklearn

In [9]:
knn_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('knn', KNeighborsClassifier())
])

knn_sk = GridSearchCV(knn_sk_pipeline,{'knn__n_neighbors':[1,3,5,7]})
knn_sk.fit(X_train, y_train)

In [10]:
Metrics(knn_sk, X_test, y_test)
pickle.dump(knn_sk,open('models/knn_sk.pkl','wb'))

Accuracy =  0.9318608344160532
Precision =  0.9495238095238095
Recall =  0.8893844781445138
ROC_AUC_score =  0.9267459087581962
Confudion_Matrix =
[[11389   424]
 [  992  7976]]


In [11]:
knn_sk.best_params_

{'knn__n_neighbors': 7}

### My LogisticRegression

In [12]:
class MyLogist(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs = 100, learning_rate = 0.1):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.w = None
        
    def fit(self, X, y):
        if self.w is None:
            self.w = np.random.randn(X.shape[1] + 1)
            
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # add bias as feature
        
        for i in range(self.epochs):
            self.w -= self.learning_rate*self._loss_grad(X,y)
        
        return self
        
    
    def predict(self, X):
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # add bias as feature
        return self._sigmoid(np.dot(X, self.w)) > 0.5
    
    def _predict_no_bias(self, X):
        return self._sigmoid(np.dot(X, self.w)) > 0.5
    
    def _sigmoid(self, X):
        return 1.0 / (1.0 + np.exp(-X))
    
    def _loss_grad(self, X, y):
        return -((y - self._predict_no_bias(X)) @ X)

In [13]:
logit_my_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('logit', MyLogist())
])

logit_my = GridSearchCV(logit_my_pipeline,{'logit__learning_rate':[0.1, 0.5],'logit__epochs':[100, 500, 1000]})
logit_my.fit(X_train, y_train)

  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 /

In [14]:
Metrics(logit_my, X_test, y_test)
pickle.dump(logit_my,open('models/logit_my.pkl','wb'))

Accuracy =  0.86766758096338
Precision =  0.851378842676311
Recall =  0.8399866190900981
ROC_AUC_score =  0.8643342898210161
Confudion_Matrix =
[[10498  1315]
 [ 1435  7533]]


  return 1.0 / (1.0 + np.exp(-X))


### LogisticRegression Sklearn


In [15]:
logit_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('logit', LogisticRegression())
])

logit_sk = GridSearchCV(logit_sk_pipeline,{'logit__C':[10,1,0.1,0.01],'logit__max_iter':[500,1000,5000]})
logit_sk.fit(X_train, y_train)

In [16]:
Metrics(logit_sk, X_test, y_test)
pickle.dump(logit_sk,open('models/logit_sk.pkl','wb'))

Accuracy =  0.8765699436985708
Precision =  0.8759835584263065
Recall =  0.8317350579839429
ROC_AUC_score =  0.8711710082097823
Confudion_Matrix =
[[10757  1056]
 [ 1509  7459]]


### My SVM

In [17]:
class MySVM(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs=1000, learning_rate=0.001, lambd = 0.01):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.lambd = lambd
        self.w = None
    
    def fit(self, X, y):
        if self.w is None:
            self.w = np.random.randn(X.shape[1] + 1)
            
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # add bias as feature
        y = np.where(y == 0, -1, y)
        for i in range(self.epochs):
            self.w -= self.learning_rate*self._grad_loss(X, y)
        
    def predict(self, X):
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)  # add bias as feature
        pred = np.dot(X,self.w)
        pred = np.where(pred >= 0, 1, 0)
        return pred
    
    def _grad_loss(self, X, y):
        loss = self.lambd*self.w
        for i, Xi in enumerate(X):
            if (y[i] * np.dot(self.w, Xi)) <= 0:
                loss -= y[i]*Xi
        return loss

In [18]:
svm_my_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('SVM', MySVM())
])

svm_my = GridSearchCV(svm_my_pipeline,{'SVM__epochs':[500,1000], 'SVM__learning_rate':[0.001, 0.00001]})
svm_my.fit(X_train, y_train)

In [19]:
Metrics(svm_my, X_test, y_test)
pickle.dump(svm_my,open('models/svm_my.pkl','wb'))

Accuracy =  0.8657908666570425
Precision =  0.847564405444932
Recall =  0.8400981266726137
ROC_AUC_score =  0.8626969935826456
Confudion_Matrix =
[[10458  1355]
 [ 1434  7534]]


### SVM Sklearn

In [20]:
svm_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('SVM', LinearSVC())
])
svm_sk = GridSearchCV(svm_sk_pipeline,{'SVM__C':[10, 1, 0.1, 0.01], 'SVM__loss': ['hinge', 'squared_hinge']})
svm_sk.fit(X_train, y_train)



In [21]:
Metrics(svm_sk, X_test, y_test)
pickle.dump(svm_sk,open('models/svm_sk.pkl','wb'))

Accuracy =  0.8769067898561186
Precision =  0.8830066921606119
Recall =  0.8239295272078502
ROC_AUC_score =  0.8705273641287705
Confudion_Matrix =
[[10834   979]
 [ 1579  7389]]


### My naive bayes

In [22]:
class MyNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        labels, counts = np.unique(y, return_counts=True)
        
        self.prob_y = [count / y.shape[0] for count in counts]
        self.means_X_y = [X[y == label].mean(axis = 0) for label in labels]
        self.std_X_y = [X[y == label].std(axis = 0) for label in labels]
        
        return self
    
    def predict(self,X):
        def gaussian_prob(x,std,mean):
            return np.exp(-(x - mean)**2 / (2 * std**2)) / np.sqrt(2*np.pi*std**2)

        y = np.zeros(X.shape[0])
        
        for i, x in enumerate(X):
            var_prob = np.array(self.prob_y)
            for j in range(len(self.prob_y)):
                p = np.array([gaussian_prob(x[k], self.std_X_y[j][k], self.means_X_y[j][k]) for k in range(X.shape[1])])
                var_prob[j] *= np.prod(p)
            y[i] = np.argmax(var_prob)
        return y

In [23]:
nb_my = Pipeline([
    ('Column Transform', TransformColums),
    ('NB', MyNB())
])
nb_my.fit(X_train, y_train)

In [24]:
Metrics(nb_my, X_test, y_test)
pickle.dump(nb_my,open('models/nb_my.pkl','wb'))

Accuracy =  0.846927481834368
Precision =  0.8323188239347651
Recall =  0.8080954504906334
ROC_AUC_score =  0.842251399163881
Confudion_Matrix =
[[10353  1460]
 [ 1721  7247]]


### Naive bayes

In [25]:
nb_sk = Pipeline([
    ('Column Transform', TransformColums),
    ('NB', GaussianNB())
])
nb_sk.fit(X_train, y_train)

In [26]:
Metrics(nb_sk, X_test, y_test)

Accuracy =  0.846927481834368
Precision =  0.8323188239347651
Recall =  0.8080954504906334
ROC_AUC_score =  0.842251399163881
Confudion_Matrix =
[[10353  1460]
 [ 1721  7247]]


In [27]:
pickle.dump(nb_sk,open('models/nb_sk.pkl','wb'))