In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score,roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import math
from sklearn.naive_bayes import BernoulliNB 
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.model_selection import GridSearchCV
import pickle 

In [3]:
def metrics(model):
    
    print(f'Доля верных ответов на обучающей выборки = {accuracy_score(y_train, model.predict(X_train))}')
    print(f'Доля верных ответов на тестовой выборки = {accuracy_score(y_test, model.predict(X_test))}')
    
    print(f'Recall_score на обучающей выборки = {recall_score(y_train, model.predict(X_train))}')
    print(f'Recall_score на тестовой выборки = {recall_score(y_test, model.predict(X_test))}')
    
    print(f'Precision_score на обучающей выборки = {precision_score(y_train, model.predict(X_train))}')
    print(f'Precision_score на тестовой выборки = {precision_score(y_test, model.predict(X_test))}')
    
    print(f'Roc_auc_score на обучающей выборки = {roc_auc_score(y_train, model.predict(X_train))}')
    print(f'Roc_auc_score на тестовой выборки = {roc_auc_score(y_test, model.predict(X_test))}')
    
    print(f'Confusion_matrix на обучающей выборки = \n{confusion_matrix(y_train, model.predict(X_train))}')
    print(f'Confusion_matrix на тестовой выборки = \n{confusion_matrix(y_test, model.predict(X_test))}')

In [4]:
dataset = pd.read_csv('remastered_dataset.csv')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(np.array(dataset.drop(['Survived'],axis=1)), np.array(dataset['Survived']), test_size=0.2, random_state=42)

### Logistic Regression

In [6]:
class My_LogitRegression(BaseEstimator, ClassifierMixin):
    def __init__( self, learning_rate=0.01, iterations=15000 ) :        
        self.learning_rate = learning_rate        
        self.iterations = iterations
          
    def fit( self, X, Y ) :        
        self.m, self.n = X.shape        
        self.W = np.zeros( self.n )        
        self.b = 0        
        self.X = X        
        self.Y = Y
          
                  
        for i in range( self.iterations ) :            
            self.update_weights()            
        return self
      
      
    def update_weights( self ) :           
        A = 1 / ( 1 + np.exp( - ( self.X.dot( self.W ) + self.b ) ) )
          
        tmp = ( A - self.Y.T )        
        tmp = np.reshape( tmp, self.m )        
        dW = np.dot( self.X.T, tmp ) / self.m         
        db = np.sum( tmp ) / self.m 
          
        self.W = self.W - self.learning_rate * dW    
        self.b = self.b - self.learning_rate * db
          
        return self
      
      
    def predict( self, X ) :    
        Z = 1 / ( 1 + np.exp( - ( X.dot( self.W ) + self.b ) ) )        
        Y = np.where( Z > 0.5, 1, 0 )        
        return Y
  


In [7]:
model = My_LogitRegression()
params = {'iterations':[100,500,1000,2000,5000,10000,15000],'learning_rate':[1,0.1,0.01,0.001,0.0001]}
name = 'custom_LogitRegression'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [8]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7556179775280899
Доля верных ответов на тестовой выборки = 0.776536312849162
Recall_score на обучающей выборки = 0.4664179104477612
Recall_score на тестовой выборки = 0.5540540540540541
Precision_score на обучающей выборки = 0.8012820512820513
Precision_score на тестовой выборки = 0.8541666666666666
Roc_auc_score на обучающей выборки = 0.6982990453139708
Roc_auc_score на тестовой выборки = 0.7436936936936936
Confusion_matrix на обучающей выборки = 
[[413  31]
 [143 125]]
Confusion_matrix на тестовой выборки = 
[[98  7]
 [33 41]]


In [9]:
model = LogisticRegression()
params = {}
name = 'sklearn_LogitRegression'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [10]:
model = LogisticRegression()
model.fit(X_train, y_train)
metrics(grid)

Доля верных ответов на обучающей выборки = 0.800561797752809
Доля верных ответов на тестовой выборки = 0.8044692737430168
Recall_score на обучающей выборки = 0.6902985074626866
Recall_score на тестовой выборки = 0.7432432432432432
Precision_score на обучающей выборки = 0.7581967213114754
Precision_score на тестовой выборки = 0.7746478873239436
Roc_auc_score на обучающей выборки = 0.7787078122899019
Roc_auc_score на тестовой выборки = 0.7954311454311453
Confusion_matrix на обучающей выборки = 
[[385  59]
 [ 83 185]]
Confusion_matrix на тестовой выборки = 
[[89 16]
 [19 55]]


### SVM

In [11]:
class SVM(BaseEstimator, ClassifierMixin):

    def __init__(self, lr=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = lr
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None


    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        y_ = np.where(y <= 0, -1, 1)
        
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * y_[idx]


    def predict_train(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)
    
    def predict(self,X):
        
        pred = self.predict_train(X)
        for i in range(len(pred)):
            
            if pred[i]>=0:
                pred[i]=1
            else:
                pred[i]=0
        return pred

In [12]:
model = SVM()
params = {'n_iters':[100,500,1000],'lr':[1,0.1,0.01,0.001,0.0001],'lambda_param':[0.001,0.01,0.1]}
name = 'custom_SVM()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [13]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7907303370786517
Доля верных ответов на тестовой выборки = 0.7877094972067039
Recall_score на обучающей выборки = 0.6305970149253731
Recall_score на тестовой выборки = 0.6756756756756757
Precision_score на обучающей выборки = 0.771689497716895
Precision_score на тестовой выборки = 0.78125
Roc_auc_score на обучающей выборки = 0.7589922011563803
Roc_auc_score на тестовой выборки = 0.7711711711711712
Confusion_matrix на обучающей выборки = 
[[394  50]
 [ 99 169]]
Confusion_matrix на тестовой выборки = 
[[91 14]
 [24 50]]


In [14]:
model = SVC()
params = {'C':[1,0.1,0.01]}
name = 'sklearn_SVM()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [15]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.6699438202247191
Доля верных ответов на тестовой выборки = 0.659217877094972
Recall_score на обучающей выборки = 0.25
Recall_score на тестовой выборки = 0.25675675675675674
Precision_score на обучающей выборки = 0.6633663366336634
Precision_score на тестовой выборки = 0.76
Roc_auc_score на обучающей выборки = 0.5867117117117118
Roc_auc_score на тестовой выборки = 0.5998069498069498
Confusion_matrix на обучающей выборки = 
[[410  34]
 [201  67]]
Confusion_matrix на тестовой выборки = 
[[99  6]
 [55 19]]


### KNN

In [16]:
class KNN(BaseEstimator, ClassifierMixin):
    def __init__(self,k=3):
        
        self.k = k
        
    def fit(self,x,y):
        
        self.x = x
        self.y = y
        
    def predict(self,x):
        labels = []
        for i in x:
            labels.append(self._get_one_pred(i))
            
        return labels   
    
    def _get_one_pred(self,one_point):
        lengths = {}
        
        for i in range(len(self.x)):
            lengths[self._get_length(self.x[i],one_point)] = self.y[i]
        
        zero = 0
        one = 0
        
        for i in sorted(list(lengths.keys()))[:self.k]:
            if lengths[i] == 0:
                
                zero = zero + 1
            
            else:
                
                one = one + 1
            
        if zero> one:
            return 0
            
        else:
                
            return 1
    
    def _get_length(self,first_point,second_point):
        
        return math.sqrt(sum([(i - k)**2 for i,k in zip(first_point, second_point)]))
            
        

In [17]:
model = KNN()
params = {'k':[2,3,4,5]}
name = 'custom_KNN()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [18]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7752808988764045
Доля верных ответов на тестовой выборки = 0.7094972067039106
Recall_score на обучающей выборки = 0.6305970149253731
Recall_score на тестовой выборки = 0.527027027027027
Precision_score на обучающей выборки = 0.7347826086956522
Precision_score на тестовой выборки = 0.6964285714285714
Roc_auc_score на обучающей выборки = 0.7466048137689929
Roc_auc_score на тестовой выборки = 0.6825611325611326
Confusion_matrix на обучающей выборки = 
[[383  61]
 [ 99 169]]
Confusion_matrix на тестовой выборки = 
[[88 17]
 [35 39]]


In [19]:
model = KNeighborsClassifier()
params = {'n_neighbors':[2,3,4,5]}
name = 'sklearn_KNN()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [20]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7851123595505618
Доля верных ответов на тестовой выборки = 0.7039106145251397
Recall_score на обучающей выборки = 0.6455223880597015
Recall_score на тестовой выборки = 0.5405405405405406
Precision_score на обучающей выборки = 0.7489177489177489
Precision_score на тестовой выборки = 0.6779661016949152
Roc_auc_score на обучающей выборки = 0.7574458787145354
Roc_auc_score на тестовой выборки = 0.6797940797940798
Confusion_matrix на обучающей выборки = 
[[386  58]
 [ 95 173]]
Confusion_matrix на тестовой выборки = 
[[86 19]
 [34 40]]


### Naive Bayes

In [21]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(['Survived'],axis=1), np.array(dataset['Survived']), test_size=0.2, random_state=42)

In [22]:
class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):
   
    def calc_prior(self, features, target):
        
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
       
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
      
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))

        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        for i in range(self.count):
            prior = np.log(self.prior[i])
            conditional = np.sum(np.log(self.gaussian_density(i, x)))
            posterior = prior + conditional
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

    def visualize(self, y_true, y_pred, target):
        
        tr = pd.DataFrame(data=y_true, columns=[target])
        pr = pd.DataFrame(data=y_pred, columns=[target])
        
        
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))
        
        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)
        

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

In [23]:
model = NaiveBayesClassifier()
params = {}
name = 'custom_NaiveBayesClassifier()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [24]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7120786516853933
Доля верных ответов на тестовой выборки = 0.7262569832402235
Recall_score на обучающей выборки = 0.3843283582089552
Recall_score на тестовой выборки = 0.47297297297297297
Precision_score на обучающей выборки = 0.7202797202797203
Precision_score на тестовой выборки = 0.7777777777777778
Roc_auc_score на обучающей выборки = 0.6471191340594326
Roc_auc_score на тестовой выборки = 0.6888674388674388
Confusion_matrix на обучающей выборки = 
[[404  40]
 [165 103]]
Confusion_matrix на тестовой выборки = 
[[95 10]
 [39 35]]


In [25]:
model = BernoulliNB()
params = {}
name = 'sklearn_NaiveBayesClassifier()'
grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
grid.fit(X_train,y_train)
with open(name+'_best_params.txt', 'a') as f:
    f.write(str(grid.best_estimator_))
    
pkl_filename = name+'_best_model.pkl'
    
with open(pkl_filename, 'wb') as file: 
    pickle.dump(grid.best_estimator_, file)

In [26]:
metrics(grid)

Доля верных ответов на обучающей выборки = 0.7907303370786517
Доля верных ответов на тестовой выборки = 0.7877094972067039
Recall_score на обучающей выборки = 0.7238805970149254
Recall_score на тестовой выборки = 0.7432432432432432
Precision_score на обучающей выборки = 0.7211895910780669
Precision_score на тестовой выборки = 0.7432432432432432
Roc_auc_score на обучающей выборки = 0.7774808390480032
Roc_auc_score на тестовой выборки = 0.7811454311454311
Confusion_matrix на обучающей выборки = 
[[369  75]
 [ 74 194]]
Confusion_matrix на тестовой выборки = 
[[86 19]
 [19 55]]
