In [1]:
import warnings
warnings.filterwarnings("ignore")

### Импортим библиотеки

In [2]:
from pandas import read_csv , DataFrame 
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score , recall_score , f1_score
from sklearn.naive_bayes import BernoulliNB
import numpy as np
from sklearn.pipeline import Pipeline
import scipy.spatial
from math import pi,exp
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
import pickle 
from sklearn.naive_bayes import GaussianNB

### Загружаем датасет и делим на тест и трейн

In [3]:
dataset = pd.read_csv('dataset.csv',index_col='Unnamed: 0')
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(['price_range'],axis=1), 
                                                    dataset['price_range'], test_size=0.1, random_state=42,shuffle=True)



### Функия для метрик и обработки моделей 


In [4]:
def metrics(pred,true,name):
    print('\n')
    print(f'metrics for {name}\n\n\n')
    print('confusion_matrix = \n',confusion_matrix(pred,true),'\n\n\n')
    print('accuracy_score = ',accuracy_score(pred,true))
    print('recall_score = ',recall_score(pred,true))
    print('precision_score=', precision_score(pred,true))

    
def work_with_model(model,params,name):
    if str(model)=="<class 'sklearn.svm._classes.SVC'>":
        
        model = Pipeline([('clf',model(probability=True))])

    else:
        
        model = Pipeline([('clf',model())])
    
    grid = GridSearchCV(estimator=model,param_grid =params,cv=3 )  
    grid.fit(X_train,y_train)
    with open(name+'_best_params.txt', 'a') as f:
        f.write(str(grid.best_estimator_))
    
    pkl_filename = name+'_best_model.pkl'
    
    with open(pkl_filename, 'wb') as file: 
        pickle.dump(grid.best_estimator_, file)
        
    
    metrics(grid.predict(X_train),y_train,name + ' for train data ')    
    
    metrics(grid.predict(X_test),y_test,name + ' for test data ')    
    


### Логистическая регрессия

In [5]:
class My_LogisticRegression(BaseEstimator, ClassifierMixin):
    def __init__(self,lr = 0.01 , iterations = 500):      
        self.lr = lr
        self.iterations = iterations
         
    
    def sigmoid(self, x, weight):
        z = np.dot(x, weight)
        return 1 / (1 + np.exp(-z))
     
    
    def loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
     
    
    def gradient_descent(self, X, h, y):
        return np.dot(X.T, (h - y)) / y.shape[0]
 
     
    def fit(self, x,y):
        self.intercept = np.ones((x.shape[0], 1))  
        self.x = np.concatenate((self.intercept, x), axis=1)
        self.weight = np.zeros(self.x.shape[1])
        self.y = y
        
        for i in range(self.iterations):
            sigma = self.sigmoid(self.x, self.weight)
             
            loss = self.loss(sigma,self.y)
 
            dW = self.gradient_descent(self.x , sigma, self.y)
             
            #Updating the weights
            self.weight -= self.lr * dW
 
        
     
    
    def predict(self, x_new , treshold=0.5):
        x_new = np.concatenate((self.intercept[:len(x_new)], x_new), axis=1)
        result = self.sigmoid(x_new, self.weight)
        result = result >= treshold
        y_pred = np.zeros(result.shape[0])
        
        for i in range(len(y_pred)):
            if result[i] == True: 
                y_pred[i] = 1
            else:
                continue
                 
        return y_pred

In [6]:
parameters  = {'clf__iterations':[100,500,1000,2000],'clf__lr':[1,0.1,0.01,0.001,0.0001]}

work_with_model(My_LogisticRegression,parameters,'custom_logreg')



metrics for custom_logreg for train data 



confusion_matrix = 
 [[852 331]
 [ 40 577]] 



accuracy_score =  0.7938888888888889
recall_score =  0.9351701782820098
precision_score= 0.6354625550660793


metrics for custom_logreg for test data 



confusion_matrix = 
 [[104  32]
 [  4  60]] 



accuracy_score =  0.82
recall_score =  0.9375
precision_score= 0.6521739130434783


In [7]:
parameters  = {'clf__max_iter':[100,500,1000,2000]}

work_with_model(LogisticRegression,parameters,'sklearn_logreg')



metrics for sklearn_logreg for train data 



confusion_matrix = 
 [[817  55]
 [ 75 853]] 



accuracy_score =  0.9277777777777778
recall_score =  0.9191810344827587
precision_score= 0.9394273127753304


metrics for sklearn_logreg for test data 



confusion_matrix = 
 [[100   8]
 [  8  84]] 



accuracy_score =  0.92
recall_score =  0.9130434782608695
precision_score= 0.9130434782608695


# SVM

In [8]:
class MYSVM(BaseEstimator, ClassifierMixin):
   
    def __init__(self, etha=0.1, alpha=0.2, epochs=990):
        self.epochs = epochs
        self.etha = etha
        self.alpha = alpha
        self.w = None
        
        
    def fit(self, X_train, Y_train):
        
        
        for i in range(len(Y_train)):
            if Y_train.iloc[i] == 0:
                Y_train.iloc[i] = -1
        
        X_train = self._add_bias_feature(X_train)
        self.w = np.random.normal(loc=0, scale=0.05, size=X_train.shape[1])#задаем первые веса
        
        
        
        for epoch in range(self.epochs): 
            
            for i,x in enumerate(X_train):
                margin = Y_train.iloc[i]*np.dot(self.w,X_train[i])
                if margin >= 1: 
                    self.w = self.w - self.etha*self.alpha*self.w/self.epochs
                    
                else: 
                    self.w = self.w +\
                    self.etha*(Y_train.iloc[i]*X_train[i] - self.alpha*self.w/self.epochs)
                    
                
        for i in range(len(Y_train)):
            if Y_train.iloc[i]==-1:
                Y_train.iloc[i]=0
     
    
    def _add_bias_feature(self,a):
        
        a_extended = np.zeros((a.shape[0],a.shape[1]+1))
        a_extended[:,:-1] = a
        a_extended[:,-1] = int(1)  
        return a_extended
    
    
    def predict(self, X):
        
        y_pred = []
        X = X.to_numpy()
        
        for i in range(len(X)):
            y_pred.append(np.sign(1+np.dot(self.w[1:],X[i])))
        for i in range(len(y_pred)):
            if y_pred[i]==-1:
                y_pred[i]=0

        return y_pred         
    

In [9]:
parameters  = {'clf__etha':[0.1,0.2,0.3],
               'clf__alpha':[0.1,0.2,0.3], 
               'clf__epochs':[100,500,1000]}

work_with_model(MYSVM,parameters,'custom_svm')



metrics for custom_svm for train data 



confusion_matrix = 
 [[892 908]
 [  0   0]] 



accuracy_score =  0.4955555555555556
recall_score =  0.0
precision_score= 0.0


metrics for custom_svm for test data 



confusion_matrix = 
 [[108  92]
 [  0   0]] 



accuracy_score =  0.54
recall_score =  0.0
precision_score= 0.0


In [10]:
parameters  = {'clf__C':[0.1,0.2,0.3],
               'clf__gamma':[0.1,0.2,0.3], 
               'clf__max_iter':[100,500,1000]}

work_with_model(svm.SVC,parameters,'sklearn_svm')



metrics for sklearn_svm for train data 



confusion_matrix = 
 [[504   0]
 [388 908]] 



accuracy_score =  0.7844444444444445
recall_score =  0.7006172839506173
precision_score= 1.0


metrics for sklearn_svm for test data 



confusion_matrix = 
 [[  1   0]
 [107  92]] 



accuracy_score =  0.465
recall_score =  0.4623115577889447
precision_score= 1.0


### Алгоритм k ближайших соседей 

In [11]:
class K_Nearest_Neighbors_Classifier(BaseEstimator, ClassifierMixin) : 
      
    def __init__( self, K =3 ) :
          
        self.K = K
          
          
    def fit( self, X_train, Y_train ) :
          
        self.X_train = X_train.to_numpy()
          
        self.Y_train = np.array(Y_train)
          
          
        self.m, self.n = X_train.shape
      
          
    def predict( self, X_test ) :
          
        self.X_test = X_test.to_numpy()
          
          
        self.m_test, self.n = X_test.shape
          
          
        Y_predict = np.zeros( self.m_test )
          
        for i in range( self.m_test ) :
              
            x = self.X_test[i]
              
              
            neighbors = np.zeros( self.K )
              
            neighbors = self.find_neighbors( x )
              
              
            Y_predict[i] = mode( neighbors )[0][0]    
              
        return Y_predict
      
            
    def find_neighbors( self, x ) :
          
   
          
        euclidean_distances = np.zeros( self.m )
          
        for i in range( self.m ) :
              
            d = self.euclidean( x, self.X_train[i] )
              
            euclidean_distances[i] = d
          
          
        inds = euclidean_distances.argsort()
          
        Y_train_sorted = self.Y_train[inds]
          
        return Y_train_sorted[:self.K]
      
              
    def euclidean( self, x, x_train ) :
          
        return np.sqrt( np.sum( np.square( x - x_train ) ) )
  

In [12]:
parameters  = {'clf__K':[2,3,4] 
               }

work_with_model(K_Nearest_Neighbors_Classifier,parameters,'custom_KNN')



metrics for custom_KNN for train data 



confusion_matrix = 
 [[887  32]
 [  5 876]] 



accuracy_score =  0.9794444444444445
recall_score =  0.9943246311010215
precision_score= 0.9647577092511013


metrics for custom_KNN for test data 



confusion_matrix = 
 [[108   1]
 [  0  91]] 



accuracy_score =  0.995
recall_score =  1.0
precision_score= 0.9891304347826086


In [13]:
parameters  = {'clf__n_neighbors':[2,3,4]
               }

work_with_model(KNeighborsClassifier,parameters,'sklearn_KNN')



metrics for sklearn_KNN for train data 



confusion_matrix = 
 [[887  32]
 [  5 876]] 



accuracy_score =  0.9794444444444445
recall_score =  0.9943246311010215
precision_score= 0.9647577092511013


metrics for sklearn_KNN for test data 



confusion_matrix = 
 [[108   1]
 [  0  91]] 



accuracy_score =  0.995
recall_score =  1.0
precision_score= 0.9891304347826086


### Наивный байесовский классификатор

In [14]:
class NBayes(BaseEstimator, ClassifierMixin):
    
    @staticmethod
    def __split_X__(X, y):
        class_rows = {}
        for row, cls in zip(X, y):
            if cls not in class_rows:
                class_rows[cls] = list()
            class_rows[cls].append(row)
        return class_rows


    @staticmethod
    def __prob_chars__(X):
        means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)
        return list(zip(means, stds))

    @staticmethod
    def __Gauss_prob__(x, mean, std):
        return (1 / np.sqrt(2 * np.pi * np.square(std))) * np.exp(-np.square(x - mean) / (2 * np.square(std)))

    def fit(self, X, y):
        y = np.array(y)
        X = X.to_numpy()
        
        class_rows = NBayes.__split_X__(X, y)
        self.cls_labels = np.unique(y)
        self.chars_by_class = {cls: NBayes.__prob_chars__(rows) for cls, rows in class_rows.items()}
        self.prob_by_class = {cls: len(rows) / len(X) for cls, rows in class_rows.items()}

    def predict(self, X):
        X = X.to_numpy()
        y = []
        for row_to_predict in X:
            result_probs = {}
            for cls in self.cls_labels:
                result_probs[cls] = self.prob_by_class[cls] \
                                    * np.prod([NBayes.__Gauss_prob__(row_to_predict[i], mean, std) for i, (mean, std) in enumerate(self.chars_by_class[cls])])
            y.append(max(result_probs.items(), key=lambda kv: kv[1])[0])
        return y


    def fit_predict(self, X_train, y_train, X_test):
        self.fit(X_train, y_train)
        return self.predict(X_test)

In [15]:
parameters  = {}

work_with_model(NBayes,parameters,'custom_NaivBaisClassificator')



metrics for custom_NaivBaisClassificator for train data 



confusion_matrix = 
 [[830  58]
 [ 62 850]] 



accuracy_score =  0.9333333333333333
recall_score =  0.9320175438596491
precision_score= 0.9361233480176211


metrics for custom_NaivBaisClassificator for test data 



confusion_matrix = 
 [[103   1]
 [  5  91]] 



accuracy_score =  0.97
recall_score =  0.9479166666666666
precision_score= 0.9891304347826086


In [16]:
parameters  = {}
work_with_model(GaussianNB,parameters,'sklearn_NaivBaisClassificator')



metrics for sklearn_NaivBaisClassificator for train data 



confusion_matrix = 
 [[831  58]
 [ 61 850]] 



accuracy_score =  0.9338888888888889
recall_score =  0.9330406147091108
precision_score= 0.9361233480176211


metrics for sklearn_NaivBaisClassificator for test data 



confusion_matrix = 
 [[103   1]
 [  5  91]] 



accuracy_score =  0.97
recall_score =  0.9479166666666666
precision_score= 0.9891304347826086
