# This notebook is for Kaggle competition: https://www.kaggle.com/c/choose-tutors

## Choose tutors
### Choose proper tutors for math exam
### Description
**In this competition your task will be to predict the probability for a tutor to be a proper one for preparing for the math exam. You will be given two datasets: train.csv (contains all features and the target) and test.csv (only features).**
### Evaluation
The evaluation metric is ROC AUC.
### Rules
You can only use these imports:

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import seaborn as sns

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Зададим random_state

In [2]:
random_state = 42

# Machine Learning algorithms classes

## Base class definition for classification models

In [3]:
class ClassificationModel:
    
    
    def __init__(self, model):
        self.model = model
        
        self.means = []
        self.stds = []
    
    
    def fit(self):
        pass
    
    
    def predict(self):
        pass
    
    
    def predict_proba(self):
        pass
    
    
    @staticmethod
    def confusion(predicted, actual):
        matrix = np.zeros((2, 2))
        matrix[0, 0] = np.sum(np.logical_and(predicted==1, actual==1))  # TP
        matrix[0, 1] = np.sum(np.logical_and(predicted==1, actual==0))  # FP
        matrix[1, 0] = np.sum(np.logical_and(predicted==0, actual==1))  # FN
        matrix[1, 1] = np.sum(np.logical_and(predicted==0, actual==0))  # TN
        return matrix
    
    
    @staticmethod
    def calc_TPR_FPR(matrix):
        TPR = matrix[0, 0]/(matrix[0, 0]+matrix[1, 0])
        FPR = matrix[0, 1]/(matrix[0, 1]+matrix[1, 1])
        return TPR, FPR
    
    
    def calc_ROC(self, X, Y, prb):
        length = len(prb)
        TPR_a = []
        FPR_a = []
        for i in range(length):
            y_predicted = self.model.predict(X, thr=prb[i])
            matrix = self.confusion(y_predicted, Y)
            TPR, FPR = self.calc_TPR_FPR(matrix)
            TPR_a.append(TPR)
            FPR_a.append(FPR)
        return TPR_a, FPR_a
    
    
    def cross_validation(self, X, y):

        test_sample_size = 0.2
        test_len = int(X.shape[0] * test_sample_size)

        # AUC for each test sample
        auc_arr = np.zeros(int(1.0/test_sample_size))
        # probability thresholds for ROC
        prb = np.linspace(1.0, 0.0, num=100)
    

        for i in range(int(1.0/test_sample_size)):

            # calc start index for test 
            start = i * test_len

            # get test data
            X_test = X[start: (start + test_len - 1), :]
            # get test labels
            y_test = y[start: (start + test_len - 1)]

            # get train data part before test sample 
            X_Train_1 = X[:(start - 1), :]
            # get train data part after test sample 
            X_Train_2 = X[(start + test_len):, :]
            # merge them
            X_Train = np.concatenate([X_Train_1, X_Train_2], axis=0)
            # get train labels part before test sample
            y_Train_1 = y[:(start - 1)]
            # get train labels part after test sample 
            y_Train_2 = y[(start + test_len):]
            # merge them
            y_Train = np.concatenate([y_Train_1, y_Train_2], axis=0)

            # train the model
            self.model.fit(X_Train, y_Train)

            # make predictions
            y_pred = self.model.predict(X_test)

            # calc ROC
            TPRs, FPRs = self.calc_ROC(X_test, y_test, prb)
        
            # calc AUC 
            auc_arr[i] = np.trapz(TPRs, x=FPRs)

        return auc_arr
    
    
    def standartization_fit(self, X):

        # save means
        self.means = X.mean(0)
        # save stds
        self.stds = np.std(X, axis=0)
    
    
    def standartization_transform(self, X):
    
        # sample scaling
        X_ = X.astype(float)

        rows, cols = X_.shape

        # centering - subtracting the average of a column from each value
        for i in range(rows):
            for j in range(cols):
                X_[i, j] -= self.means[j]

        # dividing each value by the standard deviation
        for i in range(rows):
            for j in range(cols):
                X_[i][j] /= self.stds[j]
            
        return X_

## Desicion tree definition

### Node class definition

In [4]:
class Node:
    
    
    def __init__(self, index, thr, true_branch, false_branch):
        
        self.index = index  # the index of the feature used to compare with the threshold in this node
        self.thr = thr  # treshold value
        self.true_branch = true_branch  # subtree satisfying the condition at the node
        self.false_branch = false_branch  # subtree not satisfying the condition at the node

### Leaf class definition

In [5]:
class Leaf:
    
    
    def __init__(self, answers):
        
        self.proba = self.predict_proba(answers)
        self.prediction = self.predict()
        
    
    def predict_proba(self, labels): 
        
        # return the positive class probability (number of positive elements in the leaf)  
        return np.sum(labels==1)/(labels.shape[0])
        
        
    def predict(self, thr=0.5):
        
        # compare with threshold
        if self.proba >= thr:
            return 1
        else:
            return 0
    

### Decision tree class definition

In [6]:
class DecisionTree(ClassificationModel):
    
    
    def __init__(self, max_depth=6, num_of_min_in_leaf=10, criterion='gini'):
        
        self.max_depth = max_depth
        self.min_leaf = num_of_min_in_leaf
        self.criterion = criterion
        
        self.cur_depth = 0
        self.root = None
        super().__init__(self)

        
    def fit(self, data, answers):
        
        self.root = self.build_tree(data, answers)
        
        
    def build_tree(self, data, answers):
        
        # Building a tree using a recursive function
        self.cur_depth += 1  # increment current depth
        quality, thr, index = self.find_best_split(data, answers)

        #  Base case - stop recursion when there is no gain in quality
        if (quality == 0) or (self.cur_depth >= self.max_depth):
            # create leaf
            leaf = Leaf(answers)
            # decrease current depth
            self.cur_depth -= 1
            return leaf

        true_data, false_data, true_answers, false_answers = self.split(data, answers, index, thr)

        # Build two subtrees recursively
        true_branch = self.build_tree(true_data, true_answers)
        false_branch = self.build_tree(false_data, false_answers)

        # decrease current depth before leave the function
        self.cur_depth -= 1
        
        # return the node with all subtrees
        return Node(index, thr, true_branch, false_branch)
    
    
    def find_best_split(self, data, answers):
        
        # finding the best splitting
        
        quality_criterion = self.get_estimation(answers)

        best_quality = 0
        best_thr = None
        best_index = None
    
       
        n_features = data.shape[1]   
    
        for index in range(n_features):
            
            # check only unique values of the feature, excluding repetitions
            thr_values = np.unique([row[index] for row in data])
        
            for thr in thr_values:
                
                true_data, false_data, true_answers, false_answers = self.split(data, answers, index, thr)
                
                #  skip partitions in which there are few objects left in the node
                if (len(true_data) < self.min_leaf) or (len(false_data) < self.min_leaf):
                    continue
            
                current_quality = self.quality(true_answers, false_answers, quality_criterion)
            
                #  choose the threshold at which the maximum quality gain is obtained
                if current_quality > best_quality:
                    best_quality, best_thr, best_index = current_quality, thr, index

        return best_quality, best_thr, best_index
    
    
    def get_estimation(self, labels):
        
        #  counting the number of objects of different classes
        classes = {}
        for label in labels:
            if label not in classes:
                classes[label] = 0
            classes[label] += 1
    
        #  calculation of criterion
        impurity = 1
        
        if self.criterion=='gini':
            for label in classes:
                p = classes[label] / len(labels)
                impurity -= p ** 2
        elif self.criterion=="Shennon":
            impurity = 0
            for label in classes:
                p = classes[label] / len(labels)
                impurity -= p*np.log2(p)
        
        return impurity
    
    
    def split(self, data, answers, index, thr):
    
        left = np.where(data[:, index] <= thr)
        right = np.where(data[:, index] > thr)
        
        true_data = data[left]
        false_data = data[right]
        true_answers = answers[left]
        false_answers = answers[right]
        
        return true_data, false_data, true_answers, false_answers    
    

    def quality(self, left_answers, right_answers, quality_criterion):
        
        # Quality calculation
        
        # fraction of the sample in the left subtree
        p = float(left_answers.shape[0]) / (left_answers.shape[0] + right_answers.shape[0])
    
        return quality_criterion - p * self.get_estimation(left_answers) - (1 - p) * self.get_estimation(right_answers)
    
    
    def predict(self, data, thr=0.5):
        
        length = data.shape[0]
        y_predicted = np.zeros(length)
        i = 0
        for obj in data:
            prediction = self.classify_object(obj, self.root, thr)
            y_predicted[i] = prediction
            i += 1
            
        return y_predicted
    
    def predict_proba(self, data):
        
        answers = []
        for obj in data:
            prediction = self.proba_object(obj, self.root)
            answers.append(prediction)
            
        return answers
    
    def classify_object(self, obj, node, thr=0.5):
        
        #  Stop recursion if leaf is reached
        if isinstance(node, Leaf):
            answer = node.predict(thr)
            return answer

        if obj[node.index] <= node.thr:
            return self.classify_object(obj, node.true_branch, thr)
        else:
            return self.classify_object(obj, node.false_branch, thr)
    
    
    def proba_object(self, obj, node):

        #  ОStop recursion if leaf is reached
        if isinstance(node, Leaf):
            answer = node.proba
            return answer

        if obj[node.index] <= node.thr:
            return self.proba_object(obj, node.true_branch)
        else:
            return self.proba_object(obj, node.false_branch)
        
        
    @staticmethod
    def accuracy_metric(actual, predicted):
        
        # the function of calculating the accuracy as the proportion of correct answers
        return np.sum(actual==predicted)/len(actual)


## Logistic regression class definition

In [7]:
class LogisticRegression(ClassificationModel):
    
    
    def __init__(self, mad=1e-5, alpha=0.02, reg_lasso=0, reg_redge=0, num_of_iter=1e5, debug_print=0, debug_epoch=1000):
        
        self.mad = mad  # min available dist
        self.alpha = alpha
        self.reg_lasso = reg_lasso
        self.reg_redge = reg_redge
        self.num_of_iter = num_of_iter
        self.debug_print = debug_print
        self.debug_epoch = debug_epoch
        
        self.w = np.array([0])
        self.losses = []
        super().__init__(self)
        
    
    @staticmethod
    def sigmoid(X):
    
        return ( 1.0 / ( 1.0 + np.exp(-X)))
    
    
    def _logloss(self, X, Y, sample_weights):
    
        logloss = np.log(1 + np.exp(-(X @ self.w) * Y))
        return  logloss @ sample_weights
        
    
    
    def _grad(self, X, Y, sample_weights):
        
        drv = -Y / (1 + np.exp(X @ self.w * Y))
        grad = X.T @ (drv * sample_weights) + self.reg_lasso*np.sign(self.w) + 2*self.reg_redge*self.w 
        return grad
    
    
    def fit(self, X, Y, w_elements=None):

        # used regression without intercept
        
        # init weights
        self.w = np.zeros(X.shape[1])
        # set current iteration
        cur_iter = 0
        # init 
        loss_improvement = np.inf
        # set number of iteration without loss improvement
        noiwli = 0
        loss = 0
        
        # set sample weights if None
        if w_elements is None:
            w_elements = np.ones(X.shape[0]) / X.shape[0]
            
        while (cur_iter < self.num_of_iter) and (noiwli < 10):
            # save old weights
            old_w = self.w.copy()
            # save previous loss
            loss_old = loss
            # calc loss 
            loss = self._logloss(X, Y, w_elements)
            # calc grad
            grad = self._grad(X, Y, w_elements)
            # update weights
            self.w -= self.alpha*grad
            
            # check loss improvements
            loss_improvement = np.abs(np.sum(loss) - np.sum(loss_old))
            if loss_improvement > self.mad:
                noiwli = 0
            else:
                noiwli += 1 
                
            # update current iteration
            cur_iter += 1
            # print debug information
            if self.debug_print:
                self.losses.append(np.sum(loss))
                if iter%self.debug_epoch==0:
                    print(np.sum(loss))
            
        return
    
    
    def predict(self, X, thr=0.5):
        
        # the probability of classifying an object as a positive class
        P = self.sigmoid(X @ self.w)  
        return (P >= thr)
    
    
    def predict_proba(self, X):
    
        # the probability of classifying an object as a positive class
        return self.sigmoid(X @ self.w) 
    

## Random Forest class definition

In [8]:
class RandomForest(ClassificationModel):
    
    
    def __init__(self, num_of_trees=3, max_depth=6, num_of_min_in_leaf=10, criterion='gini'):
        
        self.num_of_trees = num_of_trees;
        self.max_depth = max_depth
        self.num_of_min_in_leaf = num_of_min_in_leaf
        self.criterion = criterion
        
        self.forest = []
        self.OOBS_list = []
        self.OOBS = 0
        
        super().__init__(self)
        
        
    def fit(self, data, labels):
        
        self.forest, self.OOBS_list = self.build_forest(data, labels)
        self.OOBS = np.sum(self.OOBS_list)/len(self.OOBS_list)
        
        
    def build_forest(self, data, labels):

        forest = []
        OOBS_list = []
        # get bootstrap selection and indices of elements not included in it
        bootstrap, bootstrap_OOB_indexes = self.get_bootstrap(data, labels)
        
        cnt = 0
        for b_data, b_labels in bootstrap:
            
            dt = DecisionTree(self.max_depth, self.num_of_min_in_leaf, self.criterion)
            dt.fit(b_data, b_labels)
            forest.append(dt)
            
            # Let's make an OOB estimate for the last tree for elements not included in the bootstrap selection
            OOBS_list.append(dt.accuracy_metric(labels[bootstrap_OOB_indexes[cnt]], dt.predict(data[bootstrap_OOB_indexes[cnt]])))
            cnt += 1
                          
        return forest, np.array(OOBS_list)
                          
                          
    def get_bootstrap(self, data, labels):
        
        np.random.seed(random_state)
        n_samples = data.shape[0]
        bootstrap = []
        # indices of elements not included in the selection
        bootstrap_OOB_indexes =[]
        
        for i in range(self.num_of_trees):
            
            b_data = np.zeros(data.shape)
            b_labels = np.zeros(labels.shape)
            # generate a list of all indices
            OOB_indexes = [i for i in range(n_samples)]
            
            for j in range(n_samples):
                
                sample_index = np.random.randint(0, n_samples-1)
                b_data[j] = data[sample_index]
                b_labels[j] = labels[sample_index]   
                
                # remove the index of the element in the selection
                if sample_index in OOB_indexes:
                    OOB_indexes.remove(sample_index)
                    
            bootstrap.append((b_data, b_labels))
            bootstrap_OOB_indexes.append(OOB_indexes)
                          
        return bootstrap, bootstrap_OOB_indexes
                          
                          
    def predict_proba(self, data):
        
        predictions = np.zeros((self.num_of_trees, data.shape[0]))
        i = 0
        for tree in self.forest:
            predictions[i] = tree.predict_proba(data)
            i += 1
                   
        predictions_proba = np.zeros(data.shape[0])
        i = 0
        for obj in data:
            summ = 0
            for j in range(self.num_of_trees):
                summ += predictions[j, i] *  self.OOBS_list[j] / sum(self.OOBS_list)
            predictions_proba[i] = summ
            i += 1
            
        return predictions_proba
    
    
    def predict(self, data, thr=0.5):
        
        probs = self.predict_proba(data) 
        return (probs >= thr)

## AdaBoost class definition

In [9]:
class AdaBoost(ClassificationModel):
    
    
    def __init__(self, nom, model_name, model_construct_kwargs):
        
        self.num_of_models = nom
        self.model_name = model_name
        self.construct_kwargs = model_construct_kwargs
        
        self.n_classes = 0
        self.uniq_classes = []
        self.models_list = []
        self.model_weights = []
        
        super().__init__(self)
        
        
    def fit(self, data, classes):
        
        self.models_list, self.model_weights = self.boost(data, classes)
        
        
    def boost(self, data, classes):
        
        # selection length
        n_objects = len(data)

        # write number of classes
        self.uniq_classes = np.unique((classes))
        self.n_classes = len(self.uniq_classes)

        # weights init
        w = np.ones(n_objects) / n_objects

        models = []
        weights = []
        for n in range(self.num_of_models):
            
            # declare and fit the model
            clf = self.model_name(**self.construct_kwargs)
            clf.fit(data, classes, w_elements=w)

            # make predictions
            predictions = clf.predict(data)
            
            # get error
            e = self.get_error(predictions, classes)
            
            # leave the model if error is too large 
            if e >= 1 - 1/self.n_classes: 
                continue
            
            # calc weight
            alpha = 0.5 * np.log((1 - e) / e)

            # Find the indices of correctly classified elements
            match = (np.array(predictions) == np.array(classes))

            # Increase weights for misclassified items
            w[~match] *= np.exp(alpha)

            # normalize weights
            w /= w.sum()

            # add tree
            models.append(clf)
            weights.append(alpha)
    
        return models, weights
    
    
    def predict(self, data, thr=0.5):
    
        n_objects = len(data)
    
        y_pred = np.zeros((n_objects, self.n_classes))
    
        cnt = 0
        for clf in self.models_list:
            
            prediction = clf.predict(data, thr)
            y_pred[range(n_objects), prediction.astype(int)] += self.model_weights[cnt]
            cnt += 1
    
        y_pred = np.argmax(y_pred, axis=1)
    
        return y_pred
    
    def predict_proba(self, data):
    
        n_objects = len(data)
    
        y_pred = np.zeros(n_objects)
        
        alpha_sum = sum(self.model_weights)
        
        cnt = 0
        for clf in self.models_list:
            
            prediction = clf.predict_proba(data)
            y_pred += (self.model_weights[cnt]/alpha_sum)*prediction
            cnt += 1
    
        return y_pred

    
    @staticmethod
    def get_error(pred, y):
        return sum(pred != y) / len(y)
    

# Loading and preparing data

Load train data

In [10]:
train_df = pd.read_csv('train.csv', sep=',')

train_df.head(10)  

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0
5,5,37.0,3.0,1050.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0
6,6,54.0,3.0,800.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,52.0,1
7,7,32.0,2.0,2750.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,95.0,0
8,8,56.0,3.0,1300.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0,0
9,9,44.0,4.0,2350.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,0


In [11]:
train_df.shape

(10000, 13)

In [12]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   10000 non-null  int64  
 1   age                  10000 non-null  float64
 2   years_of_experience  10000 non-null  float64
 3   lesson_price         10000 non-null  float64
 4   qualification        10000 non-null  float64
 5   physics              10000 non-null  float64
 6   chemistry            10000 non-null  float64
 7   biology              10000 non-null  float64
 8   english              10000 non-null  float64
 9   geography            10000 non-null  float64
 10  history              10000 non-null  float64
 11  mean_exam_points     10000 non-null  float64
 12  choose               10000 non-null  int64  
dtypes: float64(11), int64(2)
memory usage: 1015.8 KB


In [13]:
train_df.isnull().sum()

Id                     0
age                    0
years_of_experience    0
lesson_price           0
qualification          0
physics                0
chemistry              0
biology                0
english                0
geography              0
history                0
mean_exam_points       0
choose                 0
dtype: int64

In [14]:
train_df.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,45.8009,1.9748,1702.44,1.7243,0.3706,0.1215,0.1172,0.0591,0.0277,0.018,64.4352,0.1109
std,2886.89568,8.030274,1.766883,523.789062,0.798845,0.48299,0.326724,0.321675,0.235824,0.16412,0.132958,13.595024,0.314024
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0
25%,2499.75,40.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0
50%,4999.5,46.0,2.0,1550.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0
75%,7499.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0
max,9999.0,68.0,9.0,3950.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,1.0


Load test data

In [15]:
test_df = pd.read_csv('test.csv', sep=',')

test_df.head(10)

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,10000,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,10001,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,10002,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,10003,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,10004,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0
5,10005,54.0,4.0,2050.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,94.0
6,10006,40.0,0.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
7,10007,29.0,0.0,1150.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,60.0
8,10008,44.0,6.0,2900.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0
9,10009,45.0,5.0,1300.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,69.0


Remove "Id" column

In [16]:
# save test "Id"
test_ids = test_df['Id'].copy()

# remove "Id"
train_df.drop('Id', axis=1, inplace=True)
test_df.drop('Id', axis=1, inplace=True)

Shuffle data

In [17]:
train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
train_df

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,59.0,0.0,2200.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,85.0,0
1,46.0,0.0,1250.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,53.0,0
2,53.0,1.0,1250.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0
3,40.0,0.0,1200.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,49.0,0
4,41.0,2.0,1300.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,48.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,53.0,0.0,2200.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,83.0,1
9996,45.0,1.0,2200.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,0
9997,49.0,2.0,1550.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,63.0,0
9998,43.0,1.0,1350.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,52.0,0


Sptit train dataset into data and answers

In [18]:
X = train_df[train_df.columns[:-1]].copy()
y = train_df['choose'].copy()

In [19]:
X = X.to_numpy()
y = y.to_numpy()
X_test_arr = test_df.to_numpy()

# Models evaluation

### Decision tree

In [20]:
%%time
# create model
tree = DecisionTree(max_depth=10, num_of_min_in_leaf=50, criterion='gini')
# cross-validation
auc_arr = tree.cross_validation(X, y)
print(f"Mean AUC_ROC on cross validation for decision tree model: {auc_arr.mean()}")

Mean AUC_ROC on cross validation for decision tree model: 0.8119048130596322
Wall time: 16.3 s


In [21]:
# train on whole dataset
tree.fit(X, y)

In [22]:
# predict probabilities
tree_probs = tree.predict_proba(X_test_arr)

In [23]:
ss_df_tree = pd.DataFrame({
    "Id": test_ids,
    "choose": tree_probs,
})

ss_df_tree.head(10)

Unnamed: 0,Id,choose
0,10000,0.053191
1,10001,0.078947
2,10002,0.0
3,10003,0.019608
4,10004,0.156134
5,10005,0.607843
6,10006,0.036757
7,10007,0.156134
8,10008,0.0
9,10009,0.406977


In [24]:
ss_df_tree.to_csv("sample_submission_ermnik_DT.csv", sep=",", index=False)

### Logistic regression

In [25]:
# create model
lr = LogisticRegression(mad=1e-5, reg_lasso=0.001, reg_redge=0.00001, alpha=0.875, num_of_iter=1e6)

In [26]:
# standartization
lr.standartization_fit(X)
X_std = lr.standartization_transform(X)
X_test_std = lr.standartization_transform(X_test_arr)

In [27]:
# cross-validation
auc_arr = lr.cross_validation(X_std, y)
print(f"Mean AUC_ROC on cross validation for logistic regression model: {auc_arr.mean()}")

Mean AUC_ROC on cross validation for logistic regression model: 0.8538668584458671


In [28]:
# train on whole dataset
lr.fit(X_std, y)

In [29]:
# predict probabilities
lr_probs = lr.predict_proba(X_test_std)

In [30]:
ss_df_lr = pd.DataFrame({
    "Id": test_ids,
    "choose": lr_probs,
})

ss_df_lr.head(10)

Unnamed: 0,Id,choose
0,10000,0.241061
1,10001,0.855475
2,10002,0.197804
3,10003,0.22697
4,10004,0.848357
5,10005,0.983281
6,10006,0.396951
7,10007,0.900854
8,10008,0.001056
9,10009,0.893607


In [42]:
ss_df_lr.to_csv("sample_submission_ermnik_LR.csv", sep=",", index=False)

### Random forest

In [32]:
%%time
rf = RandomForest(num_of_trees=51, max_depth=10, num_of_min_in_leaf=50, criterion='gini')
rf.fit(X, y)
print(f"Mean out of bag score: {rf.OOBS}")

Mean out of bag score: 0.8864860372141579
Wall time: 3min 2s


In [33]:
rf_probs = rf.predict_proba(X_test_arr)

In [34]:
ss_df_rf = pd.DataFrame({
    "Id": test_ids,
    "choose": rf_probs,
})

ss_df_rf.head(10)

Unnamed: 0,Id,choose
0,10000,0.026661
1,10001,0.170656
2,10002,0.002056
3,10003,0.021787
4,10004,0.148197
5,10005,0.526403
6,10006,0.032704
7,10007,0.199144
8,10008,3.4e-05
9,10009,0.350547


In [35]:
ss_df_rf.to_csv("sample_submission_ermnik_RF.csv", sep=",", index=False)

### Adaboost

In [36]:
%%time
N = 10
konstructor_kwargs = {"mad": 1e-5, "reg_lasso": 0.001, "reg_redge": 0.00001, "alpha": 0.875, "num_of_iter": 1e6}

ABLR = AdaBoost(N, LogisticRegression, konstructor_kwargs)
auc_arr = ABLR.cross_validation(X_std, y)
print(f"Mean AUC_ROC on cross validation for AdaBoost model: {auc_arr.mean()}")

Mean AUC_ROC on cross validation for AdaBoost model: 0.8540602195878388
Wall time: 10.1 s


In [37]:
ABLR.fit(X_std, y)

In [38]:
probs = ABLR.predict_proba(X_test_std)

In [39]:
ss_df_ablr = pd.DataFrame({
    "Id": test_ids,
    "choose": probs,
})

ss_df_ablr.head(10)

Unnamed: 0,Id,choose
0,10000,0.289784
1,10001,0.816534
2,10002,0.246697
3,10003,0.301998
4,10004,0.800237
5,10005,0.956521
6,10006,0.397534
7,10007,0.8389
8,10008,0.00865
9,10009,0.842895


Сохранение в файл

In [41]:
ss_df_ablr.to_csv("sample_submission_ermnik_ABLR.csv", sep=",", index=False)