In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [3]:
import warnings
# Suppress every warning for the rest of the session. Note that this is a
# blanket filter: it hides all warning categories, not only cosmetic ones.
warnings.filterwarnings('ignore')

In [4]:
# Paths to the input datasets and to the submission file that is written
# at the end of the notebook:

TRAIN_DATASET_PATH = 'train.csv'

TEST_DATASET_PATH = 'test.csv'
TEST_DATASET_EXT_PATH = 'ERazdumina_submission.csv'

In [5]:
# Load the training dataset and preview its first rows:

df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


In [6]:
# Show the column names of the training frame.
df.columns

Index(['Id', 'age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points', 'choose'],
      dtype='object')

In [7]:
# Columns used as model inputs. The row identifier 'Id' is deliberately NOT
# part of this list: it is an arbitrary key, not a property of the tutor, so
# feeding it to the model only adds a spurious input. The submission cell
# reads 'Id' straight from the test frame, so it is still available there.
feature_names = [
    'age', 'years_of_experience', 'lesson_price', 'qualification',
    'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
    'mean_exam_points',
]
# Target column, kept as a one-element list so that df[target_name] yields a
# single-column 2-D selection.
target_name = ['choose']

In [8]:
# Pull the feature matrix and the target column out of the training frame
# as plain NumPy arrays.
data = df[feature_names].values
labels = df[target_name].values

In [9]:
# Keep the full training arrays under the names used by the cells below.
X_train_full, y_train_full = data, labels

In [10]:
def calc_std_feat(X):
    """Standardize every row of ``X`` to zero mean and unit variance.

    Statistics are computed along ``axis=1``, so each row is treated as one
    feature and each column as one observation.

    Parameters
    ----------
    X : array-like
        Values to standardize. The input itself is not modified.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X`` holding ``(X - mean) / std`` per
        row. A row whose standard deviation is zero is returned as all zeros
        instead of NaN/inf.
    """
    X_transformed = np.asarray(X)
    X_mean = X_transformed.mean(axis=1, keepdims=True)
    X_std = X_transformed.std(axis=1, keepdims=True)

    # A constant row has std == 0; dividing by it would produce NaN/inf and
    # poison every later dot product. Centering already maps such a row to
    # zeros, so dividing by 1 keeps that result.
    X_std = np.where(X_std == 0, 1.0, X_std)

    return (X_transformed - X_mean) / X_std

In [11]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z``; a scalar input gives a scalar result.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow, whereas the
    # naive exp(-z) overflows to inf for large negative z.
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook formula; for z < 0 the algebraically
    # equal form exp(z) / (1 + exp(z)) is used.
    res = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return res[()]

In [12]:
def calc_logloss(y, y_pred):
    """Mean binary cross-entropy (log loss) between labels and probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities. The caller's object is not modified.

    Returns
    -------
    float
        ``-mean(y * log(p) + (1 - y) * log(1 - p))`` where ``p`` is
        ``y_pred`` clipped to ``[1e-5, 1 - 1e-5]``.
    """
    tol = 1e-5
    # Accept plain Python sequences as well as arrays.
    y = np.asarray(y, dtype=float)
    # Clipping keeps both logarithms finite when a probability is exactly
    # 0 or 1; np.clip returns a new array, so no explicit copy is needed.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), tol, 1 - tol)

    err = -np.mean(y * np.log(y_pred) + (1.0 - y) * np.log(1.0 - y_pred))
    return err

In [13]:
def eval_model(X, y, iterations, alpha=1e-4, tol=1e-5):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix indexed as (features, samples): ``X.shape[0]`` sets
        the number of weights and ``X.shape[1]`` the averaging denominator.
    y : numpy.ndarray
        Target labels, broadcast against the predictions.
    iterations : int
        Maximum number of gradient steps.
    alpha : float
        Learning rate.
    tol : float
        Early-stopping threshold on the absolute change of the log loss
        between two consecutive steps (checked from the third step on).

    Returns
    -------
    numpy.ndarray
        The fitted weights. Their shape follows the broadcasting of
        ``y_pred - y`` in the update, so a 2-D ``y`` yields 2-D weights.

    Notes
    -----
    Reseeds NumPy's global random generator with 42 on every call.
    """
    np.random.seed(42)
    n_features, n_samples = X.shape[0], X.shape[1]
    W = np.random.randn(n_features)
    loss_history = []

    for step in range(1, iterations + 1):
        y_pred = sigmoid(np.dot(W, X))
        loss = calc_logloss(y, y_pred)

        # Gradient of the mean log loss with respect to the weights.
        gradient = 1/n_samples * np.dot((y_pred - y), X.T)
        W = W - alpha * gradient
        loss_history.append(loss)

        # Stop once the loss has effectively stopped changing.
        if step > 2 and np.abs(loss - loss_history[-2]) < tol:
            break

    return W

In [14]:
def predict(W, X, trash: float = 0.5):
    """Predict binary class labels from fitted weights.

    Parameters
    ----------
    W : numpy.ndarray
        Weights; reshaped internally to a column of ``X.shape[0]`` entries.
    X : numpy.ndarray
        Feature matrix with features on the first axis.
    trash : float
        Decision threshold (the parameter name is kept as-is for backward
        compatibility). Probabilities strictly greater than it map to 1,
        everything else to 0.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels; for a 2-D ``X`` its shape is
        ``(1, X.shape[1])``.
    """
    W = W.reshape(X.shape[0], 1)

    A = sigmoid(np.dot(W.T, X))

    return np.where(A > trash, 1, 0)

In [15]:
def accuracy(y, y_pred):
    """Return the percentage (0-100) of predictions that match the labels.

    Computed as 100 minus the mean absolute difference between ``y_pred``
    and ``y`` expressed in percent, which equals the share of exact matches
    when both arrays contain only 0 and 1.
    """
    mismatch_pct = np.abs(y_pred - y) * 100.0
    return 100.0 - np.mean(mismatch_pct)

In [16]:
# Put features on rows and samples on columns, standardize each feature,
# and lay the target out as a single row to match.
X_train_tr = calc_std_feat(X_train_full.T)
y_train_tr = y_train_full.reshape(1, len(y_train_full))

In [18]:
# Gradient-descent hyperparameters.
n_iterations = 100_000
learning_rate = 0.1

# Fit the weights on the standardized training data.
W = eval_model(X_train_tr, y_train_tr, n_iterations, alpha=learning_rate)
    

In [19]:
# Load the test dataset.
df_test = pd.read_csv(TEST_DATASET_PATH)

In [20]:
# Extract the model's feature columns from the test frame as a NumPy array.
# Note: this rebinds `data`, which held the training features until now.
data = df_test[feature_names].values

In [21]:
# Standardize the test features with the TRAINING mean/std so they live on
# the same scale the weights were fitted on. Re-estimating the statistics on
# the test set would shift and scale the inputs differently from training.
_train_T = X_train_full.transpose()
_train_mean = _train_T.mean(axis=1, keepdims=True)
_train_std = _train_T.std(axis=1, keepdims=True)
# Guard against constant training features (std == 0) to avoid NaN/inf.
_train_std = np.where(_train_std == 0, 1.0, _train_std)
X_test = (data.transpose() - _train_mean) / _train_std

In [22]:
# Predict 0/1 labels for the test features using the fitted weights.
test_answers = predict(W, X_test)

In [23]:
# Turn the row of predictions into a single column. reshape(-1, 1) is used
# instead of transpose() so that re-running this cell is a no-op rather than
# flipping the array back into a row.
test_answers = test_answers.reshape(-1, 1)

In [24]:
# Attach the predictions to the test frame and write the Id/choose pairs to
# the submission file, without the DataFrame index.
df_test['choose'] = test_answers
df_test[['Id', 'choose']].to_csv(TEST_DATASET_EXT_PATH, index = False)