### Курсова работа по курсу Алгоритмы анализа данных
#### Классификация  - предсказание вероятности выбора класса с помощью логистической регрессии

In [137]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [138]:
# Загружаем тренировочный датасет
df = pd.read_csv('train.csv')
df

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,55.0,2.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,79.0,0
9996,9996,53.0,2.0,1350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0
9997,9997,44.0,5.0,1750.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,59.0,0
9998,9998,41.0,0.0,1700.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,72.0,0


In [139]:
# Производим отбор признаков и целевых значений
features = ['age', 'years_of_experience', 'lesson_price', 'qualification',
            'physics', 'chemistry', 'biology', 'english', 'geography', 'history', 'mean_exam_points']
target = ['choose']

X = np.array(df[features], dtype=np.float64)
y = np.array(df[target], dtype=np.float64)

In [140]:
# функция для стандартизации входных данных
def standard_scale(x):
    res = (x - x.mean()) / x.std()
    return res

In [141]:
X = standard_scale(X)
X

array([[-0.25448068, -0.32289475,  3.87968404, ..., -0.32289475,
        -0.32289475, -0.17824786],
       [-0.22125099, -0.31898538,  2.12046501, ..., -0.32289475,
        -0.32094007, -0.21147755],
       [-0.26620881, -0.31703069,  3.09780891, ..., -0.32289475,
        -0.32289475, -0.19388536],
       ...,
       [-0.23688849, -0.31312132,  3.09780891, ..., -0.32289475,
        -0.32094007, -0.20756817],
       [-0.24275255, -0.32289475,  3.00007452, ..., -0.32289475,
        -0.32289475, -0.18215723],
       [-0.24275255, -0.31312132,  2.02273062, ..., -0.32289475,
        -0.32289475, -0.24470724]])

In [142]:
# разбиваем выборку на тренировочную и тестовую
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25,random_state=42)

In [143]:
# функция для расчета логистической регрессии 
def sigmoid(z):
    res = 1 / (1 + np.exp(-z))
    return res

In [144]:
# функция для расчета ошибки и градиентного спуска
def log_loss(w, X, y):
    m = X.shape[0]
    # используем функцию сигмоиды, написанную ранее
    A = sigmoid(np.dot(X, w.reshape(11, -1)))
        
    # labels 0, 1
    loss = -1.0 / m * np.sum(y * np.log(A) + (1 - y) * np.log(1 - A))
    
    grad = 1.0 / m * X.T @ (A - y)

    return loss, grad

In [145]:
# реализация самого градиентного спуска для поиска необходимых весов
def optimize(w, X, y, n_iterations, eta):
    # потери будем записывать в список для отображения в виде графика
    losses = []
    
    for i in range(n_iterations):        
        loss, grad = log_loss(w, X, y)
        w = w - eta * grad.reshape(w.shape)

        losses.append(loss)
        
    return w, losses

In [165]:
# функция предсказания класса
def predict(w, X):
    
    m = X.shape[0]
    
    y_predicted = np.zeros(m)

    A = np.squeeze(sigmoid(np.dot(X, w)))

    # За порог отнесения к тому или иному классу примем вероятность 0.6
    for i in range(A.shape[0]):
        if (A[i] > 0.6): 
            y_predicted[i] = 1
        elif (A[i] <= 0.6):
            y_predicted[i] = 0

    return y_predicted #A

In [166]:
# иницилизируем начальный вектор весов
w0 = np.zeros(X_train.shape[1])

n_iterations = 1000
eta = 0.1

w, losses = optimize(w0, X_train, y_train, n_iterations, eta)

y_predicted_test = predict(w, X_test)
y_predicted_train = predict(w, X_train)

# В качестве меры точности возьмем долю правильных ответов
train_accuracy = np.mean(y_predicted_train == y_train) * 100.0
test_accuracy = np.mean(y_predicted_test == y_test) * 100.0

print(f"Итоговый вектор весов w: {w}")
print(f"Точность на обучающей выборке: {train_accuracy:.3f}")
print(f"Точность на тестовой выборке: {test_accuracy:.3f}")

Итоговый вектор весов w: [ 0.12076628  0.15729723 -0.57332658  0.16014242  0.15889776  0.15537341
  0.15464574  0.15361945  0.15336639  0.1534434   0.2728645 ]
Точность на обучающей выборке: 89.160
Точность на тестовой выборке: 88.160


In [167]:
# функция предсказания вероятности класса
def calc_pred_proba(w, X):

    A = np.squeeze(sigmoid(np.dot(X, w))) 

    return A

In [168]:
# тестовый набор данных на котором делаем предсказание вероятности выбора репетитора
df_test = pd.read_csv('test.csv')
df_test.drop(['Id'], axis=1, inplace=True)
data_test = np.array(df_test, dtype=np.float64)

In [169]:
#сомневался делать ли стандартизацию выборки для предсказания
#data_test = standard_scale(data_test)

In [170]:
test_answers = calc_pred_proba(w, data_test)

In [171]:
df_pred = pd.read_csv('sample_submission.csv')
df_pred['choose'] = test_answers

In [172]:
df_pred.to_csv('to_kaggle.csv', index=False)