In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# File locations used by the notebook.

# Inputs: training set and test set.
TRAIN_DATASET_PATH, TEST_DATASET_PATH = 'train.csv', 'test.csv'

# Output: CSV file that receives the predictions for the test set.
TEST_DATASET_EXT_PATH = 'ERazdumina_submission.csv'

In [4]:
# Read the training set from disk and preview its first five rows.

df = pd.read_csv(filepath_or_buffer=TRAIN_DATASET_PATH)
df.head(n=5)

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


In [5]:
# Display the column labels of the training frame.
df.columns

Index(['Id', 'age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points', 'choose'],
      dtype='object')

In [6]:
# Columns fed to the model.
# 'Id' is deliberately left out: it is a row identifier (it mirrors the row
# number in the preview above), and a distance-based model would otherwise
# treat numerically close ids as similar objects.
feature_names = ['age', 'years_of_experience', 'lesson_price', 'qualification',
                 'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
                 'mean_exam_points']

# Kept as a one-element list, so selecting it from a DataFrame yields a
# single-column frame rather than a Series.
target_name = ['choose']

In [7]:
# Pull the training features and the target out of the frame as numpy arrays.
# `labels` stays two-dimensional (one column) because `target_name` is a list.
data = df[feature_names].values
labels = df[target_name].values

In [8]:
def calc_std_feat(X):
    """Standardize each feature (column) of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix. It is copied, never modified in place.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X`` in which every column has
        had its own mean subtracted and has been divided by its own standard
        deviation. Columns that are constant come back as all zeros.

    Notes
    -----
    The mean and standard deviation are computed from ``X`` itself on every
    call, so two different matrices are scaled with different statistics.
    """
    X_transformed = np.array(X, dtype=float)

    # Statistics are taken down the rows (axis=0), i.e. one mean/std per
    # feature. Taking them across a row would mix unrelated features
    # (e.g. age and lesson price) into a single scale.
    X_mean = X_transformed.mean(axis=0, keepdims=True)
    X_std = X_transformed.std(axis=0, keepdims=True)

    # A constant feature has zero spread; dividing by it would produce NaN.
    # Using 1 as the divisor leaves such a column at exactly zero.
    X_std[X_std == 0] = 1.0

    return (X_transformed - X_mean) / X_std

In [9]:
# Flatten the single-column label array to 1-D and standardize the
# training features.
y_train = labels.reshape(len(labels))
X_train = calc_std_feat(X=data)

In [10]:
# Read the test set and take the same feature columns that were used for
# training.
# NOTE(review): this rebinds `data`, which until now held the training
# features; re-running an earlier cell afterwards would pick up test data.
df_test = pd.read_csv(filepath_or_buffer=TEST_DATASET_PATH)
data = df_test[feature_names].values

In [11]:
# Standardize the test features with the same routine used for training.
X_test = calc_std_feat(X=data)

In [12]:
def e_metrics(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x1, x2 : array-like
        Numeric vectors of identical shape.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared element-wise differences;
        ``0.0`` for two empty vectors.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check a
        longer second vector would have its trailing elements ignored.
    """
    first = np.asarray(x1, dtype=float)
    second = np.asarray(x2, dtype=float)

    if first.shape != second.shape:
        raise ValueError(
            f"shape mismatch: {first.shape} vs {second.shape}"
        )

    # One vectorized pass instead of a Python-level loop over the elements.
    return np.sqrt(np.sum(np.square(first - second)))

In [13]:
def knn(x_train, y_train, x_test, k):
    """Classify every row of ``x_test`` by majority vote of its ``k`` nearest
    training objects (Euclidean distance).

    Parameters
    ----------
    x_train : array-like of shape (n_train, n_features)
        Training objects.
    y_train : array-like with n_train elements
        Class label of each training object; flattened to 1-D internally.
    x_test : iterable of array-like, each of length n_features
        Objects to classify.
    k : int
        Number of nearest neighbours that vote. Values larger than
        ``n_train`` simply use all training objects.

    Returns
    -------
    list
        One predicted label per object in ``x_test``, in the same order.
        When several classes receive the same number of votes, the one that
        comes last in ``np.unique(y_train)`` order is returned.
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train).ravel()

    # The set of classes does not depend on the query object, so it is
    # computed once instead of once per test object.
    class_labels = np.unique(y_train)

    answers = []

    for x in x_test:
        # Distances from the query object to every training object at once.
        deltas = x_train - np.asarray(x, dtype=float)
        distances = np.sqrt(np.square(deltas).sum(axis=1))

        # Order by distance; equal distances are ordered by label, which is
        # the same order a sort of (distance, label) pairs would give.
        nearest = np.lexsort((y_train, distances))[:k]

        # Count how often each class occurs among the k nearest neighbours.
        classes = {class_item: 0 for class_item in class_labels}
        for label in y_train[nearest]:
            classes[label] += 1

        # The most frequent class is the answer. The sort is stable, so on
        # equal counts the class listed last in `class_labels` wins.
        answers.append(sorted(classes, key=classes.get)[-1])

    return answers

In [14]:
# Number of neighbours that vote on each prediction.
k = 10

# Predict a class for every object of the test set.
test_answers = knn(x_train=X_train, y_train=y_train, x_test=X_test, k=k)

In [15]:
# Attach the predictions to the test frame, then write the 'Id' and 'choose'
# columns to the submission file without the index column.
df_test['choose'] = test_answers
df_test.loc[:, ['Id', 'choose']].to_csv(path_or_buf=TEST_DATASET_EXT_PATH, index=False)