In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor

# classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor

# Загружаем данные

In [2]:
def split_data_to_X_y(df, y_cols):
    X = df.drop(y_cols, axis=1)
    y = df[y_cols]
    return X, y

## train

In [3]:
df_train = pd.read_csv('data/Nti2018_task2Train.csv', index_col='index')

# удаляем ненужные колонки
cols_to_drop = ['brs_id']
df_train = df_train.drop(cols_to_drop, axis=1)

# находим колонки которые нужно предсказывать
y_cols = df_train.columns[df_train.columns.str.contains('Avg')]

# удаляем дубликаты
# df_train.drop_duplicates(inplace=True)

print(df_train.shape)
df_train.head()

(4720, 69)


Unnamed: 0_level_0,Направление,Основа,Формирующее подразделение (институт),Формирующее подразделение (департамент),Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче,Код,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4031,uncass18ggl5g0000j12bmf3eofkej3s,сверхплановое место,"Физической культуры, спорта и молодежной политики",0,5,2013,5.0,0.0,0.0,43.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,uncass18ggl5g0000kail4d1g44ti8bs,сверхплановое место,Строительный,0,3,2015,4.0,60.49,57.65,08.00.00,...,0.986207,0.0,0.0,0.666667,0.956915,0.0,0.0,0.0,0.0,0.0
5292,uncass18ggl5g0000kaou92csl390qkc,госбюджетное место,Радиоэлектроники и информационных технологий -...,Школа профессионального и академического образ...,2,2016,5.5,56.1,56.1,11.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1028,uncass18ggl5g0000kail4d1g44ti8bs,сверхплановое место,Строительный,0,2,2016,3.6,0.0,0.0,08.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2508,uncass18ggl5g0000kail4d1g44ti8bs,госбюджетное место,Строительный,0,3,2015,4.0,70.34,69.71,08.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
data_train, y_train = split_data_to_X_y(df_train, y_cols)
print(data_train.shape, y_train.shape)

(4720, 43) (4720, 26)


## test

In [5]:
data_test = pd.read_csv('data/xTestFinal_2.csv', index_col='index')
print(data_test.shape)
data_test.head()

(1180, 43)


Unnamed: 0_level_0,Направление,Основа,Формирующее подразделение (институт),Формирующее подразделение (департамент),Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче,Код,...,Биология,Физика,Академический рисунок,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств,id,username
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,uncass18hc2jg0000l6udkbc8999b154,госбюджетное место,Естественных наук и математики,Школа наук,1,2017,2.0,0.0,0.0,04.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,298459,egormat97
1,uncass18ggl5g0000kain7dt2j01t94k,госбюджетное место,Новых материалов и технологий,Машиностроения,3,2015,5.0,0.0,0.0,15.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160700,pirojok_rojok
2,uncass18ggl5g0000kaqsplbivu3ie48,сверхплановое место,Государственного управления и предпринимательства,Государственного и муниципального управления,1,2017,5.0,0.0,0.0,38.00.00,...,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,200655,mvlysenko
3,uncass18ggl5g0000kainfe9qstfrflk,госбюджетное место,Новых материалов и технологий,Машиностроения,2,2016,4.0,61.26,60.57,23.00.00,...,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,280247,petr_kozlov
4,uncass18hc2jg0000l7f8qta25odb73g,госбюджетное место,Химико-технологический,0,1,2017,2.0,0.0,0.0,18.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146479,SiD28ru


# Генерируем признаки

In [28]:
# объединим train и test в один датафрейм
train_size = len(data_train)
print(train_size)

data = pd.concat([data_train, data_test])
print(data.shape)

4720
(5900, 43)


## Категориальные

In [29]:
cat_cols = [
    'Направление',
    'Основа',
    'Формирующее подразделение (институт)',
    'Формирующее подразделение (департамент)',
    'Курс',
    'Год начала',
    'Код',
    'ОКСО',
    'Направление.1',
    'Уровень',
    'Форма',
    'Олимпиада',
]

In [42]:
def get_cat_features(data, cat_cols):
    return pd.get_dummies(data[cat_cols], columns=cat_cols, drop_first=False)

X_train_cat = get_cat_features(data_train, cat_cols)
print(X_train_cat.shape)

X_test_cat = get_cat_features(data_test, cat_cols)
print(X_test_cat.shape)

(4720, 854)
(1180, 629)


In [43]:
def prepare_str_col(col):
    if col.dtype != object:
        return col
    return col.str.lower().str.replace(' +', ' ').str.strip()

In [44]:
data = data.apply(prepare_str_col)
X_cat = get_cat_features(data, cat_cols)
print(X_cat.shape)

(5900, 885)


# Count Encoder

In [47]:
def count_ecoder(col):
    counts = col.value_counts()
    return col.map(counts)

X_count = data[cat_cols].apply(count_ecoder)
print(X_count.shape)

(5900, 12)


## Числовые признаки - оценки за ЕГЭ и сроки

In [48]:
ege_cols = [
    'Физическая культура',
    'Рисунок',
    'Химия',
    'География',
    'Информатика и ИКТ',
    'История',
    'Математика',
    'Композиционный рисунок',
    'Композиция',
    'Рисунок и композиция',
    'Русский язык',
    'Творческий конкурс',
    'Собеседование',
    'Филология (отечественная)',
    'Творческое сочинение',
    'Литература',
    'Филология',
    'Биология',
    'Физика',
    'Академический рисунок',
    'Обществознание',
    'Английский язык',
    'Вступительный экзамен по программе',
    'Иностранный язык',
    'История искусств',
]

numerical_cols = [
    'Курс',
    'Год начала',
    'Срок освоения',
    'Средняя оценка ',
    'Средняя оценка по первой сдаче',
]

X_cont = data[ege_cols + numerical_cols]
print(X_cont.shape)
X_cont.head()

(5900, 30)


Unnamed: 0_level_0,Физическая культура,Рисунок,Химия,География,Информатика и ИКТ,История,Математика,Композиционный рисунок,Композиция,Рисунок и композиция,...,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств,Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5,2013,5.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,2015,4.0,60.49,57.65
5292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2,2016,5.5,56.1,56.1
1028,0.0,0.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2,2016,3.6,0.0,0.0
2508,0.0,0.0,0.0,0.0,0.0,0.0,70.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,2015,4.0,70.34,69.71


## Бинарные признаки - сдавал ли ЕГЭ

In [49]:
X_bin = data[ege_cols] != 0
print(X_bin.shape)
X_bin.head()

(5900, 25)


Unnamed: 0_level_0,Физическая культура,Рисунок,Химия,География,Информатика и ИКТ,История,Математика,Композиционный рисунок,Композиция,Рисунок и композиция,...,Литература,Филология,Биология,Физика,Академический рисунок,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4031,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3880,False,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False
5292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1028,False,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2508,False,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


## Объединяем все признаки

In [50]:
X = pd.concat([X_cat, X_count, X_cont, X_bin], axis=1)
print(X.shape)

# разбиваем обратно на train и test
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
print(X_train.shape, X_test.shape)

(5900, 952)
(4720, 952) (1180, 952)


# Обучаем и оцениваем модель
## Предсказываем, проходил ли курс или нет

In [51]:
def print_score(y_bin, y_bin_pred):
    for col in y_cols:
        print('{}'.format(col))
        top_class_rate = y_train_bin[col].value_counts(normalize=True)[0]
        acc = accuracy_score(y_train_bin[col], y_train_bin_pred[col])
        prec = precision_score(y_train_bin[col], y_train_bin_pred[col])
        rec = recall_score(y_train_bin[col], y_train_bin_pred[col])
        print('\t{:.3} {:.3}'.format(acc, top_class_rate))
        print('\t{:.3} {:.3}'.format(prec, rec))

In [52]:
y_train_bin = y_train != 0
y_train_bin.head()

Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4031,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3880,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,True,False,False,False,False,False
5292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1028,True,False,False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2508,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [53]:
model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
mo_model = MultiOutputClassifier(model)
%time mo_model.fit(X_train, y_train_bin)

CPU times: user 6min 27s, sys: 14.8 s, total: 6min 42s
Wall time: 50.8 s


MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1)

In [61]:
%time y_test_bin_pred = mo_model.predict(X_test)
y_test_bin_pred = pd.DataFrame(y_test_bin_pred, index=X_test.index, columns=y_cols)
print(y_test_bin_pred.shape)
y_test_bin_pred.head()

CPU times: user 11.9 s, sys: 1.4 s, total: 13.3 s
Wall time: 8.18 s
(1180, 26)


Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Предсказываем среднюю оценку за курс

In [62]:
def get_score(y, y_pred):
    mse = mean_squared_error(y, y_pred)
    msez = mean_squared_error(y, np.zeros(y.shape))
    return 10 * (1 - mse / msez)

print(get_score(y_train, np.zeros(y_train.shape)))
print(get_score(y_train, y_train))

0.0
10.0


### 1

In [63]:
model = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
mo_model = MultiOutputRegressor(model)
%time mo_model.fit(X_train, y_train)

CPU times: user 2h 17min 5s, sys: 5.12 s, total: 2h 17min 10s
Wall time: 4min 20s


MultiOutputRegressor(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
           n_jobs=1)

In [64]:
y_test_pred = mo_model.predict(X_test)
y_test_pred = pd.DataFrame(y_test_pred, index=data_test.index, columns=y_cols)
y_test_pred.head()

y_test_pred[~y_test_bin_pred] = 0
y_test_pred.to_csv('res/out_1_1.csv')

Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,3.40691e-06,0.0,0.0,9.453755e-07,0.0,0.0,2e-06,...,0.253597,0.0,0.0,0.0,2.362775e-06,0.0,5e-06,0.0007364264,0.0,0.207822
1,0.315762,0.276757,0.0,0.303438,0.0,0.299486,8.858696e-05,0.279573,0.266167,2e-06,...,0.0,0.29209,0.0,0.0,1.492774e-07,0.0,0.252551,2.248076e-05,0.0,7.9e-05
2,0.241049,0.010448,0.044971,2.439983e-06,0.001988,0.0,0.0003471256,0.0,0.0,2e-06,...,0.0,0.004764,0.000125,0.0,0.002072103,0.0,0.000167,0.06751222,0.0,0.051418
3,0.0,0.0,0.000222,5.591089e-07,0.0,0.0,3.445109e-05,0.0,0.0,2e-06,...,0.0,0.0,0.000514,0.0,2.325862e-06,0.0,0.230871,0.00441522,0.0,0.002678
4,0.0,0.0,0.0,3.40691e-06,0.0,0.0,9.453755e-07,0.0,0.0,2e-06,...,0.0,0.0,0.0,0.0,2.362775e-06,0.0,4e-06,9.506184e-07,5.7e-05,3e-06


### 2

In [69]:
model = LinearRegression(n_jobs=-1)
mo_model = MultiOutputRegressor(model)
%time mo_model.fit(X_train, y_train)

CPU times: user 1min 39s, sys: 9.54 s, total: 1min 48s
Wall time: 6.47 s


MultiOutputRegressor(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False),
           n_jobs=1)

In [71]:
y_test_pred = mo_model.predict(X_test)
y_test_pred = pd.DataFrame(y_test_pred, index=data_test.index, columns=y_cols)
y_test_pred.head()

# y_test_pred[~y_test_bin_pred] = 0
y_test_pred.to_csv('res/out_1_2.csv')