In [65]:
import itertools

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor

# classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor

# Загружаем данные

In [66]:
def split_data_to_X_y(df, y_cols):
    """Split a frame into a feature frame and a target frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds both feature and target columns.
    y_cols : list-like of column labels
        Labels of the target columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``y_cols`` and ``y`` is
        ``df`` restricted to ``y_cols``.
    """
    targets = df[y_cols]
    features = df.drop(y_cols, axis=1)
    return features, targets

## train

In [67]:
# Load the training table (the CSV's 'index' column becomes the row index)
# and immediately drop the columns that are not needed.
cols_to_drop = ['brs_id']
df_train = (
    pd.read_csv('data/Nti2018_task2Train.csv', index_col='index')
    .drop(columns=cols_to_drop)
)

# Columns to predict: every column whose name contains 'Avg'.
is_target = df_train.columns.str.contains('Avg')
y_cols = df_train.columns[is_target]

# Duplicate removal is currently switched off.
# df_train.drop_duplicates(inplace=True)

print(df_train.shape)
df_train.head()

(4720, 69)


Unnamed: 0_level_0,Направление,Основа,Формирующее подразделение (институт),Формирующее подразделение (департамент),Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче,Код,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4031,uncass18ggl5g0000j12bmf3eofkej3s,сверхплановое место,"Физической культуры, спорта и молодежной политики",0,5,2013,5.0,0.0,0.0,43.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,uncass18ggl5g0000kail4d1g44ti8bs,сверхплановое место,Строительный,0,3,2015,4.0,60.49,57.65,08.00.00,...,0.986207,0.0,0.0,0.666667,0.956915,0.0,0.0,0.0,0.0,0.0
5292,uncass18ggl5g0000kaou92csl390qkc,госбюджетное место,Радиоэлектроники и информационных технологий -...,Школа профессионального и академического образ...,2,2016,5.5,56.1,56.1,11.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1028,uncass18ggl5g0000kail4d1g44ti8bs,сверхплановое место,Строительный,0,2,2016,3.6,0.0,0.0,08.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2508,uncass18ggl5g0000kail4d1g44ti8bs,госбюджетное место,Строительный,0,3,2015,4.0,70.34,69.71,08.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# data_train = data_train.drop_duplicates()
# print(data_train.shape)

In [69]:
# Separate the feature columns from the 'Avg' target columns.
data_train, y_train = split_data_to_X_y(df_train, y_cols)
print(data_train.shape, y_train.shape)

(4720, 43) (4720, 26)


## test

In [42]:
# Read the final test features; the CSV's 'index' column becomes the row index.
# NOTE(review): a later cell assigns the hold-out part of train_test_split to
# the same name `data_test`, so which frame is in use depends on which cells ran.
data_test = pd.read_csv('data/xTestFinal_2.csv', index_col='index')
print(data_test.shape)
data_test.head()

(1180, 43)


Unnamed: 0_level_0,Направление,Основа,Формирующее подразделение (институт),Формирующее подразделение (департамент),Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче,Код,...,Биология,Физика,Академический рисунок,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств,id,username
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,uncass18hc2jg0000l6udkbc8999b154,госбюджетное место,Естественных наук и математики,Школа наук,1,2017,2.0,0.0,0.0,04.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,298459,egormat97
1,uncass18ggl5g0000kain7dt2j01t94k,госбюджетное место,Новых материалов и технологий,Машиностроения,3,2015,5.0,0.0,0.0,15.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160700,pirojok_rojok
2,uncass18ggl5g0000kaqsplbivu3ie48,сверхплановое место,Государственного управления и предпринимательства,Государственного и муниципального управления,1,2017,5.0,0.0,0.0,38.00.00,...,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,200655,mvlysenko
3,uncass18ggl5g0000kainfe9qstfrflk,госбюджетное место,Новых материалов и технологий,Машиностроения,2,2016,4.0,61.26,60.57,23.00.00,...,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,280247,petr_kozlov
4,uncass18hc2jg0000l7f8qta25odb73g,госбюджетное место,Химико-технологический,0,1,2017,2.0,0.0,0.0,18.00.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146479,SiD28ru


In [70]:
# Local validation: hold out 33% of the labelled rows.
# The split is derived from the untouched `df_train` rather than from
# `data_train` / `y_train`, because this cell rebinds those two names; splitting
# them directly would shrink the data again on every re-run of the cell.
features_all, targets_all = split_data_to_X_y(df_train, y_cols)
data_train, data_test, y_train, y_test = train_test_split(
    features_all,
    targets_all,
    test_size=0.33,
    random_state=13,  # fixed seed so the split is reproducible
)
print(data_train.shape, data_test.shape)

(3162, 43) (1558, 43)


# Генерируем признаки

In [71]:
# объединим train и test в один датафрейм
train_size = len(data_train)
print(train_size)

data = pd.concat([data_train, data_test])
print(data.shape)

3162
(4720, 43)


## Категориальные

In [72]:
# Columns treated as categorical; they are one-hot encoded and count encoded
# in the cells below. 'Курс' and 'Год начала' are also listed in `numerical_cols`.
cat_cols = [
    'Направление',
    'Основа',
    'Формирующее подразделение (институт)',
    'Формирующее подразделение (департамент)',
    'Курс',
    'Год начала',
    'Код',
    'ОКСО',
    'Направление.1',
    'Уровень',
    'Форма',
    'Олимпиада',
]

In [73]:
def get_cat_features(data, cat_cols):
    """One-hot encode the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    cat_cols : list of column labels
        Columns of ``data`` to encode.

    Returns
    -------
    pandas.DataFrame
        One indicator column per (column, value) pair; no level is dropped.
    """
    categorical = data[cat_cols]
    return pd.get_dummies(categorical, columns=cat_cols, drop_first=False)

# One-hot encode every categorical column over the combined train+test frame.
X_cat = get_cat_features(data, cat_cols)
print(X_cat.shape)

# Disabled experiment: keep only dummy columns whose column sum exceeds 5.
# X_cat = X_cat.loc[:, X_cat.sum() > 5]
# print(X_cat.shape)

(4720, 854)


## Count Encoder

In [74]:
def count_ecoder(col):
    """Replace every value of a Series with its number of occurrences.

    Parameters
    ----------
    col : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Same index as ``col``; each entry is how many times the original
        value occurs in ``col``.
    """
    frequency = col.value_counts()
    encoded = col.map(frequency)
    return encoded

# Count-encode each categorical column; frequencies are taken over the
# combined train+test frame.
X_count = data[cat_cols].apply(count_ecoder)
print(X_count.shape)

(4720, 12)


## Числовые признаки - оценки за ЕГЭ и сроки

In [75]:
# Entrance-exam score columns, used as numeric features.
# Cells below treat a value of 0 as "exam not taken".
ege_cols = [
    'Физическая культура',
    'Рисунок',
    'Химия',
    'География',
    'Информатика и ИКТ',
    'История',
    'Математика',
    'Композиционный рисунок',
    'Композиция',
    'Рисунок и композиция',
    'Русский язык',
    'Творческий конкурс',
    'Собеседование',
    'Филология (отечественная)',
    'Творческое сочинение',
    'Литература',
    'Филология',
    'Биология',
    'Физика',
    'Академический рисунок',
    'Обществознание',
    'Английский язык',
    'Вступительный экзамен по программе',
    'Иностранный язык',
    'История искусств',
]

# Remaining numeric columns (course, start year, duration, average grades).
# NOTE(review): 'Средняя оценка ' ends with a trailing space; presumably this
# matches the CSV header exactly — confirm before "fixing" it.
numerical_cols = [
    'Курс',
    'Год начала',
    'Срок освоения',
    'Средняя оценка ',
    'Средняя оценка по первой сдаче',
]

# Raw numeric feature block: exam scores followed by the other numeric columns.
X_cont = data[ege_cols + numerical_cols]
print(X_cont.shape)
X_cont.head()

(4720, 30)


Unnamed: 0_level_0,Физическая культура,Рисунок,Химия,География,Информатика и ИКТ,История,Математика,Композиционный рисунок,Композиция,Рисунок и композиция,...,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств,Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813,0.0,0.0,0.0,0.0,83.0,0.0,78.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2,2016,4.0,60.85,62.42
1856,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2,2016,4.0,54.45,55.53
846,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,2015,4.0,50.45,51.1
3899,0.0,0.0,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,2015,4.0,89.01,92.56
2744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,2015,5.0,85.83,85.83


## Бинарные признаки - сдавал ли ЕГЭ

In [76]:
# Non-zero indicator for every exam score column and every numeric column.
X_bin = data[ege_cols + numerical_cols].ne(0)
print(X_bin.shape)
X_bin.head()

(4720, 30)


Unnamed: 0_level_0,Физическая культура,Рисунок,Химия,География,Информатика и ИКТ,История,Математика,Композиционный рисунок,Композиция,Рисунок и композиция,...,Обществознание,Английский язык,Вступительный экзамен по программе,Иностранный язык,История искусств,Курс,Год начала,Срок освоения,Средняя оценка,Средняя оценка по первой сдаче
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813,False,False,False,False,True,False,True,False,False,False,...,False,False,False,False,False,True,True,True,True,True
1856,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,True,True,True,True
846,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,True,True,True,True
3899,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,True,True,True,True
2744,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,True,True


In [77]:
# Sum of scores for every unordered pair of exam columns; the column label is
# 'sum_' followed by the string form of the pair tuple.
pair_sums = {
    'sum_' + str(pair): data[list(pair)].sum(axis=1)
    for pair in itertools.combinations(ege_cols, 2)
}
X_ege_comb = pd.DataFrame(pair_sums, index=data.index)
print(X_ege_comb.shape)
X_ege_comb.head()

(4720, 300)


Unnamed: 0_level_0,"sum_('Физическая культура', 'Рисунок')","sum_('Физическая культура', 'Химия')","sum_('Физическая культура', 'География')","sum_('Физическая культура', 'Информатика и ИКТ')","sum_('Физическая культура', 'История')","sum_('Физическая культура', 'Математика')","sum_('Физическая культура', 'Композиционный рисунок')","sum_('Физическая культура', 'Композиция')","sum_('Физическая культура', 'Рисунок и композиция')","sum_('Физическая культура', 'Русский язык')",...,"sum_('Обществознание', 'Английский язык')","sum_('Обществознание', 'Вступительный экзамен по программе')","sum_('Обществознание', 'Иностранный язык')","sum_('Обществознание', 'История искусств')","sum_('Английский язык', 'Вступительный экзамен по программе')","sum_('Английский язык', 'Иностранный язык')","sum_('Английский язык', 'История искусств')","sum_('Вступительный экзамен по программе', 'Иностранный язык')","sum_('Вступительный экзамен по программе', 'История искусств')","sum_('Иностранный язык', 'История искусств')"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813,0.0,0.0,0.0,83.0,0.0,78.0,0.0,0.0,0.0,66.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1856,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,0.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
846,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3899,0.0,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Еще по егэ

In [84]:
# Per-student exam aggregates: number of non-zero scores, their sum and mean.
ege_scores = data[ege_cols]

X_ege = pd.DataFrame(index=data.index)
X_ege['ege_count'] = (ege_scores != 0).sum(axis=1)
X_ege['ege_sum'] = ege_scores.sum(axis=1)
# Rows with no non-zero score give 0 / 0 = NaN; those are set to 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is deprecated and has no effect under copy-on-write.
X_ege['ege_mean'] = (X_ege['ege_sum'] / X_ege['ege_count']).fillna(0)
X_ege.head()

Unnamed: 0_level_0,ege_count,ege_sum,ege_mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1813,3,227.0,75.666667
1856,3,175.0,58.333333
846,3,180.0,60.0
3899,3,238.0,79.333333
2744,0,0.0,0.0


## Статистики целевых переменных по факту сдачи ЕГЭ

In [103]:
# For every exam column, attach to each row the number of training rows with a
# non-zero value in each target, counted within the row's own group
# (exam score non-zero / exam score zero).
# NOTE(review): the counts are computed from the same training rows they are
# attached to, so these features carry target information for the train part.
y_train_nonzero = y_train != 0  # loop-invariant, computed once

stat_frames = []
for col_name in ege_cols:
    took_exam_train = data_train[col_name] != 0
    # Rows: False / True (whichever occur in train); columns: targets.
    stats = y_train_nonzero.groupby(took_exam_train).sum()
    # Look up each row's group. The original referenced an undefined name
    # (`qwe`) here. A group that never occurs in train gets a count of 0
    # instead of raising KeyError.
    tmp = stats.reindex(X_bin[col_name].to_numpy(), fill_value=0)
    tmp.index = X_bin.index
    stat_frames.append(tmp)

# Concatenate once instead of growing the frame inside the loop.
X_stat = pd.concat([pd.DataFrame(index=data.index)] + stat_frames, axis=1)
print(X_stat.shape)

(4720, 650)


## Объединяем все признаки

In [104]:
# Feature blocks that go into the final design matrix, side by side.
feature_blocks = [
    X_cat,
    X_count,
    X_cont,
    X_bin,
    # X_ege_comb,  # pairwise exam sums are currently left out
    X_ege,
    X_stat,
]
X = pd.concat(feature_blocks, axis=1)
print(X.shape)

# Split back into train and test: the first `train_size` rows are train.
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
print(X_train.shape, X_test.shape)

(4720, 1579)
(3162, 1579) (1558, 1579)


# Обучаем и оцениваем модель
## Предсказываем, проходил ли курс или нет

In [105]:
def print_score(y_bin, y_bin_pred):
    """Print per-target binary classification metrics.

    For every column of ``y_bin`` this prints the column name, then accuracy
    next to the share of the most frequent class (the accuracy of a constant
    predictor), then precision and recall.

    Parameters
    ----------
    y_bin : pandas.DataFrame
        True binary labels, one column per target.
    y_bin_pred : pandas.DataFrame
        Predicted binary labels with the same columns as ``y_bin``.
    """
    # The original body ignored both arguments and read notebook globals.
    for col in y_bin.columns:
        y_true = y_bin[col]
        y_pred = y_bin_pred[col]
        print('{}'.format(col))
        # value_counts sorts by frequency, so the first entry is the top class;
        # .iloc avoids the ambiguous label/position lookup of `[0]`.
        top_class_rate = y_true.value_counts(normalize=True).iloc[0]
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        print('\t{:.3} {:.3}'.format(acc, top_class_rate))
        print('\t{:.3} {:.3}'.format(prec, rec))

In [106]:
# Binary targets: True where the corresponding 'Avg' value is non-zero.
y_train_bin = y_train.ne(0)
y_train_bin.head()

Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1856,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
846,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3899,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2744,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [107]:
def get_train_test_probs(model, X_train, y_train, X_test):
    """Estimate per-target probabilities of a non-zero value with one model.

    Train probabilities are out-of-fold (3-fold ``cross_val_predict``); test
    probabilities come from the model fitted on the whole training set.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``predict_proba``.
    X_train, X_test : pandas.DataFrame
        Feature frames.
    y_train : pandas.DataFrame
        Targets, one column per target; they are binarised here as
        ``y_train != 0``, so an already boolean frame is accepted as well.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_prob, X_test_prob)``, indexed like ``X_train`` / ``X_test``,
        one column per target holding the probability of the positive class.
    """
    # Use the argument; the original ignored it and read global state.
    y_bin = y_train != 0
    target_cols = y_bin.columns

    # Out-of-fold probabilities for the training rows, one target at a time.
    # NOTE(review): `[:, 1]` assumes both classes are present in every fold.
    oof_probs = {
        col: cross_val_predict(
            model,
            X_train,
            y_bin[col],
            method='predict_proba',
            cv=3,
        )[:, 1]
        for col in target_cols
    }
    X_train_prob = pd.DataFrame(oof_probs, index=X_train.index, columns=target_cols)

    # Fit on all training rows and predict the test rows.
    mo_model = MultiOutputClassifier(model)
    mo_model.fit(X_train, y_bin)
    probas_list = mo_model.predict_proba(X_test)
    # predict_proba yields one (n_samples, n_classes) array per target;
    # keep the positive-class column of each and stack them side by side.
    X_test_prob = np.column_stack([probs[:, 1] for probs in probas_list])
    X_test_prob = pd.DataFrame(X_test_prob, index=X_test.index, columns=target_cols)

    return X_train_prob, X_test_prob

def get_probs(X_train, y_train, X_test, models):
    """Collect probability features from several classifiers.

    Calls ``get_train_test_probs`` once per model and places the resulting
    frames side by side, in the order of ``models``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_prob, X_test_prob)`` indexed like ``X_train`` / ``X_test``.
    """
    # Seed with empty frames so the result keeps the index even with no models.
    train_parts = [pd.DataFrame(index=X_train.index)]
    test_parts = [pd.DataFrame(index=X_test.index)]
    for model in models:
        train_prob, test_prob = get_train_test_probs(model, X_train, y_train, X_test)
        train_parts.append(train_prob)
        test_parts.append(test_prob)
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

models = [
    RandomForestClassifier(n_estimators=1000, n_jobs=-1), 
    LogisticRegression(n_jobs=1), 
]
%time X_train_prob, X_test_prob = get_probs(X_train, y_train, X_test, models)
print(X_train_prob.shape, X_test_prob.shape)

  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)


CPU times: user 31min 11s, sys: 1min 5s, total: 32min 17s
Wall time: 14min 9s
(3162, 52) (1558, 52)


## Предсказываем среднюю оценку за курс

In [108]:
def get_score(y, y_pred):
    """Score predictions relative to an all-zero baseline.

    Returns ``10 * (1 - MSE(y, y_pred) / MSE(y, 0))``: 0 for an all-zero
    prediction and 10 for a perfect one.
    """
    baseline_mse = mean_squared_error(y, np.zeros(y.shape))
    model_mse = mean_squared_error(y, y_pred)
    ratio = model_mse / baseline_mse
    return 10 * (1 - ratio)

# Sanity check: an all-zero prediction scores 0, a perfect prediction scores 10.
print(get_score(y_train, np.zeros(y_train.shape)))
print(get_score(y_train, y_train))

0.0
10.0


In [109]:
# Append the classifier probabilities to the engineered features.
X_train_ = pd.concat([X_train, X_train_prob], axis=1)
X_test_ = pd.concat([X_test, X_test_prob], axis=1)
print(X_train_.shape, X_test_.shape)

(3162, 1631) (1558, 1631)


## RandomForest

In [110]:
model = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
mo_model = MultiOutputRegressor(model)
%time mo_model.fit(X_train_, y_train)
y_test_pred = mo_model.predict(X_test_)
y_test_pred = pd.DataFrame(y_test_pred, index=data_test.index, columns=y_cols)

CPU times: user 3h 50min 51s, sys: 8.45 s, total: 3h 50min 59s
Wall time: 7min 11s


In [111]:
# Score on the hold-out part produced by train_test_split.
print(get_score(y_test, y_test_pred))

3.18022683571


In [60]:
# y_test_pred.to_csv('res/out_3_1.csv')

In [61]:
# Smallest non-zero value observed for each target (zeros are masked to NaN
# and therefore skipped by min).
y_train_min = y_train.where(y_train != 0).min()
y_train_min

Final Avg         0.050000
Final Exam Avg    0.062500
HW Avg            0.020833
TC Avg            0.003922
ig Avg            0.189076
metrology Avg     0.200000
ng Avg            0.006494
sootv Avg         0.012500
stand Avg         0.040000
termodin Avg      0.010256
БП Avg            0.037037
ДЗ Avg            0.007937
ДР Avg            0.011204
Дз1 Avg           0.004386
Дз2 Avg           1.000000
З Avg             0.040816
ИЗ Avg            0.022727
ИТ Avg            0.035000
КЗ Avg            0.104167
ПЗ Avg            0.041667
Практ Avg         0.002299
РТ Avg            0.023810
Т Avg             0.001000
ТЗ Avg            0.008772
Тест Avg          0.001923
УЗ Avg            0.003922
dtype: float64

In [62]:
# Value distribution of the 'Дз2 Avg' target in the training targets.
y_train['Дз2 Avg'].value_counts()

0.0    4606
1.0     114
Name: Дз2 Avg, dtype: int64

In [63]:
# Zero out predictions that are non-zero but below the smallest non-zero value
# seen for that target in training.
below_min = (y_test_pred != 0) & (y_test_pred < y_train_min)
y_test_pred_2 = y_test_pred.mask(below_min, 0)
y_test_pred_2

Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.019303,0.000000,0.000000,0.000000,...,0.025102,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.066837
1,0.108290,0.000000,0.000000,0.075846,0.000000,0.252967,0.036273,0.152110,0.185372,0.000000,...,0.000000,0.122010,0.000000,0.000000,0.000000,0.0,0.061822,0.000000,0.000000,0.000000
2,0.069022,0.000000,0.028981,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.063091,0.000000,0.000000,0.000000,0.0,0.001176,0.078380,0.004649,0.075838
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.415972,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.444109,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.069648,0.000000,0.000000,0.000000,0.062290,0.067812,0.000000,...,0.000000,0.224211,0.000000,0.000000,0.000000,0.0,0.040820,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.276514,0.000000,0.000000,0.000000,0.0,0.030514,0.000000,0.000000,0.000000
8,0.570069,0.000000,0.000000,0.000000,0.000000,0.671746,0.000000,0.625965,0.571103,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.026991,0.0,0.000000,0.048010,0.001987,0.080949
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.128013,0.000000,0.0,0.130894,0.000000,0.136741,0.006716


In [64]:
# Save the thresholded predictions from the cell above.
y_test_pred_2.to_csv('res/out_3_2.csv')

## LinearRegression

In [33]:
# One ordinary least-squares model per target column, fitted on the same
# features + probabilities matrix.
model = LinearRegression(n_jobs=-1)
mo_model = MultiOutputRegressor(model)
%time mo_model.fit(X_train_, y_train)
y_test_pred = mo_model.predict(X_test_)
# predict returns a plain array; restore the row index and target names.
y_test_pred = pd.DataFrame(y_test_pred, index=data_test.index, columns=y_cols)

CPU times: user 1min 31s, sys: 6.06 s, total: 1min 37s
Wall time: 5.7 s


In [34]:
# Save the raw (unthresholded) linear-regression predictions.
y_test_pred.to_csv('res/out_3_3.csv')

In [35]:
# Zero out predictions that are non-zero but below the smallest non-zero value
# seen for that target in training.
below_min = (y_test_pred != 0) & (y_test_pred < y_train_min)
y_test_pred_2 = y_test_pred.mask(below_min, 0)
y_test_pred_2.head()

Unnamed: 0_level_0,Final Avg,Final Exam Avg,HW Avg,TC Avg,ig Avg,metrology Avg,ng Avg,sootv Avg,stand Avg,termodin Avg,...,ИЗ Avg,ИТ Avg,КЗ Avg,ПЗ Avg,Практ Avg,РТ Avg,Т Avg,ТЗ Avg,Тест Avg,УЗ Avg
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06319,0.0,0.0,0.0,0.0,0.0,0.004105,0.0,0.015876,0.049518
1,0.11982,0.0,0.0,0.128869,0.0,0.250261,0.021674,0.218428,0.210617,0.0,...,0.047714,0.180198,0.0,0.0,0.057734,0.0,0.128518,0.0,0.006252,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.015101,0.0,0.0,0.0,...,0.0,0.104387,0.0,0.0,0.0,0.0,0.011081,0.0161,0.0,0.012386
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.073902,0.0,0.0,0.0,0.0,0.398731,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02523,0.0,0.008918,0.0


In [36]:
# Save the thresholded predictions from the cell above.
y_test_pred_2.to_csv('res/out_3_4.csv')

## XGBoost

In [None]:
model = xgb.XGBClassifier(n_estimators=1000)
mo_model = MultiOutputRegressor(model)
%time mo_model.fit(X_train_, y_train)
y_test_pred = mo_model.predict(X_test_)
y_test_pred = pd.DataFrame(y_test_pred, index=data_test.index, columns=y_cols)
get_score(y_test, y_test_pred)