In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
from sklearn.datasets import make_multilabel_classification 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import preprocessing

import math
import tensorflow as tf
from tensorflow.python.data import Dataset

In [2]:
pd.options.display.max_rows = 20
pd.options.display.float_format = '{:.6f}'.format

In [3]:
#Загрузка данных

problem_train_features = pd.read_csv("D:/ds_problem/problem_train.csv", na_values="?", low_memory=False )
problem_train_labels = pd.read_csv("D:/ds_problem/problem_labels.csv", na_values="?", low_memory=False )
problem_test_features = pd.read_csv("D:/ds_problem/problem_test.csv", na_values="?", low_memory=False )


#Объединение тренировочной и тестовой выборки для обработки данных
problem_full_features = pd.concat([problem_train_features,problem_test_features],axis=0)
problem_full_features = problem_full_features.drop_duplicates(subset='id')

In [4]:
#Функция предварительной обработки данных. Сокращает размерность пространства признаков до 25 путем отбрасывания
# признаков с неопределнными значениями (т.к нет информации, что конкретно означают признаки и какие значения
# можно было бы подставить вместо NaN, было принято такое решение) и признаков, имеющих константное значение на всей выборке.
def process_features(features):
    
    #удаляем все series имеющие хотя бы 1 NaN значение признака
    features = features.dropna(axis=1, how='any')
    
    #разбиваем dataframe на два, в одном - столбцы типа object, в другом - числовые значения 
    subframe_type_object = features.select_dtypes(include=['object']).copy()
    subframe_type_num = features.select_dtypes(exclude=['object']).copy()
    
    #для всех всех столбцов с буквами применяем one hot encoding
    subframe_type_object_encoded = pd.DataFrame()
    for serie in subframe_type_object.columns:
            one_hot_columns = pd.get_dummies(subframe_type_object[serie], prefix=serie)
            subframe_type_object_encoded=pd.concat([subframe_type_object_encoded, one_hot_columns], axis=1)
   
    #из dataframe с числами удаляем столбцы с постоянным значением (дисперсия==0.) и столбец id  
    subframe_type_num = subframe_type_num.drop(axis=1,columns=['id'])
    subframe_type_num = subframe_type_num.loc[:, subframe_type_num.var() != 0.0]
   
    #масштабируем численные значения на отрезок [0,1] методом min-max scaling
    subframe_type_num_scaled = pd.DataFrame()
    subframe_type_num_scaled=(subframe_type_num-subframe_type_num.min())/(subframe_type_num.max()-subframe_type_num.min())
    
    #объединяем датафреймы
    processed_features = pd.concat([subframe_type_num_scaled,subframe_type_object_encoded], axis=1)
    
    #снова разбиваем на train и test выборки
    processed_train_features = processed_features.head(8000)
    processed_test_features = processed_features.tail(2000)
    return  processed_train_features, processed_test_features


In [5]:
#Предварительная обработка датафреймов с признаками и метками
training_features, test_features = process_features(problem_full_features)
#Столбец id не нужен
training_labels = problem_train_labels.drop(axis=1,columns=['id'])

In [6]:
#извлечение данных из dataframe в numpy array
x_train = training_features.values
y_train = training_labels.values
x_test = test_features.values

In [7]:
# Определение модели - логистическая регрессия
def model(X, W, B): 

    return tf.nn.sigmoid(tf.matmul(X, W)+ B) 


In [8]:
#Инициализация весов - случаные значения из нормального распределения
def init_weights(shape): 
    return tf.Variable(tf.random_normal(shape, stddev=0.01)) 

In [23]:
#learning rate, кол-во шагов обучения (training_epochs),
learning_rate = 0.05 
training_epochs = 2500
batch_size = 800

#кол-во входов и выходов(классов),суммарное количество батчей
num_input = x_train.shape[1] 
num_classes = y_train.shape[1] 
num_batches = int(x_train.shape[0]/batch_size) 

#Объявление модели
x = tf.placeholder("float", [None, num_input]) 
y = tf.placeholder("float", [None, num_classes]) 
b = tf.Variable(tf.zeros([num_classes]))
w = init_weights([num_input, num_classes]) 
predictions = model(x, w, b) 

#Функция ошибки - log loss. Reduction.NONE - рассчитывается отдельно для каждого класса
loss = tf.losses.log_loss(labels=y, predictions=predictions, epsilon=0.00000001, reduction=tf.losses.Reduction.NONE)

#Для оптимизации считается сумма значений по каждому классу для текущего батча,
# а затем берется среднее арифметическое по классам
loss_sum = tf.reduce_mean(tf.reduce_sum(loss,axis=0))
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(loss_sum) 

init = tf.global_variables_initializer() 

In [24]:
with tf.Session() as sess: 
    sess.run(init) 
    sess.run(tf.local_variables_initializer()) 
    
    for epoch in range(training_epochs): 
        #случайная выборка тренировочных примеров в текущий батч
        indices = np.random.choice(num_input, batch_size)
        x_batch, y_batch = x_train[indices], y_train[indices]   
        _, log_loss, l_sum = sess.run([optimizer, loss, loss_sum], 
                              feed_dict = {x : x_batch, y : y_batch})
        
        print("Log loss for %d epoch %.3f" %(epoch,l_sum)) #Вывод текущей ошибки
    print("\n")
    res_loss = np.sum(log_loss,axis=0)
    for cl, l_loss in enumerate(res_loss): #вывод ошибок по каждому классу по итогам обучения на последнем батче
        print("Log loss for %d class = %.3f" %(cl,l_loss))
     
    print("FINAL MEAN LOSS = %.3f (batch size = %d)" %(l_sum,batch_size)) #итоговая log_loss (средняя по всем классам по итогам обучения на последнем батче)

    sess.run(tf.local_variables_initializer())

    #Вероятности принадлежности классам для примеров из тестовой выборки
    probabilities = sess.run(predictions, feed_dict = {x : x_test})



Log loss for 0 epoch 552.950
Log loss for 1 epoch 886.447
Log loss for 2 epoch 980.002
Log loss for 3 epoch 1122.587
Log loss for 4 epoch 781.190
Log loss for 5 epoch 723.044
Log loss for 6 epoch 811.318
Log loss for 7 epoch 471.853
Log loss for 8 epoch 678.991
Log loss for 9 epoch 957.260
Log loss for 10 epoch 326.432
Log loss for 11 epoch 440.480
Log loss for 12 epoch 521.758
Log loss for 13 epoch 650.719
Log loss for 14 epoch 560.877
Log loss for 15 epoch 289.728
Log loss for 16 epoch 356.288
Log loss for 17 epoch 356.072
Log loss for 18 epoch 448.048
Log loss for 19 epoch 388.545
Log loss for 20 epoch 288.845
Log loss for 21 epoch 374.620
Log loss for 22 epoch 359.808
Log loss for 23 epoch 321.973
Log loss for 24 epoch 366.022
Log loss for 25 epoch 440.349
Log loss for 26 epoch 345.196
Log loss for 27 epoch 271.410
Log loss for 28 epoch 419.612
Log loss for 29 epoch 252.811
Log loss for 30 epoch 316.594
Log loss for 31 epoch 257.758
Log loss for 32 epoch 256.378
Log loss for 33 epo

Log loss for 378 epoch 63.711
Log loss for 379 epoch 74.441
Log loss for 380 epoch 74.069
Log loss for 381 epoch 85.096
Log loss for 382 epoch 80.422
Log loss for 383 epoch 79.629
Log loss for 384 epoch 66.797
Log loss for 385 epoch 68.351
Log loss for 386 epoch 63.217
Log loss for 387 epoch 66.297
Log loss for 388 epoch 60.252
Log loss for 389 epoch 59.669
Log loss for 390 epoch 54.748
Log loss for 391 epoch 55.794
Log loss for 392 epoch 51.751
Log loss for 393 epoch 55.881
Log loss for 394 epoch 57.521
Log loss for 395 epoch 60.166
Log loss for 396 epoch 55.849
Log loss for 397 epoch 57.117
Log loss for 398 epoch 55.343
Log loss for 399 epoch 57.627
Log loss for 400 epoch 57.840
Log loss for 401 epoch 62.946
Log loss for 402 epoch 61.913
Log loss for 403 epoch 69.067
Log loss for 404 epoch 62.172
Log loss for 405 epoch 64.458
Log loss for 406 epoch 60.024
Log loss for 407 epoch 61.529
Log loss for 408 epoch 57.465
Log loss for 409 epoch 57.373
Log loss for 410 epoch 51.711
Log loss f

Log loss for 722 epoch 39.310
Log loss for 723 epoch 38.529
Log loss for 724 epoch 37.814
Log loss for 725 epoch 37.218
Log loss for 726 epoch 36.977
Log loss for 727 epoch 38.858
Log loss for 728 epoch 37.808
Log loss for 729 epoch 39.981
Log loss for 730 epoch 39.358
Log loss for 731 epoch 41.035
Log loss for 732 epoch 41.202
Log loss for 733 epoch 41.372
Log loss for 734 epoch 40.383
Log loss for 735 epoch 41.439
Log loss for 736 epoch 39.036
Log loss for 737 epoch 38.254
Log loss for 738 epoch 37.018
Log loss for 739 epoch 36.903
Log loss for 740 epoch 38.935
Log loss for 741 epoch 38.770
Log loss for 742 epoch 36.984
Log loss for 743 epoch 36.193
Log loss for 744 epoch 37.155
Log loss for 745 epoch 34.619
Log loss for 746 epoch 39.973
Log loss for 747 epoch 38.357
Log loss for 748 epoch 39.688
Log loss for 749 epoch 40.374
Log loss for 750 epoch 40.480
Log loss for 751 epoch 41.671
Log loss for 752 epoch 39.949
Log loss for 753 epoch 38.571
Log loss for 754 epoch 40.582
Log loss f

Log loss for 1069 epoch 31.123
Log loss for 1070 epoch 32.022
Log loss for 1071 epoch 31.509
Log loss for 1072 epoch 32.064
Log loss for 1073 epoch 32.293
Log loss for 1074 epoch 31.640
Log loss for 1075 epoch 34.258
Log loss for 1076 epoch 32.274
Log loss for 1077 epoch 34.744
Log loss for 1078 epoch 34.362
Log loss for 1079 epoch 32.659
Log loss for 1080 epoch 32.454
Log loss for 1081 epoch 30.079
Log loss for 1082 epoch 30.331
Log loss for 1083 epoch 30.385
Log loss for 1084 epoch 30.385
Log loss for 1085 epoch 29.531
Log loss for 1086 epoch 32.035
Log loss for 1087 epoch 30.715
Log loss for 1088 epoch 30.443
Log loss for 1089 epoch 31.294
Log loss for 1090 epoch 30.916
Log loss for 1091 epoch 30.594
Log loss for 1092 epoch 30.086
Log loss for 1093 epoch 31.414
Log loss for 1094 epoch 31.558
Log loss for 1095 epoch 31.280
Log loss for 1096 epoch 32.243
Log loss for 1097 epoch 31.586
Log loss for 1098 epoch 31.826
Log loss for 1099 epoch 30.800
Log loss for 1100 epoch 30.326
Log loss

Log loss for 1453 epoch 25.550
Log loss for 1454 epoch 25.622
Log loss for 1455 epoch 25.705
Log loss for 1456 epoch 24.837
Log loss for 1457 epoch 24.731
Log loss for 1458 epoch 25.005
Log loss for 1459 epoch 24.757
Log loss for 1460 epoch 25.718
Log loss for 1461 epoch 25.377
Log loss for 1462 epoch 24.631
Log loss for 1463 epoch 25.547
Log loss for 1464 epoch 25.234
Log loss for 1465 epoch 24.382
Log loss for 1466 epoch 24.120
Log loss for 1467 epoch 25.418
Log loss for 1468 epoch 25.085
Log loss for 1469 epoch 25.669
Log loss for 1470 epoch 24.576
Log loss for 1471 epoch 23.275
Log loss for 1472 epoch 27.252
Log loss for 1473 epoch 26.854
Log loss for 1474 epoch 27.119
Log loss for 1475 epoch 26.283
Log loss for 1476 epoch 25.846
Log loss for 1477 epoch 24.926
Log loss for 1478 epoch 26.590
Log loss for 1479 epoch 25.765
Log loss for 1480 epoch 25.510
Log loss for 1481 epoch 24.016
Log loss for 1482 epoch 25.928
Log loss for 1483 epoch 26.692
Log loss for 1484 epoch 25.728
Log loss

Log loss for 1844 epoch 22.199
Log loss for 1845 epoch 21.155
Log loss for 1846 epoch 21.191
Log loss for 1847 epoch 20.506
Log loss for 1848 epoch 22.866
Log loss for 1849 epoch 21.886
Log loss for 1850 epoch 22.042
Log loss for 1851 epoch 21.600
Log loss for 1852 epoch 22.238
Log loss for 1853 epoch 20.380
Log loss for 1854 epoch 21.647
Log loss for 1855 epoch 21.690
Log loss for 1856 epoch 19.590
Log loss for 1857 epoch 21.808
Log loss for 1858 epoch 20.773
Log loss for 1859 epoch 21.502
Log loss for 1860 epoch 21.806
Log loss for 1861 epoch 21.641
Log loss for 1862 epoch 20.693
Log loss for 1863 epoch 22.029
Log loss for 1864 epoch 21.824
Log loss for 1865 epoch 20.273
Log loss for 1866 epoch 21.261
Log loss for 1867 epoch 21.471
Log loss for 1868 epoch 20.482
Log loss for 1869 epoch 21.023
Log loss for 1870 epoch 20.569
Log loss for 1871 epoch 21.571
Log loss for 1872 epoch 21.067
Log loss for 1873 epoch 21.731
Log loss for 1874 epoch 21.458
Log loss for 1875 epoch 21.008
Log loss

Log loss for 2229 epoch 18.639
Log loss for 2230 epoch 18.945
Log loss for 2231 epoch 18.769
Log loss for 2232 epoch 18.236
Log loss for 2233 epoch 18.172
Log loss for 2234 epoch 19.349
Log loss for 2235 epoch 18.354
Log loss for 2236 epoch 19.068
Log loss for 2237 epoch 19.374
Log loss for 2238 epoch 18.119
Log loss for 2239 epoch 18.789
Log loss for 2240 epoch 20.023
Log loss for 2241 epoch 19.438
Log loss for 2242 epoch 18.811
Log loss for 2243 epoch 18.700
Log loss for 2244 epoch 18.432
Log loss for 2245 epoch 19.956
Log loss for 2246 epoch 18.965
Log loss for 2247 epoch 18.623
Log loss for 2248 epoch 18.935
Log loss for 2249 epoch 18.956
Log loss for 2250 epoch 19.424
Log loss for 2251 epoch 18.648
Log loss for 2252 epoch 18.476
Log loss for 2253 epoch 20.218
Log loss for 2254 epoch 18.815
Log loss for 2255 epoch 18.763
Log loss for 2256 epoch 18.137
Log loss for 2257 epoch 17.905
Log loss for 2258 epoch 18.720
Log loss for 2259 epoch 20.177
Log loss for 2260 epoch 17.162
Log loss

In [25]:
#Формирование Dataframe и запись в файл
results = pd.DataFrame()
results['id'] = problem_test_features['id']
probs_df=pd.DataFrame(probabilities, columns=['class_%d' %index for index in range(14)])
results = pd.concat([results, probs_df] , axis=1)
results.to_csv('D:/ds_problem/problem_test_labels.csv', sep=',')

In [26]:
results.head(10)

Unnamed: 0,id,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,class_10,class_11,class_12,class_13
0,7957,0.999992,0.999908,0.0,0.0,5e-06,0.0,0.0,0.998788,0.0,1.0,1.0,0.0,0.0,0.461383
1,693,0.0,0.0,1.2e-05,0.022749,8.9e-05,0.0,0.0,0.0,0.0,0.999882,0.922921,0.987997,1.6e-05,0.000112
2,15558,0.0,1.0,0.999999,0.003548,0.0,0.037688,0.067361,2e-05,0.0,0.999996,0.455242,0.503187,0.012027,0.0
3,15614,1e-06,6.5e-05,0.008798,0.000247,1.3e-05,8e-06,0.021381,0.0,0.0,1.0,0.999869,2e-06,4e-06,5e-06
4,8149,0.000281,0.007505,0.004781,0.0,0.0,0.002586,0.005858,0.999983,0.072306,0.999999,1.0,0.0,1e-06,0.0
5,711,0.0,1e-06,2e-06,1e-06,0.0,0.0,0.0,5e-06,0.0,0.001196,0.999665,3e-06,8e-06,0.0
6,12177,0.999671,4e-06,4.4e-05,1.1e-05,0.0,0.0,0.0,0.255162,0.0,0.999966,0.993994,1e-06,0.0,0.016422
7,17235,0.991028,0.344699,0.009976,0.0,0.0,3e-06,0.006943,0.158547,0.322424,0.999991,0.999999,0.0,0.0,0.199957
8,17993,0.988291,0.0,0.0,5.6e-05,0.0,0.093207,0.003557,0.0,0.0,0.80336,0.997376,0.052577,5e-06,0.047235
9,2164,0.985764,0.993728,1.0,0.0,0.0,6.8e-05,0.001484,0.028284,2.7e-05,1.0,0.999696,1e-06,1e-06,6e-06
