In [2]:
import pandas as pd
import numpy as np
import scipy as sc
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

Выгрузка данных

In [3]:
data = pd.read_csv('data/orange_small_churn_data.txt')
label = pd.read_csv('data/orange_small_churn_labels.txt', header=None,names=['label'])

Выделение вешественных и категориальных признаков

In [10]:
numeric_label = data.columns[:190]
categorical_label = data.columns[190:]

In [11]:
data[numeric_label].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 190 entries, Var1 to Var190
dtypes: float64(189), int64(1)
memory usage: 58.0 MB


In [12]:
numeric_data = data[numeric_label]
cat_data = data[categorical_label]

Отбор признаков у которых ненулевых значений больше половины 

In [138]:
def without_null(data, n =25000):
    correct_name = []
    for name in data.columns:
        if np.sum(~data[name].isnull()) >= n:
            correct_name.append(name)
    return correct_name

In [93]:
correct_name = without_null(numeric_data, n =25000)
len(correct_name)

39

In [96]:
cat_name = without_null(cat_data, n =25000)
len(cat_name)

28

Замена пропусков на среднее значенние у вещественных признаков 

In [19]:
correct_data = data[correct_name]
mean_data = correct_data.mean(axis='index')

for i, name in enumerate(correct_name):
    correct_data[name].fillna(mean_data[i], inplace=True)

LabelEncoding для категориальных признаков



In [20]:
cat_data = data[cat_name]
transform_cat_data = cat_data.copy()
for name in cat_data.columns:
    transform_cat_data[name] = LabelEncoder().fit_transform(cat_data[name])


Балансиврока данных

In [21]:
label.value_counts()

label
-1       37024
 1        2976
dtype: int64

In [22]:
index_b = label[label['label'] == 1].index

In [23]:
k = len(label[label['label'] == -1]) - len(label[label['label'] == 1])
k

34048

In [24]:
np.random.seed(42)
indices_to_add = np.random.choice(index_b, k)

In [25]:
indices_to_add

array([11241, 17017, 14879, ..., 37101, 20973, 25393], dtype=int64)

In [26]:
total_data = correct_data.merge(transform_cat_data, left_index=True, right_index=True)

Создание фрейма, который надо добавить по методике oversampling

In [27]:
data_add = total_data.iloc[indices_to_add, :]
label_add = label.iloc[indices_to_add, :]

In [28]:
total_data = pd.concat([total_data, data_add])
total_label = pd.concat([label, label_add])

In [29]:
total_data.shape

(74048, 67)

Разделение на обучение и тест

In [30]:
train_data, test_data, train_target, test_target = train_test_split(total_data, total_label, random_state=42, test_size=0.3) 

In [31]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.3)

In [32]:
cv_res = cross_val_score(LogisticRegression(), train_data, train_target, cv=cv_spliter, scoring='f1')

In [33]:
cv_res

array([0.5334575 , 0.60050598, 0.57474601, 0.56309078, 0.56746182])

Обучение на всех данных и проверка на отложенной выборке

In [34]:
model = LogisticRegression().fit(train_data, train_target)

Оценка качества

In [35]:
predict_train = model.predict(train_data)
predict_test = model.predict(test_data)

In [36]:
print(f"f1_score by train data: {f1_score(predict_train, train_target)}")
print(f"f1_score by test data: {f1_score(predict_test, test_target)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train, train_target)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test, test_target)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train, train_target)}")
print(f"roc_score by test data: {roc_auc_score(predict_test, test_target)}")
print()

f1_score by train data: 0.5637819290888296
f1_score by test data: 0.562474889513861

accuracy_score by train data: 0.5585051993903498
accuracy_score by test data: 0.5588116137744767

roc_score by train data: 0.5585290849160585
roc_score by test data: 0.5588468928980397



Base line  получен качество составило примерно 0.55

Baseline:

1.Отбор признаков по пропускам

2.преобразование кат признаков labelencoder

3.балансировка по oversampling

4.logostocregression 

Upgrade model

Преобразуем данные вещественные оставим также, а категориальные изменим.

Выведим статистику по кол уникальных значений кат признаков

In [37]:
cat_data = data[cat_name]
for name in cat_data.columns:
    print(name, len(cat_data[name].unique()))
     

Var192 355
Var193 50
Var195 23
Var196 4
Var197 221
Var198 3891
Var199 4401
Var202 5543
Var203 6
Var204 100
Var205 4
Var206 22
Var207 14
Var208 3
Var210 6
Var211 2
Var212 78
Var216 1819
Var217 12471
Var218 3
Var219 23
Var220 3891
Var221 7
Var222 3891
Var223 5
Var226 23
Var227 7
Var228 30


ПРизнаки с 2 значаниями закодируем как бинарные

Признаки со значениями меньше 5 закодируем one_hot

Признаки со значениями больше 5 закодируем hash-trick векторами размера 5

In [38]:
counts_unique_value = np.array([len(cat_data[name].unique()) for name in cat_data.columns])
counts_unique_value

array([  355,    50,    23,     4,   221,  3891,  4401,  5543,     6,
         100,     4,    22,    14,     3,     6,     2,    78,  1819,
       12471,     3,    23,  3891,     7,  3891,     5,    23,     7,
          30])

In [39]:
binar_column = cat_data.columns[counts_unique_value <=2]
ohe_column = cat_data.columns[(counts_unique_value > 2)&(counts_unique_value <= 5)]
hash_column = cat_data.columns[counts_unique_value > 5]

Бинарные данные

In [40]:
binar_data = cat_data[binar_column]

for name in binar_data.columns:
  binar_data[name] = LabelEncoder().fit_transform(binar_data[name])

In [41]:
binar_data

Unnamed: 0,Var211
0,0
1,0
2,0
3,0
4,0
...,...
39995,0
39996,0
39997,0
39998,0


Горячее кодирование

In [42]:
ohe_data = OneHotEncoder(sparse=False).fit_transform(cat_data[ohe_column])

In [43]:
ohe_data

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

Hash trick

In [44]:
hashes = np.array([hash(value) % 5 for value in cat_data[hash_column[0]].unique()])

In [45]:
hashes

array([4, 4, 3, 2, 1, 3, 0, 0, 0, 3, 2, 4, 0, 3, 2, 4, 2, 3, 2, 0, 2, 3,
       4, 3, 1, 1, 2, 4, 4, 2, 1, 3, 0, 1, 2, 1, 0, 4, 0, 3, 2, 1, 3, 0,
       3, 0, 0, 2, 1, 1, 4, 0, 3, 2, 2, 0, 3, 4, 2, 2, 2, 2, 1, 0, 4, 2,
       0, 4, 0, 3, 2, 2, 3, 3, 1, 4, 2, 3, 3, 3, 2, 0, 1, 0, 1, 1, 0, 3,
       4, 4, 2, 4, 1, 2, 4, 0, 3, 1, 3, 1, 0, 2, 3, 2, 1, 4, 2, 1, 3, 0,
       1, 0, 4, 4, 3, 0, 0, 2, 2, 4, 1, 2, 4, 0, 4, 2, 1, 0, 3, 3, 3, 1,
       0, 0, 4, 0, 1, 3, 3, 3, 0, 4, 2, 2, 4, 1, 1, 0, 0, 3, 3, 1, 3, 4,
       3, 4, 4, 2, 3, 3, 0, 0, 0, 0, 3, 4, 4, 2, 3, 2, 0, 3, 0, 4, 0, 2,
       3, 3, 0, 2, 4, 1, 0, 3, 1, 1, 2, 0, 1, 1, 0, 1, 3, 0, 4, 0, 3, 0,
       3, 0, 0, 2, 2, 2, 1, 0, 0, 2, 3, 3, 2, 3, 0, 1, 4, 2, 2, 1, 1, 0,
       3, 2, 4, 0, 1, 4, 0, 0, 4, 2, 0, 0, 2, 0, 1, 4, 2, 3, 4, 2, 1, 2,
       0, 4, 3, 1, 4, 4, 2, 3, 2, 0, 2, 3, 4, 4, 0, 0, 0, 0, 0, 1, 4, 1,
       1, 4, 3, 2, 4, 4, 4, 3, 0, 4, 2, 1, 1, 3, 3, 0, 4, 1, 2, 3, 3, 2,
       4, 1, 2, 4, 4, 2, 4, 2, 2, 4, 0, 3, 1, 4, 2,

In [46]:
def vector_hash(data, n_vector, name):
  data_hash = []
  for value in data:
    vector = [0] * n_vector
    change_value = name + value if type(value) != float else name
    value_hash = hash(change_value) % n_vector
    vector[value_hash] = 1
    data_hash.append(vector)
  return np.array(data_hash)

In [47]:
test = vector_hash(cat_data[hash_column[0]], 5, hash_column[0])
test

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]])

In [48]:
hash_data = []
for name in hash_column:
  vector_feat = vector_hash(cat_data[name], 5, name)
  hash_data.append(vector_feat)
hash_data = np.hstack(hash_data)

In [49]:
hash_data.shape

(40000, 110)

Объеденение  ктаегориальных признаков

In [50]:
cat_data_up = np.hstack((binar_data, ohe_data, hash_data))
cat_data_up = pd.DataFrame(cat_data_up)


Объеденение с вещественными признаками

In [51]:
total_data_up = correct_data.merge(cat_data_up, left_index=True, right_index=True)

Балансировка

In [52]:
data_add_up = total_data_up.iloc[indices_to_add, :]
label_add_up = label.iloc[indices_to_add, :]

In [53]:
total_data_up = pd.concat([total_data_up, data_add_up])
total_label_up = pd.concat([label, label_add_up])

In [54]:
total_data_up.shape

(74048, 169)

Обучение модели

In [55]:
train_data_up, test_data_up, train_target_up, test_target_up = train_test_split(total_data_up, total_label_up, random_state=42, test_size=0.3) 

In [56]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.3)

In [57]:
cv_res = cross_val_score(LogisticRegression(), train_data_up, train_target_up, cv=cv_spliter, scoring='f1')

In [58]:
cv_res

array([0.56184942, 0.54716733, 0.52826551, 0.55334037, 0.55381441])

Логистическая регрессия отработала не лучше на новых признаках, попробуем xgboost c новыми признаками

In [59]:
import xgboost as xgb

In [60]:
cv_res = cross_val_score(xgb.XGBClassifier(n_estimators=100, max_depth=5), train_data_up, train_target_up, cv=cv_spliter, scoring='f1')



In [61]:
print(cv_res)
print(cv_res.mean())

[0.8570554  0.86100457 0.85853778 0.85729954 0.85315883]
0.8574112220001586


Обучение на всех данных и предсказание на тесте

In [62]:
model_up = xgb.XGBClassifier(n_estimators=100, max_depth=5).fit(train_data_up, train_target_up)



In [63]:
predict_train_up = model_up.predict(train_data_up)
predict_test_up = model_up.predict(test_data_up)

In [64]:
print(f"f1_score by train data: {f1_score(predict_train_up, train_target_up)}")
print(f"f1_score by test data: {f1_score(predict_test_up, test_target_up)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train_up, train_target_up)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test_up, test_target_up)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train_up, train_target_up)}")
print(f"roc_score by test data: {roc_auc_score(predict_test_up, test_target_up)}")
print()

f1_score by train data: 0.8936004596252571
f1_score by test data: 0.8643565631843358

accuracy_score by train data: 0.8892404452761754
accuracy_score by test data: 0.8577987846049966

roc_score by train data: 0.8917973941657298
roc_score by test data: 0.8613766849725412



Попробуем xgboost на простом преобразование категориальных данных

In [65]:
cv_res_old = cross_val_score(xgb.XGBClassifier(n_estimators=100, max_depth=5), train_data, train_target, cv=cv_spliter, 
                             scoring='f1')



In [66]:
print(cv_res_old)
print(cv_res_old.mean())

[0.86453577 0.87171033 0.8691623  0.86764082 0.86576279]
0.8677624006747061


Использование более сложной обработки категориальных признаков не дает дополнительного прироста качества, оставим простой labelencoding

In [67]:
model_old_up =  xgb.XGBClassifier(n_estimators=100, max_depth=5).fit(train_data, train_target)



In [68]:
predict_train = model_old_up.predict(train_data)
predict_test = model_old_up.predict(test_data)

In [69]:
print(f"f1_score by train data: {f1_score(predict_train, train_target)}")
print(f"f1_score by test data: {f1_score(predict_test, test_target)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train, train_target)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test, test_target)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train, train_target)}")
print(f"roc_score by test data: {roc_auc_score(predict_test, test_target)}")
print()

f1_score by train data: 0.8958642158316039
f1_score by test data: 0.8696697728346288

accuracy_score by train data: 0.8915748654332182
accuracy_score by test data: 0.8633805986945757

roc_score by train data: 0.8941746079875249
roc_score by test data: 0.8669997600242892



Прогноз вероятности

In [70]:
predict_train_proba = model_old_up.predict_proba(train_data)
predict_test_proba = model_old_up.predict_proba(test_data)
predict_train_proba # 2 столбец

array([[0.66767305, 0.33232695],
       [0.28389835, 0.71610165],
       [0.8012305 , 0.19876952],
       ...,
       [0.14735246, 0.85264754],
       [0.9307357 , 0.06926431],
       [0.9680642 , 0.03193581]], dtype=float32)

Обучение и прогноз на даных из соревнования

Выгрузка данных соревнований

In [156]:
train_data = pd.read_csv('data/orange_small_churn_train_data.csv')
test_data = pd.read_csv('data/orange_small_churn_test_data.csv')

In [157]:
train_data.drop(labels='ID', inplace=True, axis='columns')
train_data.dropna(subset=['labels'], inplace=True)

In [158]:
train_labels = train_data['labels']
train_data = train_data.iloc[:,:-1]

In [159]:
numeric_label = train_data.columns[:190]
categorical_label = train_data.columns[190:]

In [160]:
numeric_data = train_data[numeric_label]
cat_data = train_data[categorical_label]

Отбор не нулевых признаков

In [161]:
correct_name_num = without_null(numeric_data, n=15000)
len(correct_name_num)

38

In [162]:
correct_name_cat = without_null(cat_data, n=15000)
len(correct_name_cat)

28

Замена пропусков в тестовой выборке

Вещественные признаки

In [163]:
correct_data_num = train_data[correct_name_num]
mean_data = correct_data_num.mean(axis='index')

for i, name in enumerate(correct_name_num):
    correct_data_num[name].fillna(mean_data[i], inplace=True)

Категорилальные признаки

In [164]:
cat_data_train = train_data[correct_name_cat]
transform_cat_data_train = cat_data_train.copy()
for name in cat_data_train.columns:
    transform_cat_data_train[name] = LabelEncoder().fit_transform(cat_data_train[name])

In [165]:
total_data_train = correct_data_num.merge(transform_cat_data_train, left_index=True, right_index=True)

In [166]:
total_data_train.head()

Unnamed: 0,Var6,Var7,Var13,Var21,Var22,Var24,Var25,Var28,Var35,Var38,...,Var217,Var218,Var219,Var220,Var221,Var222,Var223,Var226,Var227,Var228
0,3052.0,6.791141,1245.570151,480.0,600.0,20.0,480.0,200.0,0.0,82752.0,...,2636,1,10,1977,0,2579,0,14,0,28
1,1813.0,7.0,636.0,212.0,265.0,2.0,128.0,166.56,0.0,2706120.0,...,6740,1,10,1984,4,256,0,22,2,2
2,1953.0,7.0,448.0,176.0,220.0,0.0,72.0,311.76,0.0,4698780.0,...,6737,0,10,228,6,1758,0,7,3,24
3,1533.0,7.0,4.0,332.0,415.0,0.0,144.0,220.08,5.0,864384.0,...,2456,1,10,2679,4,1832,0,22,2,7
4,686.0,7.0,0.0,160.0,200.0,2.0,48.0,278.0,0.0,4364880.0,...,11,1,10,1154,4,965,0,12,2,7


Проверка на баланс классов

In [167]:
train_labels.value_counts()

-1.0    16921
 1.0     1377
Name: labels, dtype: int64

Балансировка

In [174]:
np.random.seed(42)

n_over = len(train_labels[train_labels == -1]) - len(train_labels[train_labels == 1])
add_data_index = np.random.choice(train_labels[train_labels==1].index, n_over)
add_data = total_data_train.iloc[add_data_index]
add_labels = train_labels.iloc[add_data_index]

Формирование конченого набора данных

In [176]:
bal_data_train = pd.concat([total_data_train, add_data]) 
bal_labels = pd.concat([train_labels, add_labels]) 

Обучение модели

In [185]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.4)

In [186]:
model = xgb.XGBClassifier(n_estimators=100, max_depth=5)

In [187]:
cv_res = cross_val_score(model, bal_data_train, bal_labels, cv=cv_spliter, scoring='roc_auc')



In [188]:
cv_res

array([0.97633091, 0.97507868, 0.97158119, 0.97330225, 0.97533773])

In [193]:
model_total = model.fit(bal_data_train, bal_labels)



Предсказание на тесте

Предобработка данных на тесте

In [190]:
correct_data_num_test = test_data[correct_name_num]
mean_data = correct_data_num_test.mean(axis='index')

for i, name in enumerate(correct_name_num):
    correct_data_num_test[name].fillna(mean_data[i], inplace=True)

ВОт тут нужно преобразовывать уже готовым инколингом а не обучать его на тестовом наборе

In [191]:
cat_data_test = test_data[correct_name_cat]
transform_cat_data_test = cat_data_test.copy()
for name in cat_data_test.columns:
    transform_cat_data_test[name] = LabelEncoder().fit_transform(cat_data_test[name])

In [192]:
total_data_test = correct_data_num_test.merge(transform_cat_data_test, left_index=True, right_index=True)

In [194]:
predict_test_proba = model_total.predict_proba(total_data_test)


In [195]:
out_df = pd.DataFrame(enumerate(predict_test_proba[:,1]), columns=['Id', 'result'])
out_df.to_csv('output_df.csv', sep=',', index=False)