In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

Выгрузка данных

In [3]:
data = pd.read_csv('data/orange_small_churn_data.txt')
label = pd.read_csv('data/orange_small_churn_labels.txt', header=None,names=['label'])

Выделение вешественных и категориальных признаков

In [4]:
numeric_label = data.columns[:190]
categorical_label = data.columns[190:]

In [5]:
data[numeric_label].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 190 entries, Var1 to Var190
dtypes: float64(189), int64(1)
memory usage: 58.0 MB


In [6]:
numeric_data = data[numeric_label]
cat_data = data[categorical_label]

Отбор признаков у которых ненулевых значений больше половины 

In [7]:
correct_name = []
for name in numeric_data.columns:
    if np.sum(~numeric_data[name].isnull()) >= 25000:
        correct_name.append(name)
len(correct_name)

39

In [8]:
cat_name = []
for name in cat_data.columns:
    if np.sum(~cat_data[name].isnull()) >= 25000:
        cat_name.append(name)
len(cat_name)

28

Замена пропусков на среднее значенние у вещественных признаков 

In [9]:
correct_data = data[correct_name]
mean_data = correct_data.mean(axis='index')

for i, name in enumerate(correct_name):
    correct_data[name].fillna(mean_data[i], inplace=True)

LabelEncoding для категориальных признаков



In [10]:
cat_data = data[cat_name]
transform_cat_data = cat_data.copy()
for name in cat_data.columns:
    transform_cat_data[name] = LabelEncoder().fit_transform(cat_data[name])


Балансиврока данных

In [11]:
label.value_counts()

label
-1       37024
 1        2976
dtype: int64

In [12]:
index_b = label[label['label'] == 1].index

In [13]:
k = len(label[label['label'] == -1]) - len(label[label['label'] == 1])
k

34048

In [14]:
np.random.seed(42)
indices_to_add = np.random.choice(index_b, k)

In [15]:
indices_to_add

array([11241, 17017, 14879, ..., 37101, 20973, 25393], dtype=int64)

In [16]:
total_data = correct_data.merge(transform_cat_data, left_index=True, right_index=True)

Создание фрейма, который надо добавить по методике oversampling

In [17]:
data_add = total_data.iloc[indices_to_add, :]
label_add = label.iloc[indices_to_add, :]

In [18]:
total_data = pd.concat([total_data, data_add])
total_label = pd.concat([label, label_add])

In [19]:
total_data.shape

(74048, 67)

Разделение на обучение и тест

In [20]:
train_data, test_data, train_target, test_target = train_test_split(total_data, total_label, random_state=42, test_size=0.3) 

In [21]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.3)

In [22]:
cv_res = cross_val_score(LogisticRegression(), train_data, train_target, cv=cv_spliter, scoring='f1')

In [23]:
cv_res

array([0.5334575 , 0.60050598, 0.57474601, 0.56309078, 0.56746182])

Обучение на всех данных и проверка на отложенной выборке

In [24]:
model = LogisticRegression().fit(train_data, train_target)

Оценка качества

In [25]:
predict_train = model.predict(train_data)
predict_test = model.predict(test_data)

In [26]:
print(f"f1_score by train data: {f1_score(predict_train, train_target)}")
print(f"f1_score by test data: {f1_score(predict_test, test_target)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train, train_target)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test, test_target)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train, train_target)}")
print(f"roc_score by test data: {roc_auc_score(predict_test, test_target)}")
print()

f1_score by train data: 0.5637819290888296
f1_score by test data: 0.562474889513861

accuracy_score by train data: 0.5585051993903498
accuracy_score by test data: 0.5588116137744767

roc_score by train data: 0.5585290849160585
roc_score by test data: 0.5588468928980397



Base line  получен качество составило примерно 0.55

Baseline:

1.Отбор признаков по пропускам

2.преобразование кат признаков labelencoder

3.балансировка по oversampling

4.logostocregression 

Upgrade model

Преобразуем данные вещественные оставим также, а категориальные изменим.

Выведим статистику по кол уникальных значений кат признаков

In [27]:
cat_data = data[cat_name]
for name in cat_data.columns:
    print(name, len(cat_data[name].unique()))
     

Var192 355
Var193 50
Var195 23
Var196 4
Var197 221
Var198 3891
Var199 4401
Var202 5543
Var203 6
Var204 100
Var205 4
Var206 22
Var207 14
Var208 3
Var210 6
Var211 2
Var212 78
Var216 1819
Var217 12471
Var218 3
Var219 23
Var220 3891
Var221 7
Var222 3891
Var223 5
Var226 23
Var227 7
Var228 30


ПРизнаки с 2 значаниями закодируем как бинарные

Признаки со значениями меньше 5 закодируем one_hot

Признаки со значениями больше 5 закодируем hash-trick векторами размера 5

In [28]:
counts_unique_value = np.array([len(cat_data[name].unique()) for name in cat_data.columns])
counts_unique_value

array([  355,    50,    23,     4,   221,  3891,  4401,  5543,     6,
         100,     4,    22,    14,     3,     6,     2,    78,  1819,
       12471,     3,    23,  3891,     7,  3891,     5,    23,     7,
          30])

In [29]:
binar_column = cat_data.columns[counts_unique_value <=2]
ohe_column = cat_data.columns[(counts_unique_value > 2)&(counts_unique_value <= 5)]
hash_column = cat_data.columns[counts_unique_value > 5]

Бинарные данные

In [30]:
binar_data = cat_data[binar_column]

for name in binar_data.columns:
  binar_data[name] = LabelEncoder().fit_transform(binar_data[name])

In [31]:
binar_data

Unnamed: 0,Var211
0,0
1,0
2,0
3,0
4,0
...,...
39995,0
39996,0
39997,0
39998,0


Горячее кодирование

In [32]:
ohe_data = OneHotEncoder(sparse=False).fit_transform(cat_data[ohe_column])

In [33]:
ohe_data

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

Hash trick

In [34]:
hashes = np.array([hash(value) % 5 for value in cat_data[hash_column[0]].unique()])

In [35]:
hashes

array([2, 1, 3, 2, 1, 0, 1, 3, 3, 3, 4, 3, 1, 0, 4, 2, 0, 3, 0, 0, 3, 0,
       1, 3, 3, 4, 0, 4, 3, 2, 3, 1, 2, 2, 1, 4, 1, 1, 2, 2, 2, 1, 1, 0,
       3, 0, 0, 2, 4, 2, 4, 0, 1, 4, 4, 3, 1, 2, 3, 4, 3, 4, 2, 1, 0, 0,
       2, 4, 4, 1, 2, 1, 2, 1, 3, 3, 1, 2, 3, 0, 1, 1, 0, 3, 0, 0, 4, 1,
       2, 3, 3, 3, 1, 3, 4, 2, 0, 1, 3, 1, 0, 4, 4, 0, 2, 4, 2, 3, 4, 3,
       4, 4, 2, 0, 3, 2, 1, 3, 3, 1, 2, 3, 4, 3, 1, 4, 1, 3, 4, 3, 1, 2,
       0, 3, 0, 2, 2, 3, 4, 3, 3, 0, 3, 4, 1, 4, 1, 4, 3, 2, 1, 2, 0, 4,
       4, 2, 0, 3, 3, 3, 0, 4, 1, 3, 4, 2, 2, 0, 0, 3, 4, 1, 3, 0, 1, 3,
       3, 2, 2, 3, 0, 3, 1, 0, 1, 1, 3, 4, 3, 3, 2, 0, 2, 0, 4, 2, 3, 3,
       3, 0, 4, 2, 1, 1, 0, 0, 3, 2, 2, 4, 4, 1, 4, 4, 3, 1, 4, 2, 3, 2,
       1, 3, 0, 2, 2, 2, 4, 2, 2, 0, 2, 4, 4, 3, 3, 1, 1, 4, 0, 4, 4, 0,
       2, 1, 3, 4, 4, 0, 2, 2, 1, 1, 3, 0, 0, 3, 1, 3, 2, 1, 3, 3, 3, 0,
       2, 0, 0, 2, 0, 3, 0, 1, 2, 1, 2, 2, 2, 0, 0, 3, 1, 0, 1, 4, 2, 0,
       1, 4, 3, 1, 3, 0, 3, 2, 4, 0, 0, 4, 4, 0, 4,

In [36]:
def vector_hash(data, n_vector, name):
  data_hash = []
  for value in data:
    vector = [0] * n_vector
    change_value = name + value if type(value) != float else name
    value_hash = hash(change_value) % n_vector
    vector[value_hash] = 1
    data_hash.append(vector)
  return np.array(data_hash)

In [37]:
test = vector_hash(cat_data[hash_column[0]], 5, hash_column[0])
test

array([[1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0]])

In [38]:
hash_data = []
for name in hash_column:
  vector_feat = vector_hash(cat_data[name], 5, name)
  hash_data.append(vector_feat)
hash_data = np.hstack(hash_data)

In [39]:
hash_data.shape

(40000, 110)

Объеденение  ктаегориальных признаков

In [40]:
cat_data_up = np.hstack((binar_data, ohe_data, hash_data))
cat_data_up = pd.DataFrame(cat_data_up)


Объеденение с вещественными признаками

In [41]:
total_data_up = correct_data.merge(cat_data_up, left_index=True, right_index=True)

Балансировка

In [42]:
data_add_up = total_data_up.iloc[indices_to_add, :]
label_add_up = label.iloc[indices_to_add, :]

In [43]:
total_data_up = pd.concat([total_data_up, data_add_up])
total_label_up = pd.concat([label, label_add_up])

In [44]:
total_data_up.shape

(74048, 169)

Обучение модели

In [45]:
train_data_up, test_data_up, train_target_up, test_target_up = train_test_split(total_data_up, total_label_up, random_state=42, test_size=0.3) 

In [46]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.3)

In [47]:
cv_res = cross_val_score(LogisticRegression(), train_data_up, train_target_up, cv=cv_spliter, scoring='f1')

In [48]:
cv_res

array([0.56113635, 0.55036726, 0.52670756, 0.56043739, 0.54648481])

Логистическая регрессия отработала не лучше на новых признаках, попробуем xgboost c новыми признаками

In [49]:
import xgboost as xgb

In [56]:
cv_res = cross_val_score(xgb.XGBClassifier(n_estimators=100, max_depth=5), train_data_up, train_target_up, cv=cv_spliter, scoring='f1')



In [58]:
print(cv_res)
print(cv_res.mean())

[0.84973966 0.86051633 0.85833384 0.86334189 0.8514115 ]
0.8566686461163016


Обучение на всех данных и предсказание на тесте

In [52]:
model_up = xgb.XGBClassifier(n_estimators=100, max_depth=5).fit(train_data_up, train_target_up)



In [54]:
predict_train_up = model_up.predict(train_data_up)
predict_test_up = model_up.predict(test_data_up)

In [55]:
print(f"f1_score by train data: {f1_score(predict_train_up, train_target_up)}")
print(f"f1_score by test data: {f1_score(predict_test_up, test_target_up)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train_up, train_target_up)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test_up, test_target_up)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train_up, train_target_up)}")
print(f"roc_score by test data: {roc_auc_score(predict_test_up, test_target_up)}")
print()

f1_score by train data: 0.8895424593276499
f1_score by test data: 0.8602307329416304

accuracy_score by train data: 0.8855169486620492
accuracy_score by test data: 0.8532973216295296

roc_score by train data: 0.8875097658710969
roc_score by test data: 0.8570144254262632



Попробуем xgboost на простом преобразование категориальных данных

In [59]:
cv_res_old = cross_val_score(xgb.XGBClassifier(n_estimators=100, max_depth=5), train_data, train_target, cv=cv_spliter, 
                             scoring='f1')



In [61]:
print(cv_res_old)
print(cv_res_old.mean())

[0.86894708 0.86747281 0.86192875 0.86557377 0.86349749]
0.8654839806023749


Использование более сложной обработки категориальных признаков не дает дополнительного прироста качества, оставим простой labelencoding

In [62]:
model_old_up =  xgb.XGBClassifier(n_estimators=100, max_depth=5).fit(train_data, train_target)



In [69]:
predict_train = model_old_up.predict(train_data)
predict_test = model_old_up.predict(test_data)

In [66]:
print(f"f1_score by train data: {f1_score(predict_train, train_target)}")
print(f"f1_score by test data: {f1_score(predict_test, test_target)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train, train_target)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test, test_target)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train, train_target)}")
print(f"roc_score by test data: {roc_auc_score(predict_test, test_target)}")
print()

f1_score by train data: 0.8958642158316039
f1_score by test data: 0.8696697728346288

accuracy_score by train data: 0.8915748654332182
accuracy_score by test data: 0.8633805986945757

roc_score by train data: 0.8941746079875249
roc_score by test data: 0.8669997600242892



Прогноз вероятности

In [73]:
predict_train_proba = model_old_up.predict_proba(train_data)
predict_test_proba = model_old_up.predict_proba(test_data)
predict_train_proba # 2 столбец

array([[0.66767305, 0.33232695],
       [0.28389835, 0.71610165],
       [0.8012305 , 0.19876952],
       ...,
       [0.14735246, 0.85264754],
       [0.9307357 , 0.06926431],
       [0.9680642 , 0.03193581]], dtype=float32)

Обучение и прогноз на даных из соревнования