In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

Выгрузка данных

In [3]:
data = pd.read_csv('data/orange_small_churn_data.txt')
label = pd.read_csv('data/orange_small_churn_labels.txt', header=None,names=['label'])

In [4]:
label


Unnamed: 0,label
0,-1
1,-1
2,-1
3,1
4,-1
...,...
39995,-1
39996,-1
39997,-1
39998,-1


Выделение вешественных и категориальных признаков

In [5]:
numeric_label = data.columns[:190]
categorical_label = data.columns[190:]

In [6]:
data[numeric_label].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 190 entries, Var1 to Var190
dtypes: float64(189), int64(1)
memory usage: 58.0 MB


In [7]:
numeric_data = data[numeric_label]
cat_data = data[categorical_label]

Отбор признаков у которых ненулевых значений больше половины 

In [9]:
correct_name = []
for name in numeric_data.columns:
    if np.sum(~numeric_data[name].isnull()) >= 20000:
        correct_name.append(name)
len(correct_name)

41

In [10]:
cat_name = []
for name in cat_data.columns:
    if np.sum(~cat_data[name].isnull()) >= 20000:
        cat_name.append(name)
len(cat_name)

28

Замена пропусков на среднее значенние у вещественных признаков 

In [11]:
correct_data = data[correct_name]
mean_data = correct_data.mean(axis='index')

for i, name in enumerate(correct_name):
    correct_data[name].fillna(mean_data[i], inplace=True)

LabelEncoding для категориальных признаков



In [None]:
cat_data = data[cat_name]
transform_cat_data = cat_data.copy()
for name in cat_data.columns:
      transform_cat_data[name] = LabelEncoder().fit_transform(cat_data[name])


Балансиврока данных

In [None]:
label.value_counts()

label
-1       37024
 1        2976
dtype: int64

In [None]:
index_b = label[label['label'] == 1].index

In [None]:
k = len(label[label['label'] == -1]) - len(label[label['label'] == 1])
k

34048

In [None]:
np.random.seed(42)
indices_to_add = np.random.choice(index_b, k)

In [None]:
indices_to_add

array([11241, 17017, 14879, ..., 37101, 20973, 25393])

In [None]:
total_data = correct_data.merge(transform_cat_data, left_index=True, right_index=True)

Создание фрейма, который надо добавить по методике oversampling

In [None]:
data_add = total_data.iloc[indices_to_add, :]
label_add = label.iloc[indices_to_add, :]

In [None]:
total_data = pd.concat([total_data, data_add])
total_label = pd.concat([label, label_add])

In [None]:
total_label.shape

(74048, 1)

Разделение на обучение и тест

In [None]:
train_data, test_data, train_target, test_target = train_test_split(total_data, total_label, random_state=42, test_size=0.3) 

In [None]:
cv_spliter = ShuffleSplit(n_splits=5, test_size=0.3)

In [None]:
cv_res = cross_val_score(LogisticRegression(), train_data, train_target, cv=cv_spliter, scoring='f1')

In [None]:
cv_res

array([0.54564533, 0.55044166, 0.5541015 , 0.55508311, 0.5503347 ])

Обучение на всех данных и проверка на отложенной выборке

In [None]:
model = LogisticRegression().fit(train_data, train_target)

Оценка качества

In [None]:
predict_train = model.predict(train_data)
predict_test = model.predict(test_data)

In [None]:
print(f"f1_score by train data: {f1_score(predict_train, train_target)}")
print(f"f1_score by test data: {f1_score(predict_test, test_target)}")
print()

print(f"accuracy_score by train data: {accuracy_score(predict_train, train_target)}")
print(f"accuracy_score by test data: {accuracy_score(predict_test, test_target)}")
print()

print(f"roc_score by train data: {roc_auc_score(predict_train, train_target)}")
print(f"roc_score by test data: {roc_auc_score(predict_test, test_target)}")
print()

f1_score by train data: 0.5526466126903454
f1_score by test data: 0.5482557601554547

accuracy_score by train data: 0.561881426890205
accuracy_score by test data: 0.5604771550753995

roc_score by train data: 0.5620057970695448
roc_score by test data: 0.560600143452491



Base line  получен качество составило примерно 0.55

Baseline:

1.Отбор признаков по пропускам

2.преобразование кат признаков labelencoder

3.балансировка по oversampling

4.logostocregression 