In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the telecom churn table from the local data directory.
data = pd.read_csv(filepath_or_buffer='./data/telecom_churn.csv')

In [5]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
# Чтобы обучать модели в sklearn, все признаки нужно перевести в числа.

In [7]:
# Drop the 'State' and 'Voice mail plan' features (columns, not rows).
# `data` is rebound instead of mutated with inplace=True, and columns that are
# already gone are ignored, so re-running this cell does not raise KeyError.
data = data.drop(columns=['State', 'Voice mail plan'], errors='ignore')

In [11]:
# Encode the 'International plan' feature from 'Yes'/'No' to 1/0.
# The identity entries (1 -> 1, 0 -> 0) keep the cell idempotent: with only the
# 'Yes'/'No' keys, running it a second time would map every value to NaN.
data['International plan'] = data['International plan'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [12]:
# Preview the first five rows after dropping and encoding columns.
data.head(n=5)

Unnamed: 0,Account length,Area code,International plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,128,415,0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,1,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,1,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [13]:
# All features are numeric at this point; only the target ('Churn') is boolean.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null int64
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64
Churn                     3333 non-null bool
dtypes: bool(1), float64(8), int64(9)
memory usage

In [14]:
# Cast the boolean target column to integers to obtain the label vector y.
y = data.loc[:, 'Churn'].astype('int')

In [21]:
# The feature matrix X is every column except the target.
X = data.drop(columns='Churn')

In [22]:
# Show the dimensions of the feature matrix and of the target vector.
(X.shape, y.shape)

((3333, 17), (3333,))

In [30]:
# Разобьём X и y на две подвыборки (70% и 30%):
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [31]:
# train_test_split returns four objects: the two parts of matrix X followed by
# the two parts of vector y. test_size=0.3 holds out 30% of the rows for
# validation; random_state is fixed at 17.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=17,
    test_size=0.3,
)

In [32]:
# Show the dimensions of all four parts produced by the split.
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

((2333, 17), (1000, 17), (2333,), (1000,))

In [27]:
# Create the tree instance; it is better to fix random_state (17 here).
first_tree = DecisionTreeClassifier(random_state=17)

In [29]:
# Per-fold scores of the tree on the training part (cv=5, default scoring).
cross_val_score(estimator=first_tree, X=X_train, y=y_train, cv=5)

array([0.9143469 , 0.91220557, 0.92291221, 0.90772532, 0.91416309])

In [33]:
# Mean of the five cross-validation scores of the tree.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()

0.9142706160222772

In [34]:
from sklearn.neighbors import KNeighborsClassifier

In [36]:
# k-nearest-neighbours classifier with every constructor argument left at its default.
first_knn = KNeighborsClassifier()

In [37]:
# Mean of the five cross-validation scores of the k-nearest-neighbours model.
cross_val_score(first_knn, X_train, y_train, cv=5).mean()

0.8671274043984523

 **настраиваем max_depth для дерева** 

In [38]:
# наша задача как-то сравнить две модели, выбрать лучшую. В модели будем изменять только максимальную глубину
from sklearn.model_selection import GridSearchCV

In [39]:
# Parameter grid for the tree search: candidate max_depth values 1 through 10.
tree_params = {'max_depth': np.arange(1, 11, step=1)}

In [41]:
# Grid search over tree_params with cv=5, run on two parallel jobs.
tree_grid = GridSearchCV(estimator=first_tree, param_grid=tree_params, n_jobs=2, cv=5)

In [42]:
%%time
# Run the max_depth grid search on the training part (the cell is timed by %%time).
tree_grid.fit(X_train, y_train)

Wall time: 3.57 s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
# теперь появился обученный объект tree_grid

In [44]:
# Best cross-validation score of the tree search and the parameters that achieved it.
(tree_grid.best_score_, tree_grid.best_params_)

(0.9417059579939991, {'max_depth': 6})

In [45]:
# То же самое проделаем для KNN

In [46]:
# Candidate neighbour counts: 1..4, then 50..90 in steps of 10.
knn_params = {'n_neighbors': [*range(1, 5), *range(50, 100, 10)]}

In [47]:
# Grid search over knn_params with cv=5.
knn_grid = GridSearchCV(estimator=first_knn, param_grid=knn_params, cv=5)

In [48]:
%%time
# Run the n_neighbors grid search on the training part (the cell is timed by %%time).
knn_grid.fit(X_train, y_train)

Wall time: 6.24 s


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 50, 60, 70, 80, 90]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
# Best cross-validation score of the k-NN search and the parameters that achieved it.
(knn_grid.best_score_, knn_grid.best_params_)

(0.8658379768538362, {'n_neighbors': 4})

In [50]:
# The best 'n_neighbors' found so far is 4, but the optimum may lie somewhere
# between 4 and 50, a region the first grid did not cover.
# NOTE(review): range(5, 30, 5) only tries 5, 10, 15, 20, 25 — values from 30 up to 50 stay unexplored.
knn_params = {'n_neighbors': range(5, 30, 5)}

In [51]:
# Rebuild the k-NN grid search with the refined knn_params (cv=5).
knn_grid = GridSearchCV(estimator=first_knn, param_grid=knn_params, cv=5)

In [52]:
%%time
# Run the refined n_neighbors grid search on the training part (the cell is timed by %%time).
knn_grid.fit(X_train, y_train)

Wall time: 3.01 s


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(5, 30, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
# Best cross-validation score of the refined k-NN search and the parameters that achieved it.
(knn_grid.best_score_, knn_grid.best_params_)

(0.8701243034719246, {'n_neighbors': 10})

In [54]:
# Уже лучше!!!

In [55]:
# Between the tree and k-nearest neighbours, the tree is the model we pick.
# Evaluate it on the held-out part; predictions are produced with predict().
tree_valid_pred = tree_grid.predict(X=X_valid)

In [56]:
# Score of the fitted tree grid search on the held-out part.
tree_grid.score(X=X_valid, y=y_valid)

0.946

In [57]:
from sklearn.metrics import accuracy_score # доля верных ответов

In [58]:
# Share of correct answers of the stored tree predictions on the held-out part.
accuracy_score(y_true=y_valid, y_pred=tree_valid_pred)

0.946

In [59]:
from sklearn.tree import export_graphviz

In [63]:
# Export the best tree found by the grid search to a Graphviz DOT file.
# feature_names is passed as a plain list of column names: a list is accepted by
# every scikit-learn release, whereas stricter parameter validation in some
# releases rejects a pandas Index.
export_graphviz(
    tree_grid.best_estimator_,
    out_file='telecom_tree.dot',
    feature_names=list(X.columns),
    filled=True,
)

In [68]:
!dot -Tpng telecom_tree.dot > telecom_tree.png

"dot" не является внутренней или внешней
командой, исполняемой программой или пакетным файлом.


In [None]:
# Осталось преобразовать telecom_tree.dot в telecom_tree.png — для этого нужна утилита dot (Graphviz), доступная в PATH