In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the raw churn table; the relative path is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer="telecom_churn.csv")

In [5]:
# Preview the first five rows to sanity-check the columns that were read.
data.head(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
# Remove the two columns that are not used as model inputs in this notebook.
data = data.drop(columns=["State", "Voice mail plan"])

In [7]:
# Encode the Yes/No flag as 1/0. Series.map turns every label that is missing
# from the mapping into NaN, so re-running this cell on the already-encoded
# column (or meeting an unexpected label) would silently blank the feature.
# Fail loudly instead of letting NaNs flow into the models.
intl_plan_codes = data["International plan"].map({"Yes": 1, "No": 0})
if intl_plan_codes.isna().any():
    raise ValueError(
        "'International plan' contains values other than 'Yes'/'No' "
        "(was this cell already run?)"
    )
data["International plan"] = intl_plan_codes.astype("int64")

In [8]:
# Check dtypes and non-null counts after dropping/encoding the categorical columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Account length          3333 non-null   int64  
 1   Area code               3333 non-null   int64  
 2   International plan      3333 non-null   int64  
 3   Number vmail messages   3333 non-null   int64  
 4   Total day minutes       3333 non-null   float64
 5   Total day calls         3333 non-null   int64  
 6   Total day charge        3333 non-null   float64
 7   Total eve minutes       3333 non-null   float64
 8   Total eve calls         3333 non-null   int64  
 9   Total eve charge        3333 non-null   float64
 10  Total night minutes     3333 non-null   float64
 11  Total night calls       3333 non-null   int64  
 12  Total night charge      3333 non-null   float64
 13  Total intl minutes      3333 non-null   float64
 14  Total intl calls        3333 non-null   

In [9]:
# Target vector: the "Churn" column cast to integers.
y = data.loc[:, "Churn"].astype("int")

In [10]:
# Feature matrix: every remaining column except the target.
X = data.drop(columns="Churn")

In [11]:
# Features and target must have the same number of rows.
(X.shape, y.shape)

((3333, 17), (3333,))

In [12]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split

In [13]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=17)
X_train, X_valid, y_train, y_valid = split_parts

In [14]:
# Row counts of the training and validation partitions.
(X_train.shape, X_valid.shape)

((2333, 17), (1000, 17))

In [15]:
# Baseline decision tree: library-default hyperparameters, seeded so repeated runs match.
first_tree = DecisionTreeClassifier(random_state=17)

In [16]:
# Mean score of the baseline tree over 5 cross-validation folds of the training set.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()

0.9138423504976518

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Baseline k-nearest-neighbours classifier with library-default settings.
# NOTE(review): no feature scaling is applied in the cells shown; k-NN is
# distance-based, so consider standardizing the features — confirm intent.
first_knn = KNeighborsClassifier()

In [19]:
# Mean score of the baseline k-NN over 5 cross-validation folds of the training set.
cross_val_score(first_knn, X_train, y_train, cv=5).mean()

0.8671274043984523

## Tuning max_depth and max_features for the decision tree

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Search grid for the decision tree.
# max_features: a float is a fraction of the feature count, while an int is an
# absolute number of features. "All features" therefore has to be written as
# 1.0 — an int 1 would restrict every split to a single feature.
tree_params = {
    "max_depth": np.arange(1, 11),
    "max_features": [0.5, 0.7, 1.0],
}

In [22]:
# 5-fold grid search over tree_params, using all available CPU cores.
tree_grid = GridSearchCV(
    estimator=first_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
)

In [23]:
%%time
# Fit every combination in the tree grid with cross-validation on the training split;
# the trailing ';' is there to keep the fitted-estimator repr out of the output.
tree_grid.fit(X_train, y_train);

CPU times: user 268 ms, sys: 135 ms, total: 403 ms
Wall time: 3.06 s


In [24]:
# Best mean cross-validation score and the hyperparameters that achieved it.
(tree_grid.best_score_, tree_grid.best_params_)

(0.9391366681677404, {'max_depth': 6, 'max_features': 0.7})

In [25]:
# Candidate neighbourhood sizes for k-NN: 5, 10, 15, 20, 25.
knn_params = dict([("n_neighbors", range(5, 30, 5))])

In [26]:
# 5-fold grid search over the k-NN neighbourhood sizes.
knn_grid = GridSearchCV(
    estimator=first_knn,
    param_grid=knn_params,
    cv=5,
)

In [27]:
%%time
# Fit every candidate in the k-NN grid with cross-validation on the training split;
# the trailing ';' is there to keep the fitted-estimator repr out of the output.
knn_grid.fit(X_train, y_train);

CPU times: user 2.52 s, sys: 375 ms, total: 2.9 s
Wall time: 991 ms


In [28]:
# Best mean cross-validation score for k-NN and the neighbourhood size that achieved it.
(knn_grid.best_score_, knn_grid.best_params_)

(0.8701289391697531, {'n_neighbors': 10})

In [29]:
# Predict the hold-out set with the tree that the grid search refit on the
# full training split using the best parameters.
tree_valid_pred = tree_grid.best_estimator_.predict(X_valid)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
# Share of hold-out rows the tuned tree classifies correctly.
accuracy_score(y_true=y_valid, y_pred=tree_valid_pred)

0.936

In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_valid, y_pred=tree_valid_pred)

array([[858,   9],
       [ 55,  78]])

In [34]:
# Count of hold-out rows per class label (position i holds the count of label i),
# useful for putting the accuracy above in the context of the class balance.
np.bincount(y_valid)

array([867, 133])

In [35]:
from sklearn.tree import export_graphviz

In [36]:
# Shallow depth-3 tree, small enough to draw. It is seeded like the other
# trees in this notebook: scikit-learn permutes the candidate features at every
# split, so without random_state tied splits can resolve differently between
# runs and the exported diagram/score would not be reproducible.
second_tree = DecisionTreeClassifier(max_depth=3, random_state=17)
second_tree.fit(X_train, y_train)
second_tree.score(X_valid, y_valid)

0.905

In [37]:
# Write the shallow tree as a Graphviz .dot file, colouring nodes by majority class.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
export_graphviz(
    second_tree,
    out_file="telecom_tree2.dot",
    feature_names=list(X.columns),
    filled=True,
)


In [38]:
#!ls -l *.png

In [39]:
# Render the .dot file to PNG. This needs the Graphviz `dot` executable on PATH;
# without it the shell reports "dot: command not found" and no image is produced.
!dot -Tpng telecom_tree2.dot -o telecom_tree2.png

/bin/bash: line 1: dot: command not found


<img src='telecom_tree2.png'>