In [2]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import svm 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
df = pd.read_csv('telco-customer-churn.csv')

### 0. выведем список существующих полей и типы данных :

In [4]:
df.columns
df.dtypes

age                                       int64
annualincome                              int64
calldroprate                            float64
callfailurerate                         float64
callingnum                                int64
customerid                                int64
customersuspended                        object
education                                object
gender                                   object
homeowner                                object
maritalstatus                            object
monthlybilledamount                       int64
noadditionallines                        object
numberofcomplaints                        int64
numberofmonthunpaid                       int64
numdayscontractequipmentplanexpiring      int64
occupation                               object
penaltytoswitch                           int64
state                                    object
totalminsusedinlastmonth                  int64
unpaidbalance                           

In [5]:
df.head()

Unnamed: 0,age,annualincome,calldroprate,callfailurerate,callingnum,customerid,customersuspended,education,gender,homeowner,...,totalminsusedinlastmonth,unpaidbalance,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration,churn,year,month
0,12,168147,0.06,0.0,4251078442,1,Yes,Bachelor or equivalent,Male,Yes,...,15,19,No,No,0.82,5971,663,0,2015,1
1,12,168147,0.06,0.0,4251078442,1,Yes,Bachelor or equivalent,Male,Yes,...,15,19,No,No,0.82,3981,995,0,2015,2
2,42,29047,0.05,0.01,4251043419,2,Yes,Bachelor or equivalent,Female,Yes,...,212,34,No,Yes,0.27,7379,737,0,2015,1
3,42,29047,0.05,0.01,4251043419,2,Yes,Bachelor or equivalent,Female,Yes,...,212,34,No,Yes,0.27,1729,432,0,2015,2
4,58,27076,0.07,0.02,4251055773,3,Yes,Master or equivalent,Female,Yes,...,216,144,No,No,0.48,3122,624,0,2015,1


## 1. Подготовка данных: 


### 1.1 удалим ненужные данные, такие как номер телефона (так как есть id клиента) и 'noadditionallines' который не несет никакую информацию и штат, так как слишком много уникальных значении (50)


In [6]:
print(df.state.nunique()) #количество штатов
to_drop = ['callingnum','noadditionallines', 'state']
churn_df = df.drop(to_drop,axis=1)

50


### 1.2 поменяем категориальные признаки на числовые 

In [7]:
print(churn_df.education.unique(), #проверим данные и посмотрим какие есть уникальные значения
churn_df.customersuspended.unique(),                     
churn_df.education.unique(),                                
churn_df.gender.unique(),                                 
churn_df.homeowner.unique(),                                
churn_df.maritalstatus.unique(),   
churn_df.occupation.unique(),
churn_df.usesinternetservice.unique(),
churn_df.usesvoiceservice.unique()
     )    

['Bachelor or equivalent' 'Master or equivalent' 'PhD or equivalent'
 'High School or below'] ['Yes' 'No'] ['Bachelor or equivalent' 'Master or equivalent' 'PhD or equivalent'
 'High School or below'] ['Male' 'Female'] ['Yes' 'No'] ['Single' 'Married'] ['Technology Related Job' 'Non-technology Related Job' 'Others'] ['No' 'Yes'] ['No' 'Yes']


In [8]:
churn_df['customersuspended'] = pd.Categorical(churn_df['customersuspended']).codes #меняем категориальные на количественные
churn_df['education'] = pd.Categorical(churn_df['education']).codes
churn_df['gender'] = pd.Categorical(churn_df['gender']).codes
churn_df['homeowner'] = pd.Categorical(churn_df['homeowner']).codes
churn_df['maritalstatus'] = pd.Categorical(churn_df['maritalstatus']).codes
churn_df['occupation'] = pd.Categorical(churn_df['occupation']).codes
churn_df['usesinternetservice'] = pd.Categorical(churn_df['usesinternetservice']).codes
churn_df['usesvoiceservice'] = pd.Categorical(churn_df['usesvoiceservice']).codes


### 1.3 Загружаем датасет и проверяем, что никакое значение переменных не превышает 95% от всех значений.

In [9]:
for column in churn_df:
    att_freq = churn_df[column].value_counts(normalize = True)
    if (att_freq >= 0.95).any():
        print (att_freq)

1    0.978405
0    0.021595
Name: customersuspended, dtype: float64
0    0.961257
1    0.038743
Name: usesinternetservice, dtype: float64
2015    1.0
Name: year, dtype: float64


### Следовательно, можем выкинуть   customersuspended, usesinternetservice

In [None]:
to_drop = ['customersuspended', 'usesinternetservice']
churn_df = churn_df.drop(to_drop,axis=1)

### 1.4 переименуем данные

In [10]:
y = churn_df['churn'] #целевая переменная
X = churn_df.drop('churn',axis=1)

In [11]:
X.shape

(20468, 25)


### 1.5 Разделим данные на тренировочные и тестовые

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.4, random_state=0)

## 2. Построение моделей и сравнение

Решим задачу как задачу классификации

In [13]:
clf = GaussianNB()
clf = clf.fit(X_train ,y_train)
clf.score(X_test, y_test)

0.90840254030288226

In [14]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

clf2 = tree.DecisionTreeClassifier()
clf3 = RandomForestClassifier()
clf4 = LogisticRegression()
clf5 = KNeighborsClassifier()

clf2 = clf2.fit(X_train ,y_train)
DT_sc = clf2.score(X_test, y_test)

clf3 = clf3.fit(X_train ,y_train)
RF_sc = clf3.score(X_test, y_test)

clf4 = clf4.fit(X_train ,y_train)
LR_sc = clf4.score(X_test, y_test)

clf5 = clf5.fit(X_train ,y_train)
KN_sc = clf5.score(X_test, y_test)

from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
clf6 = XGBClassifier()
clf6 = clf6.fit(X_train ,y_train)
XB_sc = clf6.score(X_test, y_test)

In [15]:
print('Decision Tree score: %.3f \n'% DT_sc,
     'Random Forest: %.3f \n'% RF_sc,
     'Logistic Regression: %.3f \n'%LR_sc,
     'KNeighbors:%.3f \n'%KN_sc,
     'XGBoost:%.3f \n'%XB_sc)

Decision Tree score: 0.925 
 Random Forest: 0.929 
 Logistic Regression: 0.908 
 KNeighbors:0.901 
 XGBoost:0.908 



### Точность всех моделей составила больше 90%, наиболее хорошими оказались деревья решений и случайные леса. Но если учитывать, что random forest обобщенная версия деревьев решений, то заключаем, что достаточно рассматривать random forest