Esra Bardakcı

Bu proje, telekom müşterilerinin aboneliklerini bırakıp bırakmayacağını tahmin etmek üzere 5 model üzerinde kurulan algoritmayı kapsıyor.

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [4]:
# Name of the Telco customer-churn CSV that is passed to pd.read_csv in the loading cell.
# It is a relative path, so it is resolved against the current working directory.
filename='WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [5]:
# Load the raw CSV and prepare it for modelling.
data=pd.read_csv(filename,sep=',') #reading data
data=data.drop('customerID',axis=1) #the customer identifier is not used as a feature
# Convert TotalCharges to a real numeric column. Blank entries (' ') and any other
# non-numeric text become NaN. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on the selected Series: that chained in-place form
# does not reliably modify `data` (pandas warns about it, and with Copy-on-Write it is a no-op).
data['TotalCharges']=pd.to_numeric(data['TotalCharges'],errors='coerce')
data=data.dropna(subset=['TotalCharges']) #dropping rows whose TotalCharges is missing

In [11]:
# Preview the first five rows of the cleaned frame (five is the default row count of head()).
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# One-hot encode the non-numeric columns with get_dummies.
# drop_first=True drops the first level of each encoded column.
categorical_columns=[
    'gender','Partner','Dependents','PhoneService',
    'MultipleLines','InternetService','OnlineSecurity',
    'OnlineBackup','DeviceProtection','TechSupport',
    'StreamingTV','StreamingMovies','Contract','PaperlessBilling',
    'PaymentMethod','Churn',
]
data_update=pd.get_dummies(data,columns=categorical_columns,drop_first=True)

In [8]:
# Target: the churn indicator column produced by the encoding step.
y=data_update['Churn_Yes']
# Features: every column except the last one
# (presumably 'Churn_Yes' is that last column, since 'Churn' is encoded last -- verify).
X=data_update.iloc[:, :-1]
# Standardise the features. NOTE: the scaler is fit on all rows here,
# i.e. before any train/test split is made.
sc=StandardScaler()
scaled_X=sc.fit(X).transform(X)

In [10]:
y=np.array(y) #converting output column to 1d array so it can be indexed with fold indices

# Single seed shared by the splitter and every stochastic model, so a re-run gives the
# same numbers. The value 1 matches the original random_state=True (bool True == 1),
# so the outer splits are expected to stay the same.
SEED=1
rkf=KFold(n_splits=5,shuffle=True,random_state=SEED) #k-fold model with 5 shuffled splits


def evaluate_model(estimator,features,target,splitter):
    """Score one classifier on every outer fold produced by ``splitter``.

    For each (train, test) split the estimator is fit on the training rows and
    two scores are recorded:

    * the mean of ``cross_val_score`` run on the training rows only. This is an
      inner cross-validation score (cross_val_score fits its own copies of the
      estimator); it is what the results table reports under 'Train'.
    * ``estimator.score`` on the held-out test rows, reported under 'Test'.

    Parameters
    ----------
    estimator : scikit-learn classifier; it is re-fit in place on every fold,
        so after the call it holds the fit from the last fold.
    features : 2-D numpy array of feature rows.
    target : 1-D numpy array of labels aligned with ``features``.
    splitter : object whose ``split(features)`` yields (train_index, test_index).

    Returns
    -------
    (train_scores, test_scores) : two lists with one float per fold.
    """
    train_scores=[]
    test_scores=[]
    for train_index, test_index in splitter.split(features):
        X_train,X_test=features[train_index],features[test_index]
        y_train,y_test=target[train_index],target[test_index]
        fitted=estimator.fit(X_train,y_train) #fit on this fold's training rows
        train_scores.append(cross_val_score(fitted,X_train,y_train).mean()) #inner CV on training rows
        test_scores.append(fitted.score(X_test,y_test)) #score on the held-out rows
    return train_scores,test_scores


print('{:<29} {:<27} {:<23} '.format('MODEL', 'Train','Test'))
dict_all={}

logreg=LogisticRegression() #logistic regression model
dtc=DecisionTreeClassifier(random_state=SEED) #decision tree model
lsvc=LinearSVC(max_iter=2000,random_state=SEED) #linear svc model
knn=KNeighborsClassifier() #KNN model
mlp=MLPClassifier(max_iter=1000,random_state=SEED) #mlp classifier model

# Display name -> estimator, in the order the results are printed.
models={
    'LogisticRegression':logreg,
    'DecisionTree':dtc,
    'LinearSVC':lsvc,
    'KNN':knn,
    'MLPClassifier':mlp,
}

fold_scores={}
for name,estimator in models.items():
    fold_scores[name]=evaluate_model(estimator,scaled_X,y,rkf)
    train_scores,test_scores=fold_scores[name]
    dict_all[name]=[name,np.mean(train_scores),np.mean(test_scores)] #average over the outer folds

# Per-model fold scores under their original names, for any later cell that reads them.
logreg_trains,logreg_tests=fold_scores['LogisticRegression']
dtc_trains,dtc_tests=fold_scores['DecisionTree']
lsvc_trains,lsvc_tests=fold_scores['LinearSVC']
knn_trains,knn_tests=fold_scores['KNN']
mlp_trains,mlp_tests=fold_scores['MLPClassifier']

for key, value in dict_all.items():
    model,train,test=value
    print('{:<29} {:<27} {:<30} '.format(model,train,test))
    

MODEL                         Train                       Test                    
LogisticRegression            0.8032206828498125          0.8024763401039914             
DecisionTree                  0.7262855970001975          0.7232663142325357             
LinearSVC                     0.8026159542135385          0.8014809108289077             
KNN                           0.7600607223208999          0.7566864923502787             
MLPClassifier                 0.7609842510361161          0.7650775789817424             
