In [1]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import learning_curve

from lofo import LOFOImportance, Dataset, plot_importance

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 

from sklearn.metrics import classification_report

from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv') 

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [6]:
def missing_values_detect(df):

    """
    Dtype'lara göre missing valueları dict dönderir
    """

    # columnların tespiti
    object_col = (df.dtypes == 'object')
    float_col = (df.dtypes == 'float64')
    int_col = (df.dtypes == 'int')


    # liste halinde object typeına sahip featureların tutulması 
    object_list = list(object_col[object_col].index)
    float_list = list(float_col[float_col].index)
    int_list = list(int_col[int_col].index)

    # dict'lerin oluşturulması 
    object_dict = {}
    float_dict = {}
    int_dict = {}


    # missing values dicts comphrensionları
    object_dict = {i : df[i].isnull().sum() for i in object_list }
    float_dict = {j : df[j].isnull().sum() for j in float_list }
    int_dict = {k : df[k].isnull().sum() for k in int_list }
        

    return object_dict, float_dict, int_dict

In [7]:
object_dict, float_dict, int_dict = missing_values_detect(df)

In [8]:
def high_correlation_detect(df):

    """
    0.8 ve yukarısında korelasyona sahip özelliklerin çıkarımı
    """
    # korelasyon matrisinin çıkarılması
    correlated_features = set()
    correlation_matrix = df.corr()

    # yüksek korelasyona sahip featureların çıkarılması
    for i in range(len(correlation_matrix .columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.8:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)
    
    # yüksek korelasyona sahip featureların çıkarılması
    return (correlated_features, len(correlated_features)) 



In [9]:
high_correlation_detect(df)

(set(), 0)

In [10]:
def std_calculation(df, filter_std):

    """
    Belirli bir std üzerinde ki özellikleri sıralar
    """

    # normal dağılımların hesaplanması
    dict_std_values = dict(df.std(numeric_only=True))

    # scale gerektiren özellikerin çıkarımı 
    list_needs_scale = [j for i,j in zip(dict_std_values.values(), dict_std_values.keys()) if i > filter_std]


    return list_needs_scale


In [11]:
# aykırı değer analizi
def outlier_analyz(df):

    """
    Numerik değerlerin içinde ki outlier yüzde ve sayılarını bulur
    """

    # aykırı değere sahip özellik yüzdeleri
    outlier_dict_percentage = {}
    outlier_dict_values = {}

    # numerik değerlerin listelenmesi 
    list_numerical = df.select_dtypes('int', 'float').columns

    for i in list_numerical:
        
        # İlk quartile
        Q1 = np.percentile(df[i],25)
        # Üçüncü quartile
        Q3 = np.percentile(df[i],75)
        # IQR
        IQR = Q3 - Q1
        # Outlier adımı
        outlier_step = IQR * 1.5
        
        # outlier dicts
        outlier_dict_percentage['{}'.format(i)] = len(df[(df[i] < Q1 - outlier_step) | (df[i] > Q3 + outlier_step)].index)*100/df.shape[0]
        outlier_dict_values['{}'.format(i)] = len(df[(df[i] < Q1 - outlier_step) | (df[i] > Q3 + outlier_step)].index)

        # Dataframe dönüşümü 
        df_outlier_dict_percentage = pd.DataFrame.from_dict(outlier_dict_percentage, orient='index')
        df_outlier_dict_values = pd.DataFrame.from_dict(outlier_dict_values, orient='index')

        # Dataframe'lerin birleştirimi 
        outlier_analyz = pd.concat([df_outlier_dict_percentage, df_outlier_dict_values], names = ['percentage', 'count'], axis = 1)
        outlier_analyz = outlier_analyz.rename({0:'Percantage', 0:'Count'})

    return outlier_analyz 

In [12]:
outlier_analyz(df)

Unnamed: 0,0,0.1
SeniorCitizen,16.214681,1142
tenure,0.0,0


In [13]:
def train_and_test_splitting(df,test_size, label):

    """
    Train ve test splitting işlemi
    """
    
    # etiket ve veri setinin ayrıştırılması 
    X = df.drop(label, axis=1)
    y = df[label]

    # splitting işlemi 
    X_train , X_test , y_train , y_test = train_test_split(X , 
                                                           y , 
                                                           test_size = test_size , 
                                                           random_state = 42)

    return X_train, X_test, y_train, y_test

In [15]:
def robust_scaler(X_train,y_train,X_test):

    """
    Verinin outlierdan arınmasını sağlar
    """
    
    # Model inşaası
    rs_scaler = RobustScaler()

    # Fitting işlemi
    X_train_scaled = rs_scaler.fit_transform(X_train, y_train)

    # test verisine leakage olmadan transform edilmesi 
    X_test_scaled = rs_scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

In [27]:
def feature_selection_LOFO(df, target, feature_value):
    
    """Gelen özellikleri LOFO ile feature selectiona tabii tutar"""

    # cross validation yapılması
    sample_df = df.sample(frac=0.01, random_state=42)
    sample_df.sort_values("customerID", inplace=True)

    cv = KFold(n_splits=4, shuffle=True, random_state=42)

    # target değişkeni ile özelliklerin çıkarılımı
    target = target
    features = [col for col in df.columns if col != target]

    # target değişkenine göre feature importance metriklerinin çıkarılması
    dataset = Dataset(df = df, target=target, features=[col for col in df.columns if col != target])
    lofo_imp = LOFOImportance(dataset, cv=cv, scoring="roc_auc")

    importance_df = lofo_imp.get_importance()
    
    return print(list(importance_df.head(feature_value).feature)), print(importance_df) 

In [37]:
feature_selection_LOFO(df = df,
                       target = 'Churn', 
                       feature_value=8)

100%|██████████| 20/20 [00:08<00:00,  2.45it/s]

['tenure', 'Contract', 'SeniorCitizen', 'OnlineSecurity', 'OnlineBackup', 'PaymentMethod', 'TechSupport', 'gender']
             feature  importance_mean  importance_std  val_imp_0  val_imp_1  \
8             tenure         0.026366        0.002909   0.023455   0.030243   
2           Contract         0.008902        0.003812   0.003439   0.007331   
1      SeniorCitizen         0.001395        0.001647  -0.000936   0.000770   
3     OnlineSecurity         0.001339        0.001813   0.000713  -0.000571   
15      OnlineBackup         0.001298        0.001166   0.002343  -0.000545   
13     PaymentMethod         0.001143        0.002020   0.000887   0.000640   
17       TechSupport         0.000957        0.001179   0.002037   0.002231   
11            gender         0.000404        0.000889   0.001766  -0.000724   
12  PaperlessBilling         0.000222        0.001233  -0.000445  -0.001115   
19   StreamingMovies         0.000072        0.001370  -0.001468  -0.000837   
14      TotalCh




(None, None)

In [32]:
fe_importance = ['tenure', 
                'Contract', 
                'SeniorCitizen', 
                'OnlineSecurity', 
                'OnlineBackup',
                'PaymentMethod',
                'TechSupport',
                'gender'
                ]

In [40]:
df_fe = df[['tenure', 
            'Contract', 
            'SeniorCitizen', 
            'OnlineSecurity', 
            'OnlineBackup',
            'PaymentMethod',
            'TechSupport',
            'gender',
            'Churn']]
df_fe.head()

Unnamed: 0,tenure,Contract,SeniorCitizen,OnlineSecurity,OnlineBackup,PaymentMethod,TechSupport,gender,Churn
0,1,Month-to-month,0,No,Yes,Electronic check,No,Female,No
1,34,One year,0,Yes,No,Mailed check,No,Male,No
2,2,Month-to-month,0,Yes,Yes,Mailed check,No,Male,Yes
3,45,One year,0,Yes,No,Bank transfer (automatic),Yes,Male,No
4,2,Month-to-month,0,No,No,Electronic check,No,Female,Yes


In [60]:
X_train, X_test, y_train, y_test= train_and_test_splitting(df = df_fe,
                                                           test_size=0.20, 
                                                           label = 'Churn')

In [61]:
def encoding_labels_label_encoder(y_train, y_test):

    """Label encoder aracılığı ile encoding işlemi
        leakage engellenmiştir
        Kategoriler için sıra önemli olduğunda kullanılacaktır"""


    # Model inşaası
    le = LabelEncoder()

    # Fitting işlemi 
    #x_train = le.fit_transform(x_train)
    #x_test = le.transform(x_test)

    # etiketlerde yapılması
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return y_train, y_test

In [62]:
y_train, y_test = encoding_labels_label_encoder(y_train=y_train, y_test=y_test)

In [48]:
def encoding_labels_one_hot_encoding(x_train, x_test):

    """One hot encoder aracılığı ile encoding işlemi
        Kategoriler için sıra önemli olmadığında kullanılacaktır"""

    # Fitting işlemi 
    x_train = pd.get_dummies(x_train)
    x_test = pd.get_dummies(x_test)

    # etiketlerde yapılması
    #y_train = le.fit_transform(y_train)
    #y_test = le.transform(y_test)

    return x_train, x_test

In [63]:
X_train, X_test = encoding_labels_one_hot_encoding(x_train=X_train, 
                                x_test=X_test)

In [64]:
X_train

Unnamed: 0,tenure,SeniorCitizen,Contract_Month-to-month,Contract_One year,Contract_Two year,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,gender_Female,gender_Male
2142,21,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0
1623,54,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,1,0
6074,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1
1362,4,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1
6754,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1
5191,23,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,1,0
5226,12,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
5390,12,1,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1


In [65]:
def tunning_fit_decision_tree(X_train, X_test, y_train, y_test):
    
    # hiperparametrelerin define edilmesi
    max_depth =  range(1,5)

    # hiperparametrelerin dict'e çevrilmesi 
    hyperparameters = dict(max_depth = max_depth)

    # model inşaası 
    dt = DecisionTreeClassifier()

    # tuning model building 
    clf = RandomizedSearchCV(dt, hyperparameters, cv=10)
    best_model = clf.fit(X_train,y_train)

    # en iyi hiper parametreler
    best_params = clf.best_params_

    # tunining sonuçları
    print('Tuning results')
    print(best_params)

    # model implemetasyonu 
    dt_tuning = DecisionTreeClassifier(max_depth = best_params['max_depth'])

    dt_tuning.fit(X_train, y_train)
    print(dt_tuning.score(X_test,y_test))

    # classification report 
    y_pred = dt_tuning.predict(X_test)

    return print(classification_report(y_pred, y_test))

In [66]:
tunning_fit_decision_tree(X_train=X_train,
                        X_test=X_test,
                        y_train=y_train,
                        y_test=y_test)



Tuning results
{'max_depth': 3}
0.7814052519517388
              precision    recall  f1-score   support

           0       0.91      0.81      0.86      1162
           1       0.42      0.63      0.50       247

    accuracy                           0.78      1409
   macro avg       0.67      0.72      0.68      1409
weighted avg       0.83      0.78      0.80      1409

