# READ DATA

In [1]:
import pandas as pd
import numpy as np

def load(data:str, sep:str):
    return pd.read_csv(data, sep=sep)

In [2]:
df = load("churn_data.csv", ";")
df

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.70,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.70,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.00,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.00,2,299.4,71,57.0,3.10,6.6
4,0,75,0,0,0.00,3,166.7,113,41.0,7.42,10.1
...,...,...,...,...,...,...,...,...,...,...,...
3328,0,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9
3329,0,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6
3330,0,28,1,0,0.00,2,180.8,109,56.0,14.44,14.1
3331,0,184,0,0,0.00,2,213.8,105,50.0,7.98,5.0


# SPLIT FOR TARGET

In [3]:
import pandas as pd
import numpy as np

def split(target:str, data):
    y = data[target]
    X = data.drop(target, axis = 1)
    
    return y, X

In [5]:
y, X = split("Churn", df)
y

0       0
1       0
2       0
3       0
4       0
       ..
3328    0
3329    0
3330    0
3331    0
3332    0
Name: Churn, Length: 3333, dtype: int64

In [6]:
X

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,128,1,1,2.70,1,265.1,110,89.0,9.87,10.0
1,107,1,1,3.70,1,161.6,123,82.0,9.78,13.7
2,137,1,0,0.00,0,243.4,114,52.0,6.06,12.2
3,84,0,0,0.00,2,299.4,71,57.0,3.10,6.6
4,75,0,0,0.00,3,166.7,113,41.0,7.42,10.1
...,...,...,...,...,...,...,...,...,...,...
3328,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9
3329,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6
3330,28,1,0,0.00,2,180.8,109,56.0,14.44,14.1
3331,184,0,0,0.00,2,213.8,105,50.0,7.98,5.0


# TEST TRAIN SPLIT


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def tts(variables, binary):
    return train_test_split(variables, binary, test_size=0.33, random_state=1)

In [8]:
X_train, X_test, y_train, y_test = tts(X, y)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

      AccountWeeks  ContractRenewal  DataPlan  DataUsage  CustServCalls  \
2124            43                1         0       0.38              3   
510            193                1         1       3.32              1   
1009           155                1         0       0.00              3   
1171            32                1         0       0.00              1   
1993            86                1         0       0.35              0   
...            ...              ...       ...        ...            ...   
2763           116                1         1       2.21              3   
905            161                1         0       0.16              4   
1096            93                1         0       0.00              1   
235            139                1         0       0.00              5   
1061           132                1         1       2.78              1   

      DayMins  DayCalls  MonthlyCharge  OverageFee  RoamMins  
2124     27.0       117           22

# RF MODEL 

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

def tuning(X_train, y_train):
    
    parameters = {
    "criterion" : ['gini', 'entropy'],
    "max_depth" : [4, 5, 6, 7, 8, 9, 10, 11, 12],
    "max_features" : ['auto', 'log2', 'sqrt'],
    "min_samples_split" : [2, 4, 6, 8, 10]}

    random_estimator = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=1),
                                          param_distributions = parameters,
                                          n_iter=10, 
                                          scoring='f1', 
                                          n_jobs=-1, 
                                          cv=5, 
                                          verbose=3, 
                                          random_state=420)
    
    random_estimator.fit(X_train, y_train)

    print ('Best Estimator: ', random_estimator.best_estimator_, ' \n')
    
    chosen_model = random_estimator.best_estimator_
    
    return chosen_model

In [10]:
rf2 = tuning(X_train, y_train)
predictions = rf2.predict(X_test)
predictions

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Estimator:  RandomForestClassifier(criterion='entropy', max_depth=7, max_features='log2',
                       min_samples_split=4, random_state=1)  



array([0, 0, 0, ..., 1, 0, 1], dtype=int64)