In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import warnings

# Silence all warnings so cell output stays compact.
# NOTE(review): this also hides pandas/scikit-learn deprecation notices —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG so stochastic steps that draw from it are repeatable.
np.random.seed(1907)

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

print("The data size:", df.shape)

## Convert TotalCharges to numeric; values that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

## Encode Churn as 1 for 'Yes' (the event) and 0 for anything else (the censored data).
df['Churn'] = df['Churn'].eq('Yes').astype('int64')

## Impute missing TotalCharges with the column median.
## Assign the result back to the column: a chained `df.col.fillna(..., inplace=True)`
## operates on an intermediate object and does not update `df` under pandas
## Copy-on-Write, which would silently leave the NaNs in place.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

## customerID is dropped before modelling.
df = df.drop(columns='customerID')

df.head()

In [None]:
# One-hot encode the remaining non-numeric columns (Churn is already 0/1 here,
# so it passes through get_dummies unchanged).
df = pd.get_dummies(df)
print("The data size:", df.shape)

# Separate the target vector from the feature matrix.
labels = df['Churn'].to_numpy()
df = df.drop(columns='Churn')

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# `train_test_split` comes from sklearn.model_selection and must be imported in
# the notebook's import cell, otherwise this line raises NameError on a fresh kernel.
train_x, test_x, train_y, test_y = train_test_split(
    df, labels, test_size=0.2, random_state=42
)


In [None]:
# Train a 100-tree random forest; each split considers at most 7 features.
# fit() returns the estimator itself, so construction and training are chained.
rfc = RandomForestClassifier(max_features=7, n_estimators=100).fit(train_x, train_y)

# Predictions for the held-out rows: hard labels, plus column 1 of
# predict_proba (the class-1 column for 0/1 labels).
y_pred3 = rfc.predict(test_x)
y_pred_prob = rfc.predict_proba(test_x)[:, 1]

# Held-out evaluation: confusion matrix first, then the scalar scores.
print(confusion_matrix(test_y, y_pred3))
for label, scorer in (("Accuracy:", metrics.accuracy_score), ("F1 Score:", f1_score)):
    print(label, scorer(test_y, y_pred3))

# Optional extra diagnostics (left disabled):
# print ("AUC Score:", roc_auc_score(test_y, y_pred3))
# print ("AUC Score prob:", roc_auc_score(test_y, y_pred_prob))
# print ("Precision:", precision_score(test_y, y_pred3))
# print ("Recall:", recall_score(test_y, y_pred3))

# Optional feature-importance table (left disabled):
# feature_importances = pd.DataFrame(rfc.feature_importances_,
#                                   index = train_x.columns,
#                                    columns=['importance']).sort_values('importance', ascending=False)
# print(feature_importances)