In [None]:
# Imports and housekeeping
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
# Load the raw churn dataset from the working directory.
# NOTE(review): 'locationid' is not referenced anywhere else in this notebook —
# confirm the CSV really has that column, otherwise this dtype hint does nothing useful.
df = pd.read_csv(
    'churn_clean.csv',
    dtype={'locationid': np.int64},
)

In [None]:
# Display dataset info for the raw frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Build the working frame by discarding every column judged irrelevant to the
# research question; `df` itself is left untouched.
df_data = df.drop(
    columns=[
        'CaseOrder', 'Customer_id', 'Interaction', 'UID',
        'City', 'State', 'County', 'Zip', 'Lat', 'Lng', 'Population', 'TimeZone',
        'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
        'Techie', 'Contract', 'Port_modem', 'Tablet', 'InternetService',
        'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'PaperlessBilling', 'PaymentMethod',
        'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
    ],
)

In [None]:
# Preview the first five rows of the reduced dataset
df_data.head(n=5)

In [None]:
# Display the dataset's number of rows and columns as a (rows, columns) tuple
df_data.shape

In [None]:
# Flag, per column, whether it contains any null/missing value
df_data.isnull().any()

In [None]:
# Count duplicated rows, i.e. rows identical in every column to an earlier row
df_data.duplicated().sum()

In [None]:
# Display summary statistics for the numeric columns (describe()'s default selection
# while the frame still contains non-numeric columns)
df_data.describe()

In [None]:
# Summarise the object-typed (categorical) columns: count, unique, top, freq
df_data.describe(include=['object'])

In [None]:
# 'Job' has too many unique values, so remove it from the working frame.
# Re-running this cell on its own raises KeyError because 'Job' is already gone.
df_data = df_data.drop(columns='Job')

In [None]:
# Re-check the categorical (object-typed) summary now that 'Job' has been dropped
df_data.describe(include=['object'])

In [None]:
# One-hot encode the categorical predictors as 0/1 integer columns.
# Every category level is kept here (no drop_first); Churn is handled in its own cell.
df_data = pd.get_dummies(
    df_data,
    columns=['Area', 'Marital', 'Gender'],
    dtype=int,
)

In [None]:
# Encode the target: Churn becomes a single 0/1 integer column.
# drop_first=True discards the first category's indicator, leaving only the
# 'Churn_Yes' column that the modelling cells use as the label.
df_data = pd.get_dummies(
    df_data,
    columns=['Churn'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Display dataset info for the encoded frame: column names, non-null counts and dtypes
df_data.info()

In [None]:
# Export the prepared dataframe to CSV.
# The file is written relative to the working directory — the same place
# 'churn_clean.csv' is read from — instead of an absolute path inside one user's
# home directory, so the notebook also runs on machines where that path is absent.
df_data.to_csv('churn_clean_prepared.csv')

In [None]:
# Separate the label (Churn_Yes) from the predictors, both as NumPy arrays
y = df_data['Churn_Yes'].values
X = df_data.drop(columns='Churn_Yes').values

# Hold out 25% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# Wrap each train/test array in a DataFrame. The four variable names are kept so
# any later cell that refers to them still works.
X_train_data = pd.DataFrame(data=X_train)
X_test_data = pd.DataFrame(data=X_test)
y_train_data = pd.DataFrame(data=y_train)
y_test_data = pd.DataFrame(data=y_test)

# Export every split as CSV. File names are relative to the working directory
# (where 'churn_clean.csv' is read from) rather than an absolute path inside one
# user's home directory, so the cell does not fail on other machines.
split_exports = {
    'X_train.csv': X_train_data,
    'X_test.csv': X_test_data,
    'y_train.csv': y_train_data,
    'y_test.csv': y_test_data,
}
for split_file_name, split_frame in split_exports.items():
    split_frame.to_csv(split_file_name)

In [None]:
# Baseline random forest with hand-picked hyperparameters (seeded for repeatability)
rfc = RandomForestClassifier(
    n_estimators=25,
    max_depth=4,
    max_features=3,
    bootstrap=True,
    random_state=42,
)
# Train on the training split, then label the held-out test split
y_pred = rfc.fit(X_train, y_train).predict(X_test)
# Report test-set accuracy
print('Test set accuracy score of Random Forest: {:.4f}'.format(accuracy_score(y_test, y_pred)))
# Report test-set MSE (this is not an AUC); with 0/1 labels it equals the
# fraction of misclassified rows, i.e. 1 - accuracy
print('Test set MSE of Random Forest: {:.4f}'.format(mean_squared_error(y_test, y_pred)))

In [None]:
# Hyperparameter grid: 3 x 3 x 3 x 2 = 54 candidate forests, all with the same seed
parameters = dict(
    n_estimators=[15, 25, 35],
    max_depth=[3, 5, 7],
    max_features=[3, 5, 7],
    bootstrap=[True, False],
    random_state=[42],
)
# Exhaustive grid search on the training split only. No cv/scoring arguments are
# passed, so scikit-learn's defaults apply. fit() returns the fitted search object.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
).fit(X_train, y_train)
# Show the winning hyperparameter combination
print(CV_rfc.best_params_)

In [None]:
# Instantiate the tuned rfc directly from the grid-search result.
# Building it from CV_rfc.best_params_ (which includes random_state from the grid)
# keeps this model in sync with the search; hand-copied values would silently go
# stale whenever the grid, the data or the library version changes.
rfc = RandomForestClassifier(**CV_rfc.best_params_)
# Fit rfc to the training set
rfc.fit(X_train, y_train)
# Predict the test set labels
y_pred = rfc.predict(X_test)
# Display accuracy score
print('Test set accuracy score of Random Forest: {:.4f}'.format(accuracy_score(y_test, y_pred)))
# Display MSE (this is not an AUC); with 0/1 labels it equals the fraction of
# misclassified rows, i.e. 1 - accuracy
print('Test set MSE of Random Forest: {:.4f}'.format(mean_squared_error(y_test, y_pred)))

In [None]:
# k-nearest-neighbours comparison model using 25 neighbours.
# NOTE(review): the features are passed in unscaled; KNN is distance-based, so
# confirm that is intended.
knn = KNeighborsClassifier(n_neighbors=25)
# Fit the model to the training split
knn.fit(X_train, y_train)
# Hard class labels for the test split (overwrites the random forest's y_pred)
y_pred = knn.predict(X_test)
# Per-class probabilities for the test split (not used further in this cell)
pred_prob = knn.predict_proba(X_test)
# Report test-set accuracy
print('Test set accuracy score of knn: {:.4f}'.format(accuracy_score(y_test, y_pred)))
# Report test-set MSE (this is not an AUC); with 0/1 labels it equals the
# fraction of misclassified rows
print('Test set MSE of knn: {:.4f}'.format(mean_squared_error(y_test, y_pred)))

In [None]:
# Predictor columns only, used to label the importances in training order
X_data = df_data.drop(columns='Churn_Yes')

# Pair each feature name with its importance from the most recently fitted rfc,
# ordered ascending so the most important feature ends up at the top of the chart
importances = pd.Series(rfc.feature_importances_, index=X_data.columns)
importances_sorted = importances.sort_values()

# Horizontal bar chart of the sorted importances
importances_sorted.plot.barh(color='lightgreen')
plt.title('Features Importances')
plt.show()