# LAB 1 Random Forest and Metrics of the UX in MVT Test

### There are 4 parts to this laboratory exercise
####  A. Decision Tree
####  B. Random Forest
####  C. Example of Metrics of a business goal (Clustering Procedure)
####  D. Application of Random Forest


## A. Decision Tree
The purpose of this laboratory is to show how a Decision Tree makes its splits and how a Random Forest performs classification. Two plots will be generated using the Iris data set.

I - An example of Decision Tree - With a lot of splits

II - Graphical Presentation

This code was rewritten in Python 3.7

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
np.random.seed(12345)

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
from subprocess import check_call

from IPython.display import Image
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Keep only the last two feature columns so the tree can later be drawn in 2-D.
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target
print("Class = ", iris.target_names, "Features = ", iris.feature_names)

# A depth-2 tree keeps the exported diagram small enough to read.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

# Export as dot file
export_graphviz(tree_clf, out_file='tree_clf.dot',
                feature_names = iris.feature_names[2:],
                class_names = iris.target_names,
                rounded = True, filled = True)

# Convert to png using system command (requires Graphviz).
# check_call raises CalledProcessError when `dot` exits with a non-zero status,
# so a failed render is reported here instead of a stale or missing
# tree_clf.png being displayed below.
check_call(['dot', '-Tpng', 'tree_clf.dot', '-o', 'tree_clf.png', '-Gdpi=600'])

# Accuracy measured on the same samples the tree was fitted on.
scores = tree_clf.score(X, y)
print("Scores = ",scores)

# Display in jupyter notebook
Image(filename = 'tree_clf.png')



#### Estimating Class Membership

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[0, 7.5, 0, 3], iris=True, legend=False, plot_training=True):
    """Shade the regions predicted by ``clf`` on a 100x100 grid and overlay the samples.

    Parameters
    ----------
    clf : fitted estimator whose ``predict`` accepts two-column input.
    X, y : samples (columns 0 and 1 are plotted) and their class labels 0/1/2.
    axes : ``[x_min, x_max, y_min, y_max]`` bounds of the evaluation grid.
    iris : if true, label the axes as petal length/width; otherwise label them
        x_1/x_2 and also draw contour lines between the regions.
    legend : if true, add a legend in the lower-right corner.
    plot_training : if true, draw the samples and clamp the view to ``axes``.
    """
    grid_x, grid_y = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    grid_pred = clf.predict(grid_points).reshape(grid_x.shape)

    # Filled background: one pastel colour per predicted class.
    plt.contourf(grid_x, grid_y, grid_pred, alpha=0.3,
                 cmap=ListedColormap(['#fafab0','#9898ff','#a0faa0']))
    if not iris:
        plt.contour(grid_x, grid_y, grid_pred,
                    cmap=ListedColormap(['#7d7d58','#4c4c7f','#507d50']), alpha=0.8)

    if plot_training:
        class_styles = (
            (0, "yo", "Iris-Setosa"),
            (1, "bs", "Iris-Versicolor"),
            (2, "g^", "Iris-Virginica"),
        )
        for class_id, fmt, class_label in class_styles:
            members = y == class_id
            plt.plot(X[:, 0][members], X[:, 1][members], fmt, label=class_label)
        plt.axis(axes)

    if iris:
        plt.xlabel("Petal length", fontsize=14)
        plt.ylabel("Petal width", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

    if legend:
        plt.legend(loc="lower right", fontsize=14)

plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)

# Hand-placed split lines: (x coordinates, y coordinates, line style).
split_lines = (
    ([2.45, 2.45], [0, 3], "k-"),
    ([2.45, 7.5], [1.75, 1.75], "k--"),
    ([4.95, 4.95], [0, 1.75], "k:"),
    ([4.85, 4.85], [1.75, 3], "k:"),
)
for xs, ys, line_style in split_lines:
    plt.plot(xs, ys, line_style, linewidth=2)

# Depth captions: (x, y, text, font size).
depth_captions = (
    (1.40, 1.0, "Depth=0", 15),
    (3.2, 1.80, "Depth=1", 13),
    (4.05, 0.5, "(Depth=2)", 11),
)
for x_pos, y_pos, caption, font_size in depth_captions:
    plt.text(x_pos, y_pos, caption, fontsize=font_size)

plt.show()

In [None]:
# Predicted class and per-class probabilities for one sample
# (two feature values, in the same column order the tree was fitted on).
sample = [[5,1.5]]
print("class =", tree_clf.predict(sample), "Probability =", tree_clf.predict_proba(sample))

## B. Random Forest
The purpose of this laboratory is to show the idea behind Random Forest and how a Random Forest performs classification,
using the moons data set.

I - Graphical comparison between the Decision Tree, Decision Tree with Bagging, and Random Forest algorithms

II - Calculate the feature importances of the Random Forest algorithm

This code was rewritten in Python 3.7

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons 
# make_moons generates two interleaving half circles: a simple toy dataset
# for visualizing clustering and classification algorithms.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
# Hold out a test split; no test_size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Both ensembles use the same number of trees, so their scores are comparable
# and the printed label always matches what was actually trained.
n_trees = 500

# Decision Tree with Bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
           n_estimators=n_trees, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

print(f"Bagging {n_trees} Decision Tree Algorithm Score =", accuracy_score(y_test, y_pred))

# Decision Tree only (unconstrained, fitted on the same training split)
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print("Pure Decision Tree Algorithm Score =", accuracy_score(y_test, y_pred_tree))

# Random Forest
rnd_clf = RandomForestClassifier(n_estimators=n_trees, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
print("Random Forest Algorithm Score =", accuracy_score(y_test, y_pred_rf))

# Regenerate the data and split with the same arguments and seeds as before,
# so the plotting code that follows starts from a known state.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    """Shade the regions predicted by ``clf`` on a 100x100 grid and overlay two classes.

    Parameters
    ----------
    clf : fitted estimator whose ``predict`` accepts two-column input.
    X, y : samples (columns 0 and 1 are plotted) and their class labels 0/1.
    axes : ``[x_min, x_max, y_min, y_max]`` bounds of the grid and of the view.
    alpha : opacity of the sample markers.
    contour : if true, also draw contour lines between the predicted regions.
    """
    grid_x, grid_y = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    grid_pred = clf.predict(grid_points).reshape(grid_x.shape)

    # Filled background first, optional region outlines on top of it.
    plt.contourf(grid_x, grid_y, grid_pred, alpha=0.3,
                 cmap=ListedColormap(['#fafab0','#9898ff','#a0faa0']))
    if contour:
        plt.contour(grid_x, grid_y, grid_pred,
                    cmap=ListedColormap(['#7d7d58','#4c4c7f','#507d50']), alpha=0.8)

    for class_id, fmt in ((0, "yo"), (1, "bs")):
        members = y == class_id
        plt.plot(X[:, 0][members], X[:, 1][members], fmt, alpha=alpha)

    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    
# Side-by-side comparison: single tree vs. bagged trees.
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.show()

# Overlay 15 trees, each fitted on its own bootstrap sample, to illustrate how
# averaging many trees smooths the boundary.
plt.figure(figsize=(5, 4))

n_train = len(X_train)
for i in range(15):
    # A separate name is used so the single `tree_clf` plotted above is not
    # overwritten and this cell stays re-runnable.
    bootstrap_tree = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)
    indices_with_replacement = np.random.randint(0, n_train, n_train)
    # The indices are drawn for the training split, so they must index
    # X_train / y_train; indexing the full X / y would pull in held-out rows.
    bootstrap_tree.fit(X_train[indices_with_replacement], y_train[indices_with_replacement])
    plot_decision_boundary(bootstrap_tree, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)
plt.title("Random Forest", fontsize=14)
plt.show()

# Compare the scores amongst different methods
# np.sum(y_pred == y_pred_rf) / len(y_pred)  
# almost identical predictions


### Deduce the Feature Importance

In [None]:
# Fit a 500-tree forest on all iris features and list each feature's importance.
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])

importances = rnd_clf.feature_importances_
for position, name in enumerate(iris["feature_names"]):
    print(name, importances[position])

# Last expression of the cell: show the raw importance array.
rnd_clf.feature_importances_


### Metrics of A Business 

In [None]:
# from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
#import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

__Churn Prediction__
Retention Rate is one of the most critical metrics. Retention Rate is an indication of how good your product market fit (PMF) is.
If your PMF is not satisfactory, you should see your customers churning very soon. One of the powerful tools to improve
Retention Rate (hence the PMF) is Churn Prediction.

We will use a Telco dataset and go over the following steps to develop a Churn Prediction model:
- Exploratory data analysis
- Feature engineering
- Investigating how the features affect Retention by using Logistic Regression

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Load the Telco churn CSV; the path is relative to the notebook's working directory.
df_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Preview the first 10 rows.
df_data.head(10)

In [None]:
# Summarize the loaded frame (columns, dtypes, non-null counts).
df_data.info()

### Exploratory Data Analysis

In [None]:
# Encode the target as a real integer column (No -> 0, Yes -> 1).
# Writing 0/1 into the existing text column in place would leave it with object
# dtype, which numeric aggregations such as groupby(...).mean() may reject.
# Mapping 0 and 1 to themselves keeps this cell safe to re-run; any other value
# becomes NaN and makes astype(int) fail loudly instead of passing silently.
df_data['Churn'] = df_data['Churn'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1}).astype(int)
pd.crosstab(df_data['Churn'], columns='count')

In [None]:
fig = plt.figure(figsize=(12,6))
# TotalCharges can hold non-numeric text at this point (the notebook coerces it
# with to_numeric further down), so plot a numeric copy: unparseable entries
# become NaN and are simply not drawn. df_data itself is left untouched.
df_scatter = df_data.assign(TotalCharges=pd.to_numeric(df_data['TotalCharges'], errors='coerce'))
# x_bins / y_bins are not scatterplot options in current seaborn, so they are
# not passed.
sns.scatterplot(x='TotalCharges', y='MonthlyCharges', data=df_scatter, hue="Churn")
plt.show()

In [None]:
# Summary statistics of the tenure column.
df_data.tenure.describe()

In [None]:
# One panel per feature: mean Churn for every distinct value of that feature.
fig = plt.figure(figsize=(12,6))
labels2 = ['tenure','MonthlyCharges']
for position, feature in enumerate(labels2, start=1):
    ax = fig.add_subplot(1, 2, position)
    df_plot = df_data.groupby(feature).Churn.mean().reset_index()
    sns.scatterplot(x=feature, y='Churn', data=df_plot)
plt.show()

In [None]:
fig = plt.figure(figsize=(12,6))
# Monthly charge against tenure, coloured by churn.
# x_bins / y_bins are not scatterplot options in current seaborn, so they are
# not passed.
sns.scatterplot(x='MonthlyCharges', y='tenure', data=df_data, hue="Churn")
plt.show()

In [None]:
# Mean Churn per gender value, as a two-column frame.
df_plot = df_data.groupby('gender').Churn.mean().reset_index()
df_plot

In [None]:
# Bar height is the mean Churn per gender value.
sns.barplot(x='gender',y='Churn',data=df_data)
plt.title("Gender Vs Churn Rate")
# NOTE(review): the tick labels are overridden by position, which assumes the
# first bar is Female and the second is Male - confirm against the bar order
# seaborn actually produces for this data.
plt.xticks((0,1),("Female","Male"))
plt.show()

In [None]:
# Mean Churn per InternetService value; reused by the bar plot in the next cell.
df_plot = df_data.groupby('InternetService').Churn.mean().reset_index()
df_plot

In [None]:
# Plots the pre-aggregated df_plot (one row per InternetService value).
plt.figure(figsize=(10,5))
sns.barplot(x='InternetService',y='Churn',data=df_plot)
plt.title("InternetService Vs Churn Rate")
plt.show()

In [None]:
# Plots directly from df_data; seaborn aggregates Churn per TechSupport value.
plt.figure(figsize=(10,5))
sns.barplot(x='TechSupport',y='Churn',data=df_data)
plt.title("TechSupport Vs Churn Rate")
plt.show()

In [None]:
# Plots directly from df_data; seaborn aggregates Churn per PaymentMethod value.
plt.figure(figsize=(10,5))
sns.barplot(x='PaymentMethod',y='Churn',data=df_data)
plt.title("PaymentMethod Vs Churn Rate")
plt.show()

In [None]:
# 3x2 grid of bar plots: Churn against each categorical feature in labels2.
fig = plt.figure(figsize=(10,12))
labels2 = ['PhoneService','PaperlessBilling','StreamingMovies','Contract','DeviceProtection','MultipleLines']
for position, feature in enumerate(labels2, start=1):
    ax = fig.add_subplot(3, 2, position)
    plt.tight_layout()
    sns.barplot(x=feature, y='Churn', data=df_data)
plt.show()

### Clustering

A way of grouping "subjects" based on the data itself. It is often described as an unsupervised learning method.

In [None]:
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    """Renumber cluster labels so they follow the ordering of a target column's mean.

    The mean of ``target_field_name`` is computed per value of
    ``cluster_field_name``; the clusters are sorted by that mean (direction
    given by ``ascending``) and each one is replaced by its 0-based position in
    that ordering.

    Returns a new frame produced by a merge; ``df`` itself is not modified. The
    renumbered cluster column ends up as the last column of the result.
    """
    ranking = (
        df.groupby(cluster_field_name)[target_field_name]
        .mean()
        .sort_values(ascending=ascending)
        .reset_index()
    )
    # After reset_index the row position is the cluster's rank.
    ranking['index'] = ranking.index
    lookup = ranking[[cluster_field_name, 'index']]

    merged = pd.merge(df, lookup, on=cluster_field_name)
    return (
        merged
        .drop(columns=[cluster_field_name])
        .rename(columns={"index": cluster_field_name})
    )

We start by looking into how many clusters there are in terms of the "tenure" and "MonthlyCharges" features.

In [None]:
# Elbow plot: within-cluster sum of squares (inertia) for k = 1..9.
sse={}
pd.options.mode.chained_assignment = None
# Fit on a copy of the feature columns only. The labels are deliberately NOT
# written back into this frame: doing so would make the labels from one value
# of k an extra input feature for the next value of k.
df_cluster = df_data[['tenure', 'MonthlyCharges']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster)
    sse[k] = kmeans.inertia_
plt.figure(figsize=(16,8))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Fit 3 clusters on (tenure, MonthlyCharges) and store each row's cluster id.
# No random_state is passed here.
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_data[['tenure','MonthlyCharges']])
df_data['TenureCluster'] = kmeans.predict(df_data[['tenure','MonthlyCharges']])

In [None]:
# Renumber the cluster ids so that 0, 1, 2 follow ascending mean tenure.
df_data = order_cluster('TenureCluster', 'tenure',df_data,True)

In [None]:
# Summary statistics of tenure within each cluster.
df_data.groupby('TenureCluster').tenure.describe()

In [None]:
# Cluster and plot the tenure / MonthlyCharges columns themselves. A dedicated
# feature array and estimator are used so that the notebook-level `X` (which
# holds a different dataset at this point) is not plotted by mistake and the
# shared `kmeans` object is not refitted.
cluster_features = df_data[['tenure', 'MonthlyCharges']].to_numpy()
scatter_kmeans = KMeans(n_clusters=3)
y_kmeans = scatter_kmeans.fit_predict(cluster_features)

# One scatter layer per fitted cluster; with 3 clusters only 3 layers exist.
cluster_colors = ('red', 'blue', 'green')
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(cluster_features[in_cluster, 0], cluster_features[in_cluster, 1],
                s = 100, c = color, label = 'Cluster %d' % (cluster_id + 1))
centers = scatter_kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of Tenure')
plt.xlabel('Tenure Score')
plt.ylabel('MOnthly Charge Score')
plt.legend()
plt.show()

In [None]:
# Swap the ordered cluster ids 0/1/2 for readable names.
df_data['TenureCluster'] = df_data["TenureCluster"].replace({0:'Low',1:'Mid',2:'High'})

In [None]:
# Mean Churn per tenure cluster, shown as a bar chart.
df_plot = df_data.groupby('TenureCluster').Churn.mean().reset_index()
plt.figure(figsize=(8,4))
sns.barplot(x='TenureCluster',y='Churn',data=df_plot)
plt.show()

In [None]:
# Mean Churn per whole-number MonthlyCharges value (astype(int) truncates the
# charge), computed on a copy so df_data keeps the original values.
# NOTE(review): df_plot is reassigned later without this result being read in
# the visible part of the notebook - confirm whether a plot was intended here.
df_plot = df_data.copy()
df_plot['MonthlyCharges'] = df_plot['MonthlyCharges'].astype(int)
df_plot = df_plot.groupby('MonthlyCharges').Churn.mean().reset_index()

In [None]:
# Elbow plot: within-cluster sum of squares (inertia) for k = 1..9.
sse={}
# Fit on a copy of the feature column only. The labels are deliberately NOT
# written back into this frame: doing so would make the labels from one value
# of k an extra input feature for the next value of k.
df_cluster = df_data[['MonthlyCharges']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster)
    sse[k] = kmeans.inertia_
plt.figure(figsize=(16,8))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Fit 3 clusters on MonthlyCharges alone and store each row's cluster id.
# No random_state is passed here.
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_data[['MonthlyCharges']])
df_data['MonthlyChargeCluster'] = kmeans.predict(df_data[['MonthlyCharges']])

In [None]:
# Renumber the cluster ids so that 0, 1, 2 follow ascending mean MonthlyCharges.
df_data = order_cluster('MonthlyChargeCluster', 'MonthlyCharges',df_data,True)

In [None]:
# Summary statistics of MonthlyCharges within each cluster.
df_data.groupby('MonthlyChargeCluster').MonthlyCharges.describe()

In [None]:
# Swap the ordered cluster ids 0/1/2 for readable names.
df_data['MonthlyChargeCluster'] = df_data["MonthlyChargeCluster"].replace({0:'Low',1:'Mid',2:'High'})

In [None]:
# Churn per monthly-charge cluster, aggregated by seaborn from the raw rows.
plt.figure(figsize=(8,4))
sns.barplot(x='MonthlyChargeCluster',y='Churn',data=df_data)
plt.show()

In [None]:
# Number of rows whose TotalCharges cannot be parsed as a number.
len(df_data[pd.to_numeric(df_data['TotalCharges'], errors='coerce').isnull()])

In [None]:
# Mark those unparseable TotalCharges entries as missing.
df_data.loc[pd.to_numeric(df_data['TotalCharges'], errors='coerce').isnull(),'TotalCharges'] = np.nan

In [None]:
# Drop every row that has a missing value in any column.
df_data = df_data.dropna()

In [None]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value that
# still cannot be parsed into NaN.
df_data['TotalCharges'] = pd.to_numeric(df_data['TotalCharges'], errors='coerce')

In [None]:
# Copy of the data with TotalCharges truncated to whole numbers.
# NOTE(review): df_plot is not read again in the visible part of the notebook -
# confirm whether a plot was intended here.
df_plot = df_data.copy()
df_plot['TotalCharges'] = df_plot['TotalCharges'].astype(int)

In [None]:
# Elbow plot: within-cluster sum of squares (inertia) for k = 1..9.
sse={}
# Fit on a copy of the feature column only. The labels are deliberately NOT
# written back into this frame: doing so would make the labels from one value
# of k an extra input feature for the next value of k.
df_cluster = df_data[['TotalCharges']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_cluster)
    sse[k] = kmeans.inertia_
plt.figure(figsize=(16,8))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()

In [None]:
# Fit 3 clusters on TotalCharges alone and store each row's cluster id.
# No random_state is passed here.
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_data[['TotalCharges']])
df_data['TotalChargeCluster'] = kmeans.predict(df_data[['TotalCharges']])

In [None]:
# Renumber the cluster ids so that 0, 1, 2 follow ascending mean TotalCharges.
df_data = order_cluster('TotalChargeCluster', 'TotalCharges',df_data,True)

In [None]:
# Summary statistics of TotalCharges within each cluster.
df_data.groupby('TotalChargeCluster').TotalCharges.describe()

In [None]:
# Swap the ordered cluster ids 0/1/2 for readable names.
df_data['TotalChargeCluster'] = df_data["TotalChargeCluster"].replace({0:'Low',1:'Mid',2:'High'})

In [None]:
# Churn per total-charge cluster, aggregated by seaborn from the raw rows.
plt.figure(figsize=(8,4))
sns.barplot(x='TotalChargeCluster',y='Churn',data=df_data)
plt.show()

In [None]:
# Re-check columns and dtypes now that rows were dropped and cluster columns added.
df_data.info()

In [None]:
# Encode the text columns: two-valued ones become 0/1 via LabelEncoder, the
# rest are one-hot encoded. customerID is an identifier and is left alone.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dummy_columns = [] #array for multiple value columns

for column in df_data.columns:
    if column == 'customerID' or df_data[column].dtype != object:
        continue
    if df_data[column].nunique() != 2:
        # more than two categories (or a single one): one-hot encode below
        dummy_columns.append(column)
        continue
    # exactly two categories: a single 0/1 column is enough
    df_data[column] = le.fit_transform(df_data[column])

#apply get dummies for selected columns
df_data = pd.get_dummies(data = df_data,columns = dummy_columns)

In [None]:
# Spot-check a few encoded columns, including the TenureCluster_* dummy columns.
df_data[['gender','Partner','TenureCluster_High','TenureCluster_Low','TenureCluster_Mid']].head()

In [None]:
# Replace spaces, parentheses and hyphens in column names with underscores so
# that every name can be used as a term in a model formula.
to_underscore = str.maketrans({" ": "_", "(": "_", ")": "_", "-": "_"})
all_columns = [column.translate(to_underscore) for column in df_data.columns]

df_data.columns = all_columns

In [None]:
# Right-hand side of the model formula: 'gender' first, then every other
# column except the target and the customer identifier, joined with ' + '.
excluded = ['Churn','customerID','gender']
glm_columns = ' + '.join(
    ['gender'] + [column for column in df_data.columns if column not in excluded]
)

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
 

# Fit a GLM with a Binomial family, using Churn as the response and the
# terms collected in glm_columns as predictors.
glm_model = smf.glm(formula='Churn ~ {}'.format(glm_columns), data=df_data, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())

In [None]:
# Exponentiate the fitted coefficients.
np.exp(res.params)

In [None]:
# Fit a 500-tree forest on every column except the identifier and the target,
# then list how much each feature contributes.
df = df_data.drop(columns=['customerID','Churn'])
X = df
y = df_data['Churn']

model = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
model.fit(X, y)

importances = model.feature_importances_
for position, name in enumerate(df.columns):
    print(" ", name, " = ", importances[position])

# Last expression of the cell: show the raw importance array.
model.feature_importances_
