In [13]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from itertools import product

from sklearn.metrics import recall_score, accuracy_score
## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'churn-bigml-80.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
telecom_train = pd.read_csv(file_content_stream)
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [14]:
## retreiving the testing dataset
## define csv file to read in the bucket
file_key= 'churn-bigml-20.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
telecom_test = pd.read_csv(file_content_stream)
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [15]:
## creating Churn_numb variable where true = 1, false = 0
telecom_train['Churn_numb'] = np.where(telecom_train['Churn'] == False, 0, 1)
telecom_test['Churn_numb'] = np.where(telecom_test['Churn'] == False, 0, 1)

In [16]:
## changing 'International_plan' from categorical to numerical (Yes = 1; No = 0) in both datasets
telecom_train['International_plan'] = np.where(telecom_train['International_plan'] == 'No', 0, 1)
telecom_test['International_plan'] = np.where(telecom_test['International_plan'] == 'No', 0, 1)

In [17]:
## changing 'Voice_mail_plan' from categorical to numerical (Yes = 1; No = 0) in both datasets
telecom_train['Voice_mail_plan'] = np.where(telecom_train['Voice_mail_plan'] == 'No', 0, 1)
telecom_test['Voice_mail_plan'] = np.where(telecom_test['Voice_mail_plan'] == 'No', 0, 1)

In [18]:
## creating 'total_charge' variable in both data frames
telecom_train = telecom_train.assign(total_charge = telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge'] + telecom_train['Total_intl_charge'])
telecom_test = telecom_test.assign(total_charge = telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge'] + telecom_test['Total_intl_charge'])

telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,Churn_numb,total_charge
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False,0,75.56
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False,0,59.24
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False,0,52.09


In [19]:
## keeping the following variables ONLY in both datasets: Account_length, International_plan, Voice_mail_plan, total_charge, 
## Customer_service_calls, and Churn
telecom_train = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 
                              'Churn_numb']]
telecom_test = telecom_test[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 
                              'Churn_numb']]

In [20]:
## definng input and target variables in training dataset
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
Y = telecom_train['Churn_numb']

In [21]:
## feature importance (considering telecom_train datasets); 1000 times
rf_results = list()
ada_results = list()
gb_results = list()

for i in range (0,10):
    
    print(i)
    
    ## splitting the telecom_training dataset into 80% training and 20% testing, stratify = Y
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## creating models
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)
    ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

    ## extracting the importances
    rf_importances = rf_md.feature_importances_
    ada_importances = ada_md.feature_importances_
    gb_importances = gb_md.feature_importances_
    
    ## appending results to a list
    rf_results.append(rf_importances)
    ada_results.append(ada_importances)
    gb_results.append(gb_importances)

0
1
2
3
4
5
6
7
8
9


In [29]:
rf_importances_df = pd.DataFrame(rf_results)
rf_importances_df.columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
rf_importances_df.apply(np.mean, axis = 0)

Account_length            0.019810
International_plan        0.140388
Voice_mail_plan           0.073318
total_charge              0.555976
Customer_service_calls    0.210509
dtype: float64

In [34]:
ada_importances_df = pd.DataFrame(ada_results)
ada_importances_df.columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
ada_importances_df.apply(np.mean, axis = 0)

Account_length            0.301151
International_plan        0.247949
Voice_mail_plan           0.029955
total_charge              0.338529
Customer_service_calls    0.082416
dtype: float64

In [36]:
gb_importances_df = pd.DataFrame(gb_results)
gb_importances_df.columns = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']
gb_importances_df.apply(np.mean, axis = 0)

Account_length            0.015355
International_plan        0.114536
Voice_mail_plan           0.129902
total_charge              0.574196
Customer_service_calls    0.166010
dtype: float64

In [56]:
averages = pd.DataFrame(columns = ['Account_length'])
averages['Account_length'] =  rf_importances_df['Account_length'].mean()

averages

Unnamed: 0,Account_length


In [52]:
((rf_importances_df['Account_length'].mean()) + (ada_importances_df['Account_length'].mean()) + (ada_importances_df['Account_length'].mean())).mean()

AttributeError: 'float' object has no attribute 'mean'

In [45]:
gb_importances_df['Account_length'].mean()

0.015355189730910745

In [46]:
ada_importances_df['Account_length'].mean()

0.3011506794856379

In [48]:
rf_importances_df['Account_length'].mean()

0.019809622002226527

In [49]:
0.015355189730910745 + 0.3011506794856379 + 0.019809622002226527

0.3363154912187752

In [50]:
0.3363154912187752/3

0.11210516373959173