In [19]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score
## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'churn-bigml-80.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
telecom_train = pd.read_csv(file_content_stream)
telecom_train.head()

## disabling the 'FutureWarning' warning message
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [20]:
## define csv file to read in the bucket
file_key= 'churn-bigml-20.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
telecom_test = pd.read_csv(file_content_stream)
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


## feature engineering

In [21]:
## changing the variable 'churn' from categorical to numerical variable (True = 1, False = 0) in both data frames
telecom_train['Churn'] = np.where(telecom_train['Churn'] == False, 0, 1)
telecom_test['Churn'] = np.where(telecom_test['Churn'] == False, 0, 1)

In [22]:
## changing the variable 'International_plan' from categorical to numerical variable (Yes = 1, False = 0) in both data frames
telecom_train['International_plan'] = np.where(telecom_train['International_plan'] == 'No', 0, 1)
telecom_test['International_plan'] = np.where(telecom_test['International_plan'] == 'No', 0, 1)

In [23]:
## changing the variable 'International_plan' from categorical to numerical variable (Yes = 1, False = 0) in both data frames
telecom_train['Voice_mail_plan'] = np.where(telecom_train['Voice_mail_plan'] == 'No', 0, 1)
telecom_test['Voice_mail_plan'] = np.where(telecom_test['Voice_mail_plan'] == 'No', 0, 1)

In [24]:
## creating a new variable called 'total_charge' in train data frame
telecom_train = telecom_train.assign(total_charge = telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge'] + telecom_train['Total_intl_charge'])
telecom_train.head()                                                                                                                                                                                                                                                                                                                                                          

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,total_charge
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0,75.56
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,59.24
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0,52.09


In [25]:
## creating a new variable called 'total_charge' in test data frame
telecom_test = telecom_test.assign(total_charge = telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge'] + telecom_test['Total_intl_charge'])
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,total_charge
0,LA,117,408,0,0,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,0,73.32
1,IN,65,415,0,0,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,1,54.2
2,NY,161,415,0,0,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,1,92.29
3,SC,111,415,0,0,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,0,41.05
4,HI,49,510,0,0,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,0,49.6


In [26]:
## dropping all columns besides Account length, International plan, Voice mail plan, total charge, Customer service calls, and Churn for both data sets
telecom_train= telecom_train.drop(columns= ['Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls','Total_eve_charge', 'Total_night_minutes', 'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge'], axis=1)
telecom_test= telecom_test.drop(columns= ['Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls','Total_eve_charge', 'Total_night_minutes', 'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge'], axis=1)

In [27]:
telecom_train.head()

Unnamed: 0,State,Account_length,International_plan,Voice_mail_plan,Customer_service_calls,Churn,total_charge
0,KS,128,0,1,1,0,75.56
1,OH,107,0,1,1,0,59.24
2,NJ,137,0,0,0,0,62.29
3,OH,84,1,0,2,0,66.8
4,OK,75,1,0,3,0,52.09


In [28]:
telecom_test.head()

Unnamed: 0,State,Account_length,International_plan,Voice_mail_plan,Customer_service_calls,Churn,total_charge
0,LA,117,0,0,1,0,73.32
1,IN,65,0,0,4,1,54.2
2,NY,161,0,0,4,1,92.29
3,SC,111,0,0,2,0,41.05
4,HI,49,0,0,1,0,49.6


In [29]:
## defining input and target variables
X_train = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
Y_train = telecom_train['Churn']

X_test = telecom_test[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
Y_test = telecom_test['Churn']

In [13]:
coeffs = list()

for i in range(0,10):
    
    # print(i)

    ## splitting data into training and testing datasets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## transforming the data
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    #X_test = scaler.fit_transform(X_test)
    
    ## estimating lambda for lasso by CV with 5 folds
    lasso_cv = LassoCV(normalize = True, cv = 5).fit(X_train, Y_train)

    ## extracting the best lambda value via cross validation
    cv_lambda = lasso_cv.alpha_
    
    ## building the lasso model and capturing coefficients
    lasso_md = Lasso(alpha = cv_lambda, normalize = True).fit(X_train, Y_train)
    coeffs.append(lasso_md.coef_)

## creating a dataframe from array/list to store results
df_coeffs = pd.DataFrame(coeffs, columns = [['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']])
df_coeffs

Unnamed: 0,Account_length,International_plan,Voice_mail_plan,total_charge,Customer_service_calls
0,0.056692,0.331653,-0.063289,0.503184,0.536093
1,0.0356,0.329218,-0.066753,0.451514,0.551871
2,0.033268,0.316489,-0.068895,0.466403,0.506989
3,0.038658,0.316802,-0.078294,0.517484,0.54876
4,0.014578,0.267257,-0.069603,0.471122,0.483291
5,0.010094,0.331295,-0.063054,0.470778,0.531827
6,0.025931,0.311923,-0.077351,0.483796,0.520738
7,0.053131,0.285455,-0.072641,0.526517,0.57533
8,0.0,0.31384,-0.055802,0.446264,0.542816
9,0.038026,0.296193,-0.08412,0.504687,0.457186


In [33]:
## counting all 0s in each column
count_0 = (df_coeffs ==0).sum()
count_0

print(count_0)

print('I need to drop the Account_length variable because it has over 200 coefficients = 0')

Account_length            1
International_plan        0
Voice_mail_plan           0
total_charge              0
Customer_service_calls    0
dtype: int64
I need to drop the Account_length variable because it has over 200 coefficients = 0


In [30]:
## removing features with coefficients = 0 more than 200 times
X_train = X_train.drop(columns = ['Account_length'], axis = 1)
X_test = X_test.drop(columns = ['Account_length'], axis = 1)

In [31]:
telecom_train.head()

Unnamed: 0,State,Account_length,International_plan,Voice_mail_plan,Customer_service_calls,Churn,total_charge
0,KS,128,0,1,1,0,75.56
1,OH,107,0,1,1,0,59.24
2,NJ,137,0,0,0,0,62.29
3,OH,84,1,0,2,0,66.8
4,OK,75,1,0,3,0,52.09


In [32]:
telecom_test.head()

Unnamed: 0,State,Account_length,International_plan,Voice_mail_plan,Customer_service_calls,Churn,total_charge
0,LA,117,0,0,1,0,73.32
1,IN,65,0,0,4,1,54.2
2,NY,161,0,0,4,1,92.29
3,SC,111,0,0,2,0,41.05
4,HI,49,0,0,1,0,49.6


In [17]:
## redefining the input and target variables
#X = telecom_train[['International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
#Y = telecom_train['Churn']

## creating list to store 100 iterations of recall results for each model
from sklearn.preprocessing import MinMaxScaler
md1_results = list()
md2_results = list()
md3_results = list()
md4_results = list()

## creating a loop

for i in range (0,100):
    
    #print(i)

    kf = StratifiedKFold(n_splits = 5, shuffle = True)
    
    ## defining a list to store the fold results
    md1_fold_results = []
    md2_fold_results = []
    md3_fold_results = []
    md4_fold_results = []

    for train_ix, val_ix in kf.split(X, Y):
        
        ## transforming the data for both models
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)
        X_val = scaler.fit_transform(X_val)

        ## splitting the data into train and validation
        X_train, X_val = X.iloc[train_ix], X.iloc[val_ix]
        Y_train, Y_val = Y.iloc[train_ix], Y.iloc[val_ix]

        ## Building the logistic regression models
        md1 = LogisticRegression(solver = 'liblinear', penalty = 'l1').fit(X_train, Y_train)
        md2 = LogisticRegression(solver = 'liblinear', penalty = 'l2').fit(X_train, Y_train)
        md3 = LogisticRegression(solver = 'saga', penalty = 'l1').fit(X_train, Y_train)
        md4 = LogisticRegression(solver = 'saga', penalty = 'l2').fit(X_train, Y_train)

        ## Predicting on the validation set
        pred1 = md1.predict_proba(X_val)[:,1]
        pred2 = md2.predict_proba(X_val)[:,1]
        pred3 = md3.predict_proba(X_val)[:,1]
        pred4 = md4.predict_proba(X_val)[:,1]

        ## setting 10% cutoff and creating labels for transformation
        pred1_labels = np.where(pred1 < 0.1, 0, 1)
        pred2_labels = np.where(pred2 < 0.1, 0, 1)
        pred3_labels = np.where(pred3 < 0.1, 0, 1)
        pred4_labels = np.where(pred4 < 0.1, 0, 1)

        ## calculating recall score for both models
        md1_recall = recall_score(Y_val, pred1_labels)
        md2_recall = recall_score(Y_val, pred2_labels)
        md3_recall = recall_score(Y_val, pred3_labels)
        md4_recall = recall_score(Y_val, pred4_labels)

        ## storing recall score in a list
        md1_fold_results.append(md1_recall)
        md2_fold_results.append(md2_recall)
        md3_fold_results.append(md3_recall)
        md4_fold_results.append(md4_recall)

        ## printing each recall score for each fold at each iteration
        print(md1_fold_results)
        print(md2_fold_results)
        print(md3_fold_results)
        print(md4_fold_results)
        
    ## computing the average recall score and storing it in 'mdn_results'     
    avg_1 = np.mean(md1_fold_results)
    md1_results.append(avg_1)
    
    avg_2 = np.mean(md2_fold_results)
    md2_results.append(avg_2)
    
    avg_3 = np.mean(md3_fold_results)
    md3_results.append(avg_3)
    
    avg_4 = np.mean(md4_fold_results)
    md4_results.append(avg_4)

KeyError: 'Churn'

In [None]:
## reporting the average recall for each model
print('The average recall for model 1 is', avg_1)
print('The average recall for model 2 is', avg_2)
print('The average recall for model 3 is', avg_3)
print('The average recall for model 4 is', avg_4)

In [34]:
## visualizing the recall scores for each model at each iteration
iteration = [i for i in range(0,100)]
plt.plot(iteration, md1_results, color = 'red')
plt.plot(iteration, md2_results, color = 'purple')
plt.plot(iteration, md3_results, color = 'lightblue')
plt.plot(iteration, md4_results, color = 'orange')
plt.xlabel('Iteration')
plt.ylabel('Recall Score')
plt.legend(loc = 'upper right', labels = ['model 1', 'model 2', 'model 3', 'model 4'])
plt.grid()
plt.show();

NameError: name 'md1_results' is not defined