In [30]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [31]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

# (a)
# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabrielferreira-data-455-bucket'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "College.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
college = pd.read_csv(file_content_stream)
college.head(1)

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60


In [32]:
college['Private'].value_counts()

Yes    565
No     212
Name: Private, dtype: int64

In [33]:
# (b)
# Changin Private variable from a categorical to a numerical variable
college['Private'] = pd.Series(np.where(college['Private'].values == 'Yes', 1, 0), college.index)
college['Private'].value_counts()

1    565
0    212
Name: Private, dtype: int64

In [34]:
# (c)
# Defining predictors and target variables
X = college[['Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate']]
Y = college['Apps']

# Splitting data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# (d)
# Trasforming the predictors variables in the train and test dataset to 0-1 scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [35]:
# (e)
# Building a multiple linear regression model
lm_md = LinearRegression().fit(X_train, Y_train)

# Prediciting on test dataset
lm_md_preds = lm_md.predict(X_test)

# Computing the mse
lm_md_mse = np.mean(np.power(lm_md_preds - Y_test, 2))
lm_md_mse

9447995.576057166

In [36]:
# (f)
ridge_cv = RidgeCV(np.linspace(0.001, 100, num = 100), cv = 5).fit(X_train, Y_train)
rigde_alpha = ridge_cv.alpha_
rigde_alpha

0.001

In [37]:
# Building Ridge regression model
ridge_md = Ridge(alpha = rigde_alpha).fit(X_train, Y_train)

# Predicting on test dataset
ridge_md_preds = ridge_md.predict(X_test)

# Computing the mse
ridge_md_mse = np.mean(np.power(ridge_md_preds - Y_test, 2))
ridge_md_mse

9447465.45391115

In [38]:
# (g)
lasso_cv = LassoCV(alphas = np.linspace(0.001, 100, num = 100), normalize = True, cv = 5).fit(X_train, Y_train)
lasso_alpha = lasso_cv.alpha_
lasso_alpha

1.011090909090909

In [39]:
# Building Lasso regression model
lasso_md = Lasso(alpha = lasso_alpha).fit(X_train, Y_train)

# Predicting on test dataset
lasso_md_preds = lasso_md.predict(X_test)

# Computing the mse
lasso_md_mse = np.mean(np.power(lasso_md_preds - Y_test, 2))
lasso_md_mse

9307336.589064173

## (h)
#### Using the results from parts (e), (f) and (g), I would use the model from part (e) to predict the number of applications that a university receive.

## Question 2

In [42]:
# (a)
# Train dataset
# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabrielferreira-data-455-bucket'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "churn-bigml-80.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
telecom_train = pd.read_csv(file_content_stream)

# Test dataset
# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabrielferreira-data-455-bucket'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "churn-bigml-20.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
telecom_test = pd.read_csv(file_content_stream)

In [43]:
# (b)
telecom_train['Churn'] = np.where(telecom_train['Churn'] == True, 1, 0)
telecom_test['Churn'] = np.where(telecom_test['Churn'] == True, 1, 0)

In [44]:
telecom_train['International_plan'] = np.where(telecom_train['International_plan'] == "Yes", 1, 0)
telecom_test['International_plan'] = np.where(telecom_test['International_plan'] == "Yes", 1, 0)

In [45]:
telecom_train['Voice_mail_plan'] = np.where(telecom_train['Voice_mail_plan'] == "Yes", 1, 0)
telecom_test['Voice_mail_plan'] = np.where(telecom_test['Voice_mail_plan'] == "Yes", 1, 0)

In [46]:
telecom_train['Total_charge'] = np.sum(telecom_train[['Total_day_charge', 'Total_eve_charge', 'Total_night_charge', 'Total_intl_charge']], axis = 1)
telecom_test['Total_charge'] = np.sum(telecom_test[['Total_day_charge', 'Total_eve_charge', 'Total_night_charge', 'Total_intl_charge']], axis = 1)

In [47]:
# (c)
telecom_train = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls', 'Churn']]
telecom_test = telecom_test[['Account_length', 'International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls', 'Churn']]

In [48]:
# (d)
# Defining predictors and target variables
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls']]
Y = telecom_train['Churn']

# (1)
# Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# (2)
# Transfomating all input variables to 0-1 scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# (3)
# Estimating the optimal lambda for LASSO using default values.
lasso_cv = LassoCV(normalize = True, cv = 5).fit(X_train, Y_train)
lasso_cv = lasso_cv.alpha_

lasso_md = Lasso(alpha = lasso_cv, normalize = True).fit(X_train, Y_train)
lasso_md.coef_

array([ 0.06851413,  0.31353904, -0.07166731,  0.5090659 ,  0.47444084])

In [49]:
# Creating a list to store the results
lasso_results = list()

for i in range(0,1000):
    X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls']]
    Y = telecom_train['Churn']

    # (1)
    # Splitting the data into train and test
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

    # (2)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    # (3)
    # Estimating the optimal lambda for LASSO using default values.
    lasso_cv = LassoCV(normalize = True, cv = 5).fit(X_train, Y_train)
    lasso_cv = lasso_cv.alpha_

    lasso_md = Lasso(alpha = lasso_cv, normalize = True).fit(X_train, Y_train)
    lasso_results.append(lasso_md.coef_)
    
lasso_results = pd.DataFrame(lasso_results, columns = X.columns)

In [50]:
# Counting coefficients equal zero
for i in range(0, len(lasso_results.iloc[0,:])):
    
    count = 0
    
    for j in range(0, len(lasso_results.iloc[:,0])):
        
        if lasso_results.iloc[j, i] == 0:
            count = count + 1
    
    print("Variable index: ", i)
    print('Number of 0 in this variable: ', count)
    print(' ')

Variable index:  0
Number of 0 in this variable:  269
 
Variable index:  1
Number of 0 in this variable:  0
 
Variable index:  2
Number of 0 in this variable:  0
 
Variable index:  3
Number of 0 in this variable:  0
 
Variable index:  4
Number of 0 in this variable:  0
 


In [51]:
# Dropping variables whose estimated coefficients is 0 more than 200 times.
telecom_train = telecom_train.drop('Account_length', axis = 1)
telecom_test = telecom_test.drop('Account_length', axis = 1)

In [52]:
# (e)
# Defining predictors and target variables
X_telecom_train = telecom_train[['International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls']]
Y_telecom_train = telecom_train['Churn']

# (i)
# importing StratifiedKFold function
from sklearn.model_selection import StratifiedKFold, KFold

# Setting StratifiedKFold with 5 folds
skf = StratifiedKFold(n_splits = 5, shuffle = True)

for train_ix, test_ix in skf.split(X_telecom_train, Y_telecom_train):
    
     ## Splitting data into train and test
    X_train, X_test = X_telecom_train.loc[train_ix], X_telecom_train.loc[test_ix]
    Y_train, Y_test = Y_telecom_train.loc[train_ix], Y_telecom_train.loc[test_ix]
    
    # (ii)
    ## Transforming all input variables to 0-1 scale
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

In [53]:
# Building a multi-layer perceptron (MLP) model with one single hidden layer with 5 neurons (hyperbolic 
# tangent as the activation function) and softmax as the activation function for the output.
md1 = tf.keras.models.Sequential([
      tf.keras.layers.Dense(5, input_dim = 4, activation = 'tanh'),
      tf.keras.layers.Dense(2, activation = 'softmax')
])

 # Use the stochastic descent gradient as the method to estimate the weights (optimizer = ’sgd’, 
# loss = ’categorical crossentropy’, and metrics = [’accuracy’]). Use epochs = 100 and 
    # batch size = 100 to build the model. 
md1.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
md1.fit(X_train,
        tf.keras.utils.to_categorical(Y_train, num_classes = 2), 
        epochs = 100, 
        batch_size =100, 
        verbose = 0,
        validation_data = (X_test, tf.keras.utils.to_categorical(Y_test, num_classes = 2)))
        
# using the model to predict on the test dataset. Using 10% as the cut-off value, compute the recall of this model. 
from sklearn.metrics import recall_score

# Getting predictions from multi-layer perceptron (MLP) model on X_test dataset
preds = md1.predict(X_test)[:,1]

# Defining likelyhoods to labels
preds = np.where(preds < 0.10, 0, 1)
# Report the average recall score across the 5-folds
recall_score(Y_test, preds)

0.8974358974358975

In [54]:
# Building a multi-layer perceptron (MLP) model with one single hidden layer with 5 neurons (ReLU
# as the activation function) and softmax as the activation function for the output.
md2 = tf.keras.models.Sequential([
      tf.keras.layers.Dense(5, input_dim = 4, activation = 'relu'),
      tf.keras.layers.Dense(2, activation = 'softmax')
])

 # Use the stochastic descent gradient as the method to estimate the weights (optimizer = ’sgd’, 
# loss = ’categorical crossentropy’, and metrics = [’accuracy’]). Use epochs = 100 and 
    # batch size = 100 to build the model. 
md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
md2.fit(X_train,
        tf.keras.utils.to_categorical(Y_train, num_classes = 2), 
        epochs = 100, 
        batch_size =100, 
        verbose = 0,
        validation_data = (X_test, tf.keras.utils.to_categorical(Y_test, num_classes = 2)))
        
# using the model to predict on the test dataset. Using 10% as the cut-off value, compute the recall of this model. 
# Getting predictions from multi-layer perceptron (MLP) model on X_test dataset
preds_2 = md2.predict(X_test)[:,1]

# Defining likelyhoods to labels
preds_2 = np.where(preds_2 < 0.10, 0, 1)
# Report the average recall score across the 5-folds
recall_score(Y_test, preds_2)

0.9615384615384616

In [55]:
# Building the support vector machine (SVM) with kernel = 'rbf'
from sklearn.svm import SVR, SVC

# Building the svm with kernel = 'rbf'
md3 = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)

# Predicting on the test dataset
preds_3 = md3.predict_proba(X_test)[:,1]
preds_3 = np.where(preds_3 < 0.10, 0, 1)
recall_score(Y_test, preds_3)

0.782051282051282

In [56]:
# Building the support vector machine (SVM) with kernel = 'poly'
md4 = SVC(kernel = 'poly', probability = True).fit(X_train, Y_train)

# Predicting on the test dataset
preds_4 = md4.predict_proba(X_test)[:,1]
preds_4 = np.where(preds_4 < 0.10, 0, 1)
recall_score(Y_test, preds_4)

0.7692307692307693

In [None]:
# (f)
# Creating list to store recall scores provided by the models
md1_recall = list()
md2_recall = list()
md3_recall = list()
md4_recall = list()

# Setting StratifiedKFold with 5 folds
# skf = StratifiedKFold(n_splits = 5, shuffle = True) 

for i in range(0,100):
    
    # Setting StratifiedKFold with 5 folds
    skf = StratifiedKFold(n_splits = 5, shuffle = True)
    
    md1_kfold_result = list()
    md2_kfold_result = list()
    md3_kfold_result = list()
    md4_kfold_result = list()
    
    for train_ix, test_ix in skf.split(X_telecom_train, Y_telecom_train):
    
         ## Splitting data into train and test
        X_train, X_test = X_telecom_train.loc[train_ix], X_telecom_train.loc[test_ix]
        Y_train, Y_test = Y_telecom_train.loc[train_ix], Y_telecom_train.loc[test_ix]
    
        # (ii)
        ## Transforming all input variables to 0-1 scale
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        # Building a multi-layer perceptron (MLP) model with one single hidden layer with 5 neurons (hyperbolic 
        # tangent as the activation function) and softmax as the activation function for the output.
        md1 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(5, input_dim = 4, activation = 'tanh'),
          tf.keras.layers.Dense(2, activation = 'softmax')
        ])

        # Use the stochastic descent gradient as the method to estimate the weights (optimizer = ’sgd’, 
        # loss = ’categorical crossentropy’, and metrics = [’accuracy’]). Use epochs = 100 and 
        # batch size = 100 to build the model. 
        md1.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
        md1.fit(X_train,
            tf.keras.utils.to_categorical(Y_train, num_classes = 2), 
            epochs = 100, 
            batch_size =100, 
            verbose = 0,
            validation_data = (X_test, tf.keras.utils.to_categorical(Y_test, num_classes = 2)))
        
        # using the model to predict on the test dataset. Using 10% as the cut-off value, compute the recall of this model. 
        # Getting predictions from multi-layer perceptron (MLP) model on X_test dataset
        preds = md1.predict(X_test)[:,1]

        # Defining likelyhoods to labels
        preds = np.where(preds < 0.10, 0, 1)
        # Report the average recall score across the 5-folds
        md1_kfold_result.append(recall_score(Y_test, preds))
        
 
    
        ##### Model 2
        # Building a multi-layer perceptron (MLP) model with one single hidden layer with 5 neurons (ReLU
        # as the activation function) and softmax as the activation function for the output.
        md2 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(5, input_dim = 4, activation = 'relu'),
          tf.keras.layers.Dense(2, activation = 'softmax')
        ])

        # Use the stochastic descent gradient as the method to estimate the weights (optimizer = ’sgd’, 
        # loss = ’categorical crossentropy’, and metrics = [’accuracy’]). Use epochs = 100 and 
        # batch size = 100 to build the model. 
        md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
        md2.fit(X_train,
            tf.keras.utils.to_categorical(Y_train, num_classes = 2), 
            epochs = 100, 
            batch_size =100, 
            verbose = 0,
            validation_data = (X_test, tf.keras.utils.to_categorical(Y_test, num_classes = 2)))
        
        # using the model to predict on the test dataset. Using 10% as the cut-off value, compute the recall of this model. 
        # Getting predictions from multi-layer perceptron (MLP) model on X_test dataset
        preds_2 = md2.predict(X_test)[:,1]

        # Defining likelyhoods to labels
        preds_2 = np.where(preds_2 < 0.10, 0, 1)
    
        # Computing the average recall score across the 5-folds
        md2_kfold_result.append(recall_score(Y_test, preds_2))
        
        ##### Model 3
        # Building the support vector machine (SVM) with kernel = 'rbf'
        md3 = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)

        # Predicting on the test dataset
        preds_3 = md3.predict_proba(X_test)[:,1]
    
        # Defining likelyhoods to labels
        preds_3 = np.where(preds_3 < 0.10, 0, 1)
    
        # Computing the average recall score across the 5-folds
        md3_kfold_result.append(recall_score(Y_test, preds_3))
    
        #### Model 4
        # Building the support vector machine (SVM) with kernel = 'poly'
        md4 = SVC(kernel = 'poly', probability = True).fit(X_train, Y_train)

        # Predicting on the test dataset
        preds_4 = md4.predict_proba(X_test)[:,1]
        preds_4 = np.where(preds_4 < 0.10, 0, 1)
        md4_kfold_result.append(recall_score(Y_test, preds_4))
    
    ## Appending average recall of 5-fold cross validation
    md1_recall.append(np.mean(md1_kfold_result))
    
    # Computing the mean of recall scores for model 2
    md2_recall.append(np.mean(md2_kfold_result))
    
    # Computing the mean of recall scores for model 3
    md3_recall.append(np.mean(md3_kfold_result))
    
    # Computing the mean of recall scores for model 4
    md4_recall.append(np.mean(md4_kfold_result))   

In [None]:
# Visualizing model 1
import matplotlib.pyplot as plt

## Plotting iteration results
fig = plt.figure(figsize = (12, 8))

iterations = range(0, 3)
plt.plot(iterations, md1_recall, marker = 'o', color = 'blue',
        label = 'First Model')
plt.plot(iterations, md2_recall, marker = 'o', color = 'orange',
        label = 'Second Model')
plt.plot(iterations, md3_recall, marker = 'o', color = 'red',
         label = 'Third Model')
plt.plot(iterations, md4_recall, marker = 'o', color = 'black',
        label = 'Fourth Model')
plt.xlabel('Number of Iteration')
plt.ylabel('Recall Score')
plt.legend()
plt.grid()
plt.show()

In [None]:
# (g)
# Defining predictors and target variables
X_telecom_train = telecom_train[['International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls']]
Y_telecom_train = telecom_train['Churn']
X_telecom_test = telecom_test[['International_plan', 'Voice_mail_plan', 'Total_charge', 'Customer_service_calls']]
Y_telecom_test = telecom_test['Churn']

# Transforming the input variables to 0-1 scale 
X_telecom_train = scaler.fit_transform(telecom_train)
X_telecom_test = scaler.fit_transform(telecom_test)

In [None]:
# Building a multi-layer perceptron (MLP) model with one single hidden layer with 5 neurons (ReLU
# as the activation function) and softmax as the activation function for the output.
md2 = tf.keras.models.Sequential([
      tf.keras.layers.Dense(5, input_dim = 4, activation = 'relu'),
      tf.keras.layers.Dense(2, activation = 'softmax')
])

 # Use the stochastic descent gradient as the method to estimate the weights (optimizer = ’sgd’, 
# loss = ’categorical crossentropy’, and metrics = [’accuracy’]). Use epochs = 100 and 
    # batch size = 100 to build the model. 
md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
md2.fit(X_telecom_train,
        tf.keras.utils.to_categorical(Y_telecom_train, num_classes = 2), 
        epochs = 100, 
        batch_size =100, 
        verbose = 0,
        validation_data = (X_telecom_test, tf.keras.utils.to_categorical(Y_telecom_test, num_classes = 2)))
        
# using the model to predict on the test dataset. Using 10% as the cut-off value, compute the recall of this model. 
# Getting predictions from multi-layer perceptron (MLP) model on X_test dataset
preds_2 = md2.predict(X_telecom_test)[:,1]

# Defining likelyhoods to labels
preds_2 = np.where(preds_2 < 0.10, 0, 1)
# Report the average recall score across the 5-folds
recall_score(Y_telecom_test, preds_2)

In [None]:
# Building the support vector machine (SVM) with kernel = 'rbf'
from sklearn.svm import SVR, SVC

# Building the svm with kernel = 'rbf'
md3 = SVC(kernel = 'rbf', probability = True).fit(X_telecom_train, Y_telecom_train)

# Predicting on the test dataset
preds_3 = md3.predict_proba(X_telecom_test)[:,1]
preds_3 = np.where(preds_3 < 0.10, 0, 1)
recall_score(Y_telecom_test, preds_3)

#### I would choose model # to predict Churn because after all tests, model # has had the best performance so far.