In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle
import boto
from boto.s3.key import Key
from boto.s3.connection import Location
import os, sys, time

In [40]:
def functions_ignitor(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    if not os.path.exists('trainDataFromS3'):
        os.makedirs('trainDataFromS3')
    print('MAIN FUNCTION TRIGGERED')
    download_filePath = downloadFromS3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    print('DOWNLOADED DATA DUMP FROM S3')
    X_train, X_test, y_train, y_test = data_transformation(download_filePath)
    print('DATA TRANSFORMATION COMPLETED')
    upload_filePath = train_pickle_model(X_train, X_test, y_train, y_test)
    print('TRAINED & PICKLED MODEL')
    uploadToS3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, upload_filePath)
    print('UPLOADED PICKLE TO S3')


def downloadFromS3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    bucket_name = 'ads-final-project-data-dump'
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(bucket_name)
    bucket_list = bucket.list()
    for l in bucket_list:
        print('l: ', l)
        keyString = str(l.key)
        print('keyString: ', keyString)
        download_filePath = 'trainDataFromS3/'+keyString
        l.get_contents_to_filename('trainDataFromS3/'+keyString)

    return download_filePath

def uploadToS3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, upload_filePath, destinationPath = ''):
    bucket_name = 'ads-final-project'
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY)
    bucket = conn.create_bucket(bucket_name,location=boto.s3.connection.Location.DEFAULT)
    print ('Uploading '+upload_filePath+' to Amazon S3 bucket '+bucket_name)
    def percent_cb(complete, total):
        sys.stdout.write('.')
        sys.stdout.flush()   
    k = Key(bucket)
    k.key = destinationPath+"/"+upload_filePath
    k.set_contents_from_filename(upload_filePath, cb = percent_cb, num_cb = 10)
    print('Uploaded')

def data_transformation(download_filePath):
    df = pd.read_csv(download_filePath)
    #Replacing empty spaces with Null values
    df = df.replace(r'^\s+$', np.nan, regex=True)
    # Dropping NA values
    df = df.dropna()
    # Change the 'SeniorCitizen' variable from interger to categorial
    df['SeniorCitizen']=pd.Categorical(df['SeniorCitizen'])
    # Change the 'TotalCharges' variable from object to interger 
    df['TotalCharges']=pd.to_numeric(df['TotalCharges'])
    # Deleting the custumerID column
    del df["customerID"]
    #Splitting data according to datatypes
    num = ['float64', 'int64']
    num_df = df.select_dtypes(include=num)
    obj_df = df.select_dtypes(exclude=num)
    # Add the 'Churn' variable in numeric dataset
    num_df = pd.concat([num_df,df["Churn"]],axis=1)
    #Creating bins and plotting Countplot for 'tenure'
    tenure_bins=pd.cut(num_df["tenure"], bins=[0,20,60,80], labels=['low','medium','high'])
    #Creating bins and plotting Countplot for 'MonthlyCharges'
    MonthlyCharges_bins=pd.cut(num_df["MonthlyCharges"], bins=[0,35,60,130], labels=['low','medium','high'])
    #Creating bins and plotting Countplot for 'MonthlyCharges'
    TotalCharges_bins=pd.cut(num_df["TotalCharges"], bins=[0,1000,4000,10000], labels=['low','medium','high'])
    #Saving bins into dataframe
    bins=pd.DataFrame([tenure_bins, MonthlyCharges_bins, TotalCharges_bins]).T
    #Converting SeniorCitizen variable into categorical and mapping values of 1 & 0 to Yes & No respectively
    df['SeniorCitizen'] = df.SeniorCitizen.map({0:'No', 1:'Yes'})
    # Concatenate bins with object variables
    df=pd.concat([bins,obj_df],axis=1)
    # Convert all the variables into categorical
    for i in list(df.columns):
        df[i] = pd.Categorical(df[i]) 
    dummy = pd.get_dummies(df) # Transform the categorical variables into dummy variables
    # Split training and testing dataset
    features = dummy.drop(["Churn_Yes", "Churn_No"], axis=1).columns
    print(features)
    X = dummy[features]
    y = dummy["Churn_Yes"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=7)

    return X_train, X_test, y_train, y_test

def train_pickle_model(X_train, X_test, y_train, y_test):
    # Training Logistic Regression Mode
    logistic_regression = LogisticRegression()
    logistic_regression.fit(X_train, y_train)
    predictions = logistic_regression.predict(X_test)
    print(accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

    upload_filePath = 'logistic_regression.pkl'

    with open(upload_filePath, "wb") as fp:
        pickle.dump(logistic_regression, fp, protocol=2)

    return upload_filePath

In [None]:
AWS_ACCESS_KEY_ID = sys.argv[1]
AWS_SECRET_ACCESS_KEY = sys.argv[2]
functions_ignitor(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
print('PROGRAM EXECUTED SUCCESSFULLY')

In [29]:
import boto3
S3 = boto3.client('s3', region_name='us-east-1')

In [30]:
response = S3.get_object(Bucket='ads-final-project', Key='logistic_regression.pkl')

In [31]:
response

{'AcceptRanges': 'bytes',
 'Body': <botocore.response.StreamingBody at 0x188709d2fd0>,
 'ContentLength': 1581,
 'ContentType': 'application/octet-stream',
 'ETag': '"8379587ef06ecc2dd3bab0c623609855"',
 'LastModified': datetime.datetime(2018, 4, 19, 0, 16, 47, tzinfo=tzutc()),
 'Metadata': {},
 'ResponseMetadata': {'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '1581',
   'content-type': 'application/octet-stream',
   'date': 'Thu, 19 Apr 2018 04:45:40 GMT',
   'etag': '"8379587ef06ecc2dd3bab0c623609855"',
   'last-modified': 'Thu, 19 Apr 2018 00:16:47 GMT',
   'server': 'AmazonS3',
   'x-amz-id-2': 'UCucrJG++dm4j7+aKWW7Tm5GGMyNAZ2N+19A1ZO3IEgQxDizHEmnhFrQCzd5F5/arO4yip4r3X0=',
   'x-amz-request-id': '42102C3EE350545F'},
  'HTTPStatusCode': 200,
  'HostId': 'UCucrJG++dm4j7+aKWW7Tm5GGMyNAZ2N+19A1ZO3IEgQxDizHEmnhFrQCzd5F5/arO4yip4r3X0=',
  'RequestId': '42102C3EE350545F',
  'RetryAttempts': 0}}

In [32]:
model_str = response['Body'].read()

In [36]:
df = pd.read_csv('trainDataFromS3/churn_test.csv')

In [38]:
small_df = df.head(12)

In [39]:
small_df.to_csv('small_test.csv', index=False)

In [41]:
X_train, X_test, y_train, y_test = data_transformation('trainDataFromS3/churn_train.csv')

In [42]:
X_train.shape

(4499, 52)

In [56]:
def script_data_transformation(data_dataframe):
	df = data_dataframe
	df = df.replace(r'^\s+$', np.nan, regex=True)
	df = df.dropna()
	df['SeniorCitizen']=pd.Categorical(df['SeniorCitizen'])
	df['TotalCharges']=pd.to_numeric(df['TotalCharges'])
	del df["customerID"]
	num = ['float64', 'int64']
	num_df = df.select_dtypes(include=num)
	obj_df = df.select_dtypes(exclude=num)
	tenure_bins=pd.cut(num_df["tenure"], bins=[0,20,60,80], labels=['low','medium','high'])
	MonthlyCharges_bins=pd.cut(num_df["MonthlyCharges"], bins=[0,35,60,130], labels=['low','medium','high'])
	TotalCharges_bins=pd.cut(num_df["TotalCharges"], bins=[0,1000,4000,10000], labels=['low','medium','high'])
	bins=pd.DataFrame([tenure_bins, MonthlyCharges_bins, TotalCharges_bins]).T
	df['SeniorCitizen'] = df.SeniorCitizen.map({0:'No', 1:'Yes'})
	df=pd.concat([bins,obj_df],axis=1)
	for i in list(df.columns):
	    df[i] = pd.Categorical(df[i]) 
	dummy = pd.get_dummies(df)

	###########CHANGE WHEN "CHURN" COLUMN IS REMOVED################
	features = dummy.drop(["Churn_Yes", "Churn_No"], axis=1).columns
	print(features)
	data_X = dummy[features]
	data_y = dummy["Churn_Yes"]
	print('0) data_X.shape: ', data_X.shape)

	return data_X, data_y
	###########CHANGE WHEN "CHURN" COLUMN IS REMOVED################

In [57]:
data_dataframe = pd.read_csv('small_test.csv')

In [60]:
big_dataframe = pd.read_csv('trainDataFromS3/churn_train.csv')

In [59]:
data_dataframe.shape

(12, 21)

In [61]:
big_dataframe.shape

(5634, 21)

In [58]:
data_X, data_y = script_data_transformation(data_dataframe)

Index(['tenure_low', 'tenure_medium', 'MonthlyCharges_high',
       'MonthlyCharges_low', 'MonthlyCharges_medium', 'TotalCharges_high',
       'TotalCharges_low', 'TotalCharges_medium', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMo

In [62]:
data_X, data_y = script_data_transformation(big_dataframe)

Index(['tenure_high', 'tenure_low', 'tenure_medium', 'MonthlyCharges_high',
       'MonthlyCharges_low', 'MonthlyCharges_medium', 'TotalCharges_high',
       'TotalCharges_low', 'TotalCharges_medium', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
    