In [1]:
print("commenting on file in Domino :)")

commenting on file in Domino :)


In [1]:
import pandas as pd
import numpy as np
import random
import math
import pickle
import json
import os
import requests
import upload_to_s3

### Note - make sure you have AWS creds in environment variables for uploading data to s3 bucket

## Import Training Data

In [2]:
#Bring in data used to train pickled model that is loaded in later
df = pd.read_csv('/mnt/data/smallPrepared.csv')
print(df.shape)
df.head()

(7939, 7)


Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
0,844336,0.016364,550,28,89.2,45,0
1,146041,0.018349,545,33,54.2,43,0
2,847745,0.018519,378,41,55.3,41,0
3,285565,0.014493,552,32,66.8,31,0
4,754611,0.012132,577,4,87.2,43,0


In [3]:
#append data to istelf to double volume
df2 = df.append(df)
df2.shape

(15878, 7)

In [4]:
#Reset custid field so that there are no repeats
df2['custid'] = np.random.choice(range(df.custid.min(), df.custid.max()),df2.shape[0], replace=False)

In [5]:
df2.describe()

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
count,15878.0,15878.0,15878.0,15878.0,15878.0,15878.0,15878.0
mean,496607.47865,0.020052,501.498677,26.521728,70.104371,41.622119,0.049628
std,288641.329328,0.006617,99.672047,8.12379,20.062057,9.848793,0.217183
min,310.0,0.0,132.0,0.0,3.9,18.0,0.0
25%,247046.75,0.015601,435.0,21.0,56.5,35.0,0.0
50%,492773.0,0.019084,502.0,27.0,69.7,41.0,0.0
75%,745806.0,0.023438,568.0,32.0,83.8,48.0,0.0
max,999401.0,0.099338,865.0,55.0,139.4,84.0,1.0


In [6]:
df2.dtypes

custid            int64
dropperc        float64
mins              int64
consecmonths      int64
income          float64
age               int64
churn_Y           int64
dtype: object

## Randomly adjust data by up to 60% for simulated drift

#### Choose a random percentage between a set range to scale a variable by
##### Still experimenting with different ranges for best result but a larger range will generally produce more data drift
##### Cast all integer vars back to integers

In [8]:
##For each input feature adjust data and round/cast as necessary
#dropperc - 50%-150%
droppJitter = df2.dropperc.apply(lambda x : x*(random.randrange(50,150))/100)
#mins - 70%-130%
minsJitter = df2.mins.apply(lambda x : x*(random.randrange(70,130)/100)).round(0).astype(int)
#consecMonths - 80%-120%
consecMonthsJitter = df2.consecmonths.apply(lambda x : x*(random.randrange(80,120)/100)).round(0).astype(int)
#Income - 40%-160%
incomeJitter = df2.income.apply(lambda x : x*(random.randrange(40,160)/100)).round(1)
#age - 90%-110%
ageJitter = df2.age.apply(lambda x : x*(random.randrange(90,110)/100)).round(0).astype(int)

In [9]:
#Take all the new 'jittered' variables and write to a new df
#Keep original custid and churn_Y fields
df3 = pd.DataFrame({'custid': df2.custid,
       'dropperc': droppJitter, 
       'mins': minsJitter,
       'consecmonths': consecMonthsJitter,
       'income': incomeJitter,
       'age': ageJitter,
       'churn_Y': df2.churn_Y
                   })

In [10]:
df2.head()

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
0,669631,0.016364,550,28,89.2,45,0
1,849864,0.018349,545,33,54.2,43,0
2,776254,0.018519,378,41,55.3,41,0
3,128614,0.014493,552,32,66.8,31,0
4,248866,0.012132,577,4,87.2,43,0


In [11]:
df3.head()

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
0,669631,0.014564,704,33,43.7,45,0
1,849864,0.025321,659,37,74.3,39,0
2,776254,0.014074,306,34,42.0,43,0
3,128614,0.00971,392,26,90.8,31,0
4,248866,0.012374,410,4,64.5,42,0


In [12]:
#Understand correlations between new jittered data and original
#Should see larger data drift for lower correlations
concatset = pd.concat([df2,df3], axis =1)
concatset.columns=(list(df2.columns)+list('Jittered_'+df3.columns))
print('Correlations between original and altered vars -')
for i,k in enumerate(df2.columns):
    print('{}: {}'.format(k, concatset.corr()[k]['Jittered_'+k].round(4)))
    

Correlations between original and altered vars -
custid: 1.0
dropperc: 0.7323
mins: 0.7438
consecmonths: 0.9304
income: 0.6232
age: 0.9694
churn_Y: 1.0


In [13]:
#This kl divergence is not on same scale as DMM kl divergence
## Look into this more later
def kl_divergence(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

kl_divergence(df2.age, df3.age)

4328.619215502987

## Load Model In

In [14]:
loaded_model = pickle.load(open('/mnt/Models/BestModelCV.pkl', 'rb'))

In [15]:
#Grab between 100 and 300 random rows from jittered data
df_inf = df3.sample(n = random.randint(100,300))
print(df_inf.shape[0], "records selected for sample")

188 records selected for sample


In [16]:
df_inf.head()

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
115,160212,0.010923,597,20,47.7,36,0
6786,10567,0.027149,468,30,85.7,46,0
6840,830045,0.018336,511,32,53.1,50,0
4885,348218,0.009085,558,27,67.2,36,0
1795,866302,0.008817,576,43,38.2,52,0


In [17]:
#Save input features
X = df_inf.loc[:, 'dropperc':'age']
X.head()

Unnamed: 0,dropperc,mins,consecmonths,income,age
115,0.010923,597,20,47.7,36
6786,0.027149,468,30,85.7,46
6840,0.018336,511,32,53.1,50
4885,0.009085,558,27,67.2,36
1795,0.008817,576,43,38.2,52


In [18]:
#Get model predictions for the sample of input features defined above
predictions = loaded_model.predict(X)

In [19]:
#Check accuracy and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix
print("Accuracy -", accuracy_score(df_inf.churn_Y, predictions).round(4))
print("--------------")
# print("Confusion Matrix")
print(confusion_matrix(df_inf.churn_Y, predictions))

Accuracy - 0.9309
--------------
[[169   6]
 [  7   6]]


In [21]:
#write ground truth data out
groundTruth = pd.DataFrame(df_inf[['custid','churn_Y']]).rename(columns = {'churn_Y': 'y_gt'})

In [22]:
# preds = pd.DataFrame(columns = {'churn_Y': prediction.values}, index = groundTruth.index)
preds_df = pd.DataFrame(data=predictions, columns=['churn_Y'], index=groundTruth.index)

In [23]:
#Preview accuracy for random data
accuracy_score(groundTruth.y_gt, preds_df.churn_Y)

0.9308510638297872

In [24]:
prod_inputs_and_preds = df_inf.drop('churn_Y', axis =1).join(preds_df)

In [25]:
prod_inputs_and_preds.head()

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
115,160212,0.010923,597,20,47.7,36,0
6786,10567,0.027149,468,30,85.7,46,0
6840,830045,0.018336,511,32,53.1,50,0
4885,348218,0.009085,558,27,67.2,36,0
1795,866302,0.008817,576,43,38.2,52,0


In [26]:
groundTruth.head()

Unnamed: 0,custid,y_gt
115,160212,0
6786,10567,0
6840,830045,0
4885,348218,0
1795,866302,0


In [27]:
## This should return true 
prod_inputs_and_preds.shape[0]==groundTruth.shape[0]

True

In [142]:
#ground truth
prod_pred_partitions = upload_to_s3.split_data_export(prod_inputs_and_preds,1,'prod_inputs_and_preds')        
ground_truth_partitions = upload_to_s3.split_data_export(groundTruth,1,'prod_ground_truth')
print(ground_truth_partitions)
 
#save latest file names
file_name_dict = {}
file_name_dict['prod_predictions'] = prod_pred_partitions
file_name_dict['prod_ground_truth'] = ground_truth_partitions
 
file_names = json.dumps(file_name_dict)
f = open("/mnt/temp/prod_files_latest.json","w")
f.write(file_names)
f.close()
 
bucket = '<Your bucket name>'

CSV name:  /mnt/temp/prod_inputs_and_preds_2021-02-02.csv
CSV name:  /mnt/temp/prod_ground_truth_2021-02-02.csv
['/mnt/temp/prod_ground_truth_2021-02-02.csv']


In [138]:
#Upload to S3 bucket

for name in prod_pred_partitions:
    upload_to_s3.upload(name, bucket)
    
for name in ground_truth_partitions:
    upload_to_s3.upload(name, bucket)

prod_inputs_and_preds_2021-02-02.csv Upload Successful
prod_ground_truth_2021-02-02.csv Upload Successful


In [139]:
#Setup for DMM data registration
 
with open('/mnt/temp/active_model_version.json') as json_file:
    model_version_dict = json.load(json_file)
    json_file.close()
    
model_version = model_version_dict['model_version']
 
with open('/mnt/temp/prod_files_latest.json') as json_file: 
    data = json.load(json_file)
    
preds = data['prod_predictions']
ground_truth = data['prod_ground_truth']


In [140]:
#Register prediction data in DMM
#Note running this cell will probably fail and say that the data is already registered
#If so then today's data is already registered and no further action req'd
for name in preds:
    
    file_name = os.path.basename(name)
    
    #Customer churn "prod"
    
    url = "https://trial.dmm.domino.tech/api/v0/models/"+model_version+"/add_predictions"
 
 
    payload = "{\n  \"dataLocation\": \"https://s3.us-east-2.amazonaws.com/"+bucket+"/"+file_name+"\"\n}"
    headers = {
               'Authorization': 'eyJhbGciOiJIUzUxMiJ9.eyJpZCI6IjVlZThjMDA0YjJjYjJmNTliMGRiYmU4NiIsInVzZXJuYW1lIjoiY29saW4uZGVtby5kZGwudHJpYWwiLCJ1c2VyX3R5cGUiOiJhcGkiLCJvcmdhbml6YXRpb25faWQiOiI1ZWQ2NWRhZGQ5ZmFiZjAwMDE0ZjI5MmYifQ.bTuQUr57LtU9By6pXUP_TVMCj_MIbhYLo4ULamcafWJogx4oe8r_p8tQ5xARFQiJQzoOQL2u9-GO5FS6y7Wgpw',
               'Content-Type': 'application/json'
              }
 
    response = requests.request("PUT", url, headers=headers, data = payload)
 
    print(response.text.encode('utf8'))

b'{"status": "success"}'


In [147]:
#Register ground truth data in DMM
#Note running this cell will probably fail and say that the data is already registered
#If so then today's data is already registered and no further action req'd
for name in ground_truth:
    
    file_name = os.path.basename(name)
    
    url = "https://trial.dmm.domino.tech/api/v0/models/"+model_version+"/add_ground_truths"
 
    payload = "{\n  \"dataLocation\": \"https://s3.us-east-2.amazonaws.com/"+bucket+"/"+file_name+"\"\n}"
    headers = {'Authorization': 'eyJhbGciOiJIUzUxMiJ9.eyJpZCI6IjVlZThjMDA0YjJjYjJmNTliMGRiYmU4NiIsInVzZXJuYW1lIjoiY29saW4uZGVtby5kZGwudHJpYWwiLCJ1c2VyX3R5cGUiOiJhcGkiLCJvcmdhbml6YXRpb25faWQiOiI1ZWQ2NWRhZGQ5ZmFiZjAwMDE0ZjI5MmYifQ.bTuQUr57LtU9By6pXUP_TVMCj_MIbhYLo4ULamcafWJogx4oe8r_p8tQ5xARFQiJQzoOQL2u9-GO5FS6y7Wgpw',
               'Content-Type': 'application/json'
              }
 
    response = requests.request("PUT", url, headers=headers, data = payload)
 
    print(response.text.encode('utf8'))

b'{"status": "Fail", "message": "[\'dataColumns list should be of length one as its first time adding ground truth\']"}'
