In [14]:
import pandas as pd
import numpy as np
import random
import math
import pickle
import json
import os
import requests
import datetime
import boto3
from botocore.exceptions import NoCredentialsError
from domino.training_sets import TrainingSetClient, model

# DMM settings for this run.
# S3 bucket name (handed to s3_upload() when the ground-truth file is uploaded).
bucket = 'winequality-monitoring'
# Identifier of the deployed model; the same value appears in the scoring URL.
model_id='6226ea73369f1c7e27c38e89'
# Identifier of the model on the DMM side.
dmm_model_id='62279c7077ad4ce103068e44'
# Read from the environment so the key is not stored in the notebook;
# raises KeyError if DMM_API_KEY is not set.
dmm_api_key = os.environ['DMM_API_KEY']

#Load in data

In [15]:
print('Reading in data for batch scoring')

# Fetch version 1 of the named training set, then materialise it as a
# pandas DataFrame.
# NOTE(review): the training-set name ends with a hyphen -- confirm that is
# the registered name and not a truncated one.
training_set_version = TrainingSetClient.get_training_set_version(
    'winequality-training-',
    number=1,
)
df = training_set_version.load_raw_pandas()

Reading in data for batch scoring


In [16]:
# Preview the first rows of the loaded training set.
df.head()

Unnamed: 0,id,density,volatile_acidity,chlorides,is_red,alcohol,quality
0,0,1.001,0.27,0.045,0,8.8,5.58
1,1,0.994,0.3,0.049,0,9.5,5.04
2,2,0.9951,0.28,0.05,0,10.1,5.34
3,3,0.9956,0.23,0.058,0,9.9,4.92
4,4,0.9956,0.23,0.058,0,9.9,5.16


In [17]:
# Stack the training set on top of itself to double the pool of rows that the
# jitter step works on. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0; ignore_index=True gives
# the same fresh 0..n-1 index that reset_index(drop=True) produced.
df2 = pd.concat([df, df], ignore_index=True)

In [18]:
# List the column names of the loaded training set.
df.columns

Index(['id', 'density', 'volatile_acidity', 'chlorides', 'is_red', 'alcohol',
       'quality'],
      dtype='object')

In [19]:
## For each input feature, scale every value by a random percentage and round as necessary
def _jitter(series, low_pct, high_pct, decimals):
    """Scale each value of ``series`` by its own random percentage, then round.

    A fresh integer percentage is drawn for every element with
    ``random.randrange(low_pct, high_pct)``, so ``high_pct`` itself is never
    drawn. The scaled values are rounded to ``decimals`` decimal places.

    Args:
        series: numeric pandas Series to perturb.
        low_pct: smallest percentage that can be drawn (inclusive).
        high_pct: upper bound of the percentage range (exclusive).
        decimals: number of decimal places kept after scaling.

    Returns:
        A new Series with the same index as ``series``.
    """
    return series.apply(
        lambda value: value * (random.randrange(low_pct, high_pct) / 100)
    ).round(decimals)


# Density: 50%-149%
densityJitter = _jitter(df2.density, 50, 150, 4)
# Volatile acidity: 70%-129%
volatileAcidityJitter = _jitter(df2.volatile_acidity, 70, 130, 2)
# Chlorides: 80%-119%
chloridesJitter = _jitter(df2.chlorides, 80, 120, 3)
# is_red: 40%-159%, rounded to a whole number.
# NOTE(review): if is_red is a 0/1 flag, a 1 can come out as 0 or 2 after
# scaling and rounding -- confirm that this is the intended kind of drift.
is_redJitter = _jitter(df2.is_red, 40, 160, 0)
# Alcohol: 90%-109%
alcoholJitter = _jitter(df2.alcohol, 90, 110, 1)

# Take all the new 'jittered' variables and write them to a new df.
# The id and quality columns are carried over from df2 unchanged.
df3 = pd.DataFrame({
    'id': df2.id,
    'density': densityJitter,
    'volatile_acidity': volatileAcidityJitter,
    'chlorides': chloridesJitter,
    'is_red': is_redJitter,
    'alcohol': alcoholJitter,
    'quality': df2.quality,
})

In [20]:
# Display the jittered frame.
df3

Unnamed: 0,id,density,volatile_acidity,chlorides,is_red,alcohol,quality
0,0,1.4615,0.25,0.048,0.0,9.0,5.58
1,1,1.0835,0.26,0.057,0.0,8.6,5.04
2,2,1.0150,0.34,0.043,0.0,11.0,5.34
3,3,1.4337,0.27,0.067,0.0,9.5,4.92
4,4,1.2345,0.18,0.054,0.0,10.7,5.16
...,...,...,...,...,...,...,...
12921,6491,0.5082,0.75,0.059,1.0,8.6,5.10
12922,6492,1.0446,0.54,0.090,1.0,10.0,5.60
12923,6494,1.0754,0.53,0.080,1.0,10.3,6.18
12924,6495,1.0652,0.63,0.076,1.0,9.2,5.65


In [21]:
# Grab between 50 and 500 random rows from the jittered data
df_inf = df3.sample(n=random.randint(50, 500)).reset_index(drop=True)

# Build one id per sampled row in the form "<today>_<row position>".
# The date is read once so every id in the batch carries the same date, even
# if the cell happens to run across midnight.
run_date = str(datetime.date.today())
setup_ids = list(range(df_inf.shape[0]))
ids = ['{}_{}'.format(run_date, row_number) for row_number in setup_ids]

In [22]:
# Attach the generated ids to the sampled rows.
df_inf['wine_id'] = ids
print('Sending {} records to model API endpoint for scoring'.format(len(df_inf)))

# Containers filled in by the scoring loop.
setup_dict = dict()
scoring_request = dict()
results = []

# Columns sent to the model: the id plus the five input features.
scoring_columns = ['wine_id', 'density', 'volatile_acidity', 'chlorides', 'is_red', 'alcohol']
inputs = df_inf[scoring_columns]

Sending 95 records to model API endpoint for scoring


In [23]:
# Score every sampled row against the deployed model endpoint, one POST per row.
#
# The previous version issued the POST inside the per-column loop, so each row
# triggered one request per column, and the early ones carried values left
# over from the previous row. Each row is now sent exactly once, complete.
#
# The access token is read from the environment instead of being embedded in
# the notebook.
# NOTE(review): the token that used to be hardcoded in this cell should be
# treated as leaked and rotated. MODEL_API_TOKEN is the variable name this
# cell expects; set it in the environment before running.
model_api_token = os.environ['MODEL_API_TOKEN']
model_url = 'https://ws-dev.domino-eval.com:443/models/{}/latest/model'.format(model_id)

# Start from an empty list so re-running the cell does not pile up duplicates.
results = []

for record in inputs.to_dict(orient='records'):
    scoring_request = {'data': record}
    response = requests.post(
        model_url,
        auth=(model_api_token, model_api_token),
        json=scoring_request,
        timeout=30,  # do not hang the batch forever on an unresponsive endpoint
    )
    # Fail loudly on an HTTP error instead of on a confusing missing-key error.
    response.raise_for_status()
    results.append(response.json().get('result').get('prediction'))

print('Scoring complete')

Scoring complete


In [24]:
# Ground-truth table: one row per scored record, keyed by the id that was
# sent to the model, with the quality column as the label.
df_ground_truth = (
    df_inf[['wine_id', 'quality']]
    .rename(columns={'wine_id': 'event_id', 'quality': 'quality_GT'})
)

# Sanity checks: same number of rows as the scoring inputs ...
print(len(df_ground_truth) == inputs.shape[0])
# ... and the ids match position by position.
print((df_ground_truth.event_id == inputs.wine_id).sum() == len(df_ground_truth))

True
True


In [25]:
# File name carries today's date, e.g. GT_Data_<YYYY-MM-DD>.csv
gt_file_name = ''.join(('GT_Data_', str(datetime.date.today()), '.csv'))
gt_file_path = '/domino/datasets/local/ground_truth_data/' + gt_file_name

# Write the ground-truth rows without the DataFrame index column.
df_ground_truth.to_csv(gt_file_path, index=False)

In [26]:
def s3_upload(local_file, bucket):
    """Upload ``local_file`` to the S3 ``bucket`` under its base file name.

    The client is built from the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
    environment variables; a KeyError is raised if either one is not set.

    Args:
        local_file: path of the file to upload.
        bucket: name of the destination bucket.

    Returns:
        True when the upload call completes. False, after printing the
        reason, when the local file is missing or botocore raises
        NoCredentialsError. Any other exception propagates to the caller.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )

    # The object key is the file name only, without its directory part.
    object_key = '{}'.format(os.path.basename(local_file))

    try:
        client.upload_file(local_file, bucket, object_key)
    except FileNotFoundError:
        print("The file was not found")
        uploaded = False
    except NoCredentialsError:
        print("Credentials not available")
        uploaded = False
    else:
        print(str(object_key) + " Upload Successful")
        uploaded = True
    return uploaded
    
# Only report success when s3_upload() actually returned True. It returns
# False (after printing why) when the file or the credentials are missing, and
# the run should stop there rather than claim the data was uploaded.
if not s3_upload(gt_file_path, bucket):
    raise RuntimeError('Upload of {} to s3 bucket {} failed'.format(gt_file_path, bucket))

print('Data Uploaded to s3 bucket at s3://{}/{}'.format(bucket, gt_file_name))
print('Done!')

GT_Data_2022-03-09.csv Upload Successful
Data Uploaded to s3 bucket at s3://winequality-monitoring/GT_Data_2022-03-09.csv
Done!
