In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from random import randint 
import time
import random 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, plot_confusion_matrix
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn import metrics

In [2]:
# Project-specific environment loader, currently disabled.
#from orion.contrib.envs import load_env
#load_env()
import boto3
#from orion.sources import S3Source
# Bucket the notebook reads its retail input from.
aws_bucket = 'kilimanjaro-prod-datalake'
# Low-level S3 client. No credentials are passed here, so they are expected to
# come from the environment boto3 runs in.
s3 = boto3.client('s3')

In [3]:
import warnings
# Ignore every FutureWarning from this point on, for all libraries, so that
# deprecation notices do not show up in the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Fetch the retail CSV from S3 and parse the streamed body directly;
# the first CSV column is used as the DataFrame index.
object_ = 'masters/datascience/emma/retail.csv'
retail = pd.read_csv(
    s3.get_object(Bucket=aws_bucket, Key=object_)['Body'],
    index_col=0,
)

In [5]:
# The label column arrives as 'class'; store it as 'cluster' so that a binary
# 'class' target can be derived from it further down.
retail = retail.rename({'class': 'cluster'}, axis='columns')
retail.head(1)

Unnamed: 0,ASP,cluster,loyaltyaccount_No,loyaltyaccount_Yes,gender_female,gender_male,gender_unknown,shipcountry_Albania,shipcountry_Armenia,shipcountry_Australia,...,category_Childrens,category_Infant,category_Junior,category_Mens,category_Miscellaneous,category_Nursery,category_Womens,divisioncode_ACCESSORY,divisioncode_APPAREL,divisioncode_FOOTWEAR
0,59.92,0,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [6]:
# Wine-quality tables are semicolon-separated files in the working directory.
wine_white = pd.read_csv('winequality-white.csv', sep=';')
wine_red = pd.read_csv('winequality-red.csv', sep=';')

In [7]:
# Red Wine Quality Low vs High
# Keep only the clearly low (quality <= 4) and clearly high (quality >= 7) wines.
# The same two masks drive both the row filter and the label, so every kept row
# gets a class and nothing is left as NaN.
red_is_low = wine_red['quality'] <= 4
red_is_high = wine_red['quality'] >= 7
# .copy() makes the subset an independent frame: adding a column to a filtered
# slice is what triggers pandas' SettingWithCopyWarning.
WineRLvH = wine_red[red_is_low | red_is_high].copy()
# class 1 = low quality (minority), class 0 = high quality (majority)
WineRLvH['class'] = red_is_low[WineRLvH.index].astype('int').astype('category')
WineRLvH = WineRLvH.drop(columns=['quality'])
print(WineRLvH['class'].value_counts())
# Dataset label stored as a plain attribute on this DataFrame object.
WineRLvH.name ='D12 - Wine_red_LvH'

0    217
1     63
Name: class, dtype: int64


In [8]:
# White Wine Quality Low vs High
# Keep only the clearly low (quality <= 4) and clearly high (quality >= 7) wines.
# The same two masks drive both the row filter and the label, so every kept row
# gets a class and nothing is left as NaN.
white_is_low = wine_white['quality'] <= 4
white_is_high = wine_white['quality'] >= 7
# .copy() makes the subset an independent frame: adding a column to a filtered
# slice is what triggers pandas' SettingWithCopyWarning.
WineWLvH = wine_white[white_is_low | white_is_high].copy()
# class 1 = low quality (minority), class 0 = high quality (majority)
WineWLvH['class'] = white_is_low[WineWLvH.index].astype('int').astype('category')
WineWLvH = WineWLvH.drop(columns=['quality'])
print(WineWLvH['class'].value_counts())
# Dataset label stored as a plain attribute on this DataFrame object.
WineWLvH.name = 'D10 - Wine_white_LvH'

0    1060
1     183
Name: class, dtype: int64


In [9]:
# Cust Summary focusing on singular minority
# Work on an independent deep copy so the loaded retail frame stays untouched.
cust_sum2 = retail.copy(deep=True)
# Collapse the cluster labels into a binary target: cluster 0 -> class 0,
# any cluster >= 1 -> class 1.
in_cluster_zero = cust_sum2['cluster'] == 0
in_other_cluster = cust_sum2['cluster'] >= 1
cust_sum2.loc[in_cluster_zero, 'class'] = 0
cust_sum2.loc[in_other_cluster, 'class'] = 1
cust_sum2['class'] = cust_sum2['class'].astype('int').astype('category')
cust_sum2 = cust_sum2.drop(columns=['cluster'])
print(cust_sum2['class'].value_counts())
# Dataset label stored as a plain attribute on this DataFrame object.
cust_sum2.name = 'Retail Data -2'

0    1935848
1     269025
Name: class, dtype: int64


In [10]:
# train test/ scaling
def data_prep (data, seed):
  """Split ``data`` into stratified train/test sets and standardize the features.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature columns plus a ``'class'`` column holding the target.
  seed : int
      ``random_state`` for the split, so the same seed gives the same split.

  Returns
  -------
  X_train_scaled, X_test_scaled : pandas.DataFrame
      Standardized features with the original column names. They are rebuilt
      from arrays, so they carry a fresh 0..n-1 index.
  y_train, y_test : pandas.Series
      Categorical targets. They keep the row labels of ``data``, so pair them
      with the feature frames by position, not by index.
  """
  X = data.drop('class', axis=1).copy()
  y = data['class'].copy().astype('category')

  # 75/25 split that preserves the class proportions in both parts.
  # ``shuffle`` must be a boolean: the original passed the target Series here,
  # which only worked because older scikit-learn treated it as truthy.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.25, random_state=seed, shuffle=True, stratify=y)

  # Fit the scaler on the training part only and reuse it for the test part,
  # so no test-set statistics enter the transformation.
  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # transform() returns plain arrays; wrap them to get the column names back.
  X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
  X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [11]:
def random_under_minority (data, imbalance_level, seed):
  """Split and scale ``data``, then randomly under-sample the training minority class.

  The split and scaling are done by ``data_prep``. Only the training set is
  changed: all class-0 rows are kept and the class-1 rows are reduced to a
  target count chosen by ``imbalance_level``:

  * ``'absolute'`` - 6 minority rows
  * ``'high'``     - 5% of the number of class-0 training rows
  * ``'extreme'``  - 1% of the number of class-0 training rows, but at least 8

  If the target is not smaller than the number of class-1 rows available,
  all of them are kept.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature columns plus a ``'class'`` column with labels 0 and 1.
  imbalance_level : {'absolute', 'high', 'extreme'}
  seed : int
      Used for the split, the minority sampling and the final shuffle, so the
      same seed reproduces the same training set.

  Returns
  -------
  X_train_scaled, X_test_scaled, y_train, y_test
      ``y_train`` and ``y_test`` are renumbered 0..n-1. ``X_train_scaled``
      keeps the shuffled row labels it had before sampling, so pair it with
      ``y_train`` by position, not by index.

  Raises
  ------
  ValueError
      If ``imbalance_level`` is not one of the supported values.
  """
  # Fraction of the majority count that the minority is reduced to.
  ratios = {'high': 0.05, 'extreme': 0.01}
  if imbalance_level != 'absolute' and imbalance_level not in ratios:
    raise ValueError(
        "imbalance_level must be 'absolute', 'high' or 'extreme', got %r" % (imbalance_level,))

  # Kept from the original: seeds Python's global RNG. pandas and scikit-learn
  # do not use it, which is why random_state is passed explicitly below.
  random.seed(seed)

  X_train_scaled, X_test_scaled, y_train, y_test = data_prep(data, seed)
  # data_prep returns features with a fresh 0..n-1 index while y_train keeps
  # the original row labels; renumber y_train so the assignment lines up.
  X_train_scaled['class'] = y_train.reset_index(drop=True)

  majority = X_train_scaled[X_train_scaled['class'] == 0]
  minority = X_train_scaled[X_train_scaled['class'] == 1]
  # Count the frames that are actually used instead of relying on the
  # frequency ordering of value_counts().
  maj_count = len(majority)
  min_count = len(minority)

  if imbalance_level == 'absolute':
    downsample = 6
  else:
    downsample = int(round(maj_count * ratios[imbalance_level]))
    if imbalance_level == 'extreme':
      # Floor so that tiny datasets still keep a handful of minority rows.
      downsample = max(downsample, 8)

  if downsample >= min_count:
    minority_sample = minority
  else:
    minority_sample = minority.sample(n=downsample, random_state=seed)

  final = shuffle(pd.concat([majority, minority_sample]), random_state=seed)

  X_train_scaled = final.drop('class', axis=1).copy()
  y_train = final['class'].astype('category').reset_index(drop=True)
  y_test = y_test.reset_index(drop=True)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [1]:
# Retail data at the 'high' imbalance level with seed 3.
# NOTE: this cell needs the cells defining data_prep and random_under_minority
# to have run first. The NameError recorded below it (execution count 1)
# suggests it was last run on a kernel where they had not been defined yet.
X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(cust_sum2, 'high', 3)

NameError: name 'random_under_minority' is not defined

In [47]:
# White-wine data at the 'high' imbalance level with seed 3.
# All four results carry the same 'w' suffix.
X_train_scaledw, X_test_scaledw, y_trainw, y_testw = random_under_minority(WineWLvH, 'high', 3)
# Alias for the original, apparently mistyped, name so that any cell still
# referring to it keeps working.
X_test_scaledx = X_test_scaledw

In [48]:
# Quick check that the training features are a DataFrame; the upload cell
# below calls .to_csv on this object.
type(X_train_scaledw)

pandas.core.frame.DataFrame

In [49]:
from io import StringIO  
# Serialize the white-wine training features to CSV in memory (the row index is
# written as the first column) and upload the text to the project bucket.
csv_buffer = StringIO()
X_train_scaledw.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
# Reuse aws_bucket rather than repeating the bucket name as a second literal.
# NOTE: the put() response shown as cell output includes request ids and the
# KMS key ARN; clear the output before sharing the notebook.
s3_resource.Object(aws_bucket, 'masters/datascience/emma/x_train.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '7H380CD5SN94D9KE',
  'HostId': 'FZAGvPc4+x3LJBVAKPSO41LZJf+duo0ukaT7/xj3sScn8+S6bvTDgd2cYI/VfcyTmBpBWF3xh5M=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'FZAGvPc4+x3LJBVAKPSO41LZJf+duo0ukaT7/xj3sScn8+S6bvTDgd2cYI/VfcyTmBpBWF3xh5M=',
   'x-amz-request-id': '7H380CD5SN94D9KE',
   'date': 'Tue, 20 Jul 2021 18:05:59 GMT',
   'x-amz-version-id': 'rHAPKViWV9iRXInUlkcyhf4AShBh5gpe',
   'x-amz-server-side-encryption': 'aws:kms',
   'x-amz-server-side-encryption-aws-kms-key-id': 'arn:aws:kms:eu-west-1:794236216820:key/68512d48-1991-4929-91a2-1892476cd145',
   'etag': '"c0691e42522ad16c743c6a29064a87ab"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'ETag': '"c0691e42522ad16c743c6a29064a87ab"',
 'ServerSideEncryption': 'aws:kms',
 'VersionId': 'rHAPKViWV9iRXInUlkcyhf4AShBh5gpe',
 'SSEKMSKeyId': 'arn:aws:kms:eu-west-1:794236216820:key/68512d48-1991-4929-91a2-1892476cd145'}

In [None]:
import scipy.spatial.distance
import math

In [78]:
def Div_SP(X, Dis):
  """Sum the similarity ``exp(-d)`` over all pairs of rows of ``X``.

  Parameters
  ----------
  X : array-like of shape (n_observations, n_features)
      One observation per row; anything ``scipy.spatial.distance.pdist``
      accepts.
  Dis : str or callable
      Distance metric passed to ``pdist`` (for example ``'euclidean'`` or
      ``'cityblock'``).

  Returns
  -------
  float
      ``sum(exp(-d(i, j)))`` over every unordered pair ``i < j``. Each pair is
      counted once and self-pairs are not included.

  Notes
  -----
  The original body passed the literal string ``'Dist'`` instead of ``Dis`` and
  summed an undefined name, so it could not run.
  NOTE(review): if the reference implementation sums the full symmetric
  distance matrix, its value is twice this one - confirm which is intended.
  """
  # pdist returns the condensed vector of pairwise distances (one per pair).
  pair_distances = scipy.spatial.distance.pdist(X, metric=Dis)
  return float(np.exp(-pair_distances).sum())