In [3]:
import pandas as pd
import numpy as np
from copy import deepcopy
from random import randint 
import time
import random 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, plot_confusion_matrix
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn import metrics

In [4]:
#from orion.contrib.envs import load_env
#load_env()
import boto3
#from orion.sources import S3Source
aws_bucket = 'kilimanjaro-prod-datalake'
s3 = boto3.client('s3')

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
object_ = 'masters/datascience/emma/retail.csv'
retail = s3.get_object(Bucket=aws_bucket, Key=object_)
retail = pd.read_csv(retail['Body'], index_col=0)

In [7]:
retail = retail.rename(columns={'class':'cluster'})
retail.head(1)

Unnamed: 0,ASP,cluster,loyaltyaccount_No,loyaltyaccount_Yes,gender_female,gender_male,gender_unknown,shipcountry_Albania,shipcountry_Armenia,shipcountry_Australia,...,category_Childrens,category_Infant,category_Junior,category_Mens,category_Miscellaneous,category_Nursery,category_Womens,divisioncode_ACCESSORY,divisioncode_APPAREL,divisioncode_FOOTWEAR
0,59.92,0,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [8]:
wine_white = pd.read_csv('winequality-white.csv', delimiter=';')
wine_red = pd.read_csv('winequality-red.csv', delimiter=';')

In [9]:
# Red Wine Quality Low vs High
WineRLvH = deepcopy(wine_red)
WineRLvH = WineRLvH[(WineRLvH['quality'] <5)| (WineRLvH['quality'] >6)]
WineRLvH.loc[WineRLvH.quality >= 7, "class"] = 0
WineRLvH.loc[WineRLvH.quality <= 4, "class"] = 1
WineRLvH['class'] = WineRLvH['class'].astype("int")
WineRLvH['class'] = WineRLvH['class'].astype("category")
WineRLvH = WineRLvH.drop(columns=['quality'])
print(WineRLvH['class'].value_counts())
WineRLvH.name ='D12 - Wine_red_LvH'

0    217
1     63
Name: class, dtype: int64


In [10]:
# White Wine Quality Low vs High
WineWLvH = deepcopy(wine_white)
WineWLvH = WineWLvH[(WineWLvH['quality'] <5)| (WineWLvH['quality'] >6)]
WineWLvH.loc[WineWLvH.quality >= 7, "class"] = 0
WineWLvH.loc[WineWLvH.quality <= 4, "class"] = 1
WineWLvH['class'] = WineWLvH['class'].astype("int")
WineWLvH['class'] = WineWLvH['class'].astype("category")
WineWLvH = WineWLvH.drop(columns=['quality'])
print(WineWLvH['class'].value_counts())
WineWLvH.name = 'D10 - Wine_white_LvH'

0    1060
1     183
Name: class, dtype: int64


In [11]:
# Cust Summary focusing on singular minority
cust_sum2 = deepcopy(retail)
cust_sum2.loc[cust_sum2.cluster ==0, "class"] = 0
cust_sum2.loc[cust_sum2.cluster >= 1, "class"] = 1
cust_sum2['class'] = cust_sum2['class'].astype("int")
cust_sum2['class'] = cust_sum2['class'].astype("category")
cust_sum2 = cust_sum2.drop(columns=['cluster'])
print(cust_sum2['class'].value_counts())
cust_sum2.name ='Retail Data -2'

0    1935848
1     269025
Name: class, dtype: int64


In [12]:
# train test/ scaling
def data_prep (data, seed):
  X= data.drop('class',axis=1).copy()
  y = data['class'].copy()
  y = y.astype('category')
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed, shuffle=y, stratify=y) # add ssed

  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
  X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [13]:
def random_under_minority (data, imbalance_level, seed):

  random.seed(seed)
  
  X_train_scaled, X_test_scaled, y_train, y_test = data_prep(data, seed)
  y_train = y_train.reset_index(drop=True)
  X_train_scaled['class'] = y_train

  class_counts = X_train_scaled['class'].value_counts().to_frame()
  maj_count = class_counts.iloc[0,0]
  min_count = class_counts.iloc[1,0]

  majority = X_train_scaled[(X_train_scaled['class'] == 0)]
  minority = X_train_scaled[(X_train_scaled['class']== 1)]

  if imbalance_level == 'absolute':
    downsample = 6
  
  else:
    if imbalance_level == 'high':
      imbalance = 0.05
      
    elif imbalance_level == 'extreme':
      imbalance = 0.01

    downsample = (maj_count * imbalance).round().astype('int')
    
    if imbalance_level == 'extreme' and downsample < 8:
      downsample = 8
 
  if downsample >= min_count:
    minority_sample = minority
      
  else:
    minority_sample = minority.sample(n= downsample)

  final = pd.concat([majority, minority_sample])
  final = shuffle(final)

  X_train_scaled = final.drop('class',axis=1).copy()
  y_train = final['class'].copy()
  y_train = y_train.astype('category')

  X_train_scaled = X_train_scaled
  X_test_scaled = X_test_scaled
  y_train = y_train.reset_index(drop=True)
  y_test = y_test.reset_index(drop=True)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [14]:
X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(cust_sum2, 'high', 3)

In [26]:
class SingularMatrixException(Exception):
    def __init__(self):
        Exception.__init__(self,"Singular data matrix... use subspace") 

class MAHAKIL(object):
    def __init__(self, pfp=0.5):
        self.data_t = None  # Save the initial defect sample
        self.pfp = pfp  # Proportion of expected defect samples
        self.T = 0  # Number of defect samples to be generated
        self.new = []  # Store newly generated samples

    # Core method
    # return : data_new, label_new
    def fit_sample(self, data, label):
        
        label = label.reset_index(drop=True)
        data = data.reset_index(drop=True)
        
        label_f = np.array(label[label==0])
        label_t = np.array(label[label==1])
        
        data_f = np.array(data[label==0])
        data_t = np.array(data[label==1])
        
        data_t = data_t+0.00001*np.random.rand((data_t.shape)[0],(data_t.shape)[1])
  
        self.T = len(data_f) / (1 - self.pfp) - len(data_f)
        self.data_t = np.array(data_t)
        print('sorted data')
        # Calculate the Mahalanobis distance
        d = self.mahalanobis_distance(self.data_t)
        
        d = pd.DataFrame (d,columns=['Malhabonis Distance'])
        d = d.reset_index(drop=False)
        d = d.values.tolist()
        
        print('calculated distance')
        # Descending order
        d.sort(key=lambda x: x[1], reverse=True)
        # Divide the set of positive examples into two
        k = len(d)
        d_index = [d[i][0] for i in range(k)]
        d_index = [ int(d) for d in d_index ]
        data_t_sorted = [data_t[i] for i in d_index]
        mid = int(k/2)
        bin1 = [data_t_sorted[i] for i in range(0, mid)]
        bin2 = [data_t_sorted[i] for i in range(mid, k)]
        # Loop iteration to generate new samples
        l_ = len(bin1)
        mark = [1, 3, 7, 15, 31, 63, 127, 255, 511]
        p = self.T / l_
        is_full = True
        g = mark.index([m for m in mark if m > p][0]) + 1
        cluster = 2 ** (g - 1)  # Number of children of the last generation
        if (self.T - mark[g-2]*l_) < cluster:
            # Explain that adding more generations is better than keeping a few
            is_full = False
            g -= 1
            k = 0
        else:
            k = l_ - round((self.T - mark[g-2]*l_)/cluster)
        self.generate_new_sample(bin1, bin2, g, l_, k, is_full)
        # Return data and labels
        label_new = np.ones(len(self.new))
        print(type(data))
        return np.append(data, self.new, axis=0), np.append(label, label_new, axis=0)

    def mahalanobis_distance(self, x):
        x_mu = x - np.mean(x)
        cov = np.cov(x.T)
        inv_covmat = np.linalg.inv(cov)
        left = np.dot(x_mu, inv_covmat)
        mahal = np.dot(left, x_mu.T).diagonal()
        return mahal


    # Generate new samples
    def generate_new_sample(self, bin1, bin2, g, l, k, is_full):
        # bin1, bin2 are arrays
        # g Hereditary remaining algebra
        # l bin1 number of items
        # k The number of each node to be cropped in the last generation
        # is_full whether it overflows, that is, the last generation is counted, whether it exceeds T, or is not full
        assert len(bin1) <= len(bin2)
        if g >= 2 or (g == 1 and is_full is False):
            print('reached')
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            self.new.extend(lv_0)
            self.generate_new_sample(lv_0, bin1, g-1, l, k, is_full)
            self.generate_new_sample(lv_0, bin2, g-1, l, k, is_full)
        if g == 1 and is_full:
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            del lv_0[-1: (-k-1): -1]
            self.new.extend(lv_0)

In [33]:
X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(cust_sum2, 'high', 3)

In [34]:
mk = MAHAKIL(pfp=0.5)

In [35]:
X_train_scaled, y_train = mk.fit_sample(X_train_scaled, y_train)

sorted data
calculated distance
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
reached
<class 'pandas.core.frame.DataFrame'>


In [13]:
y_train = y_train.reset_index(drop=True)
X_train_scaled = X_train_scaled.reset_index(drop=True)

In [14]:
label_f = np.array(y_train[y_train==0])
label_t = np.array(y_train[y_train==1])

In [15]:
data_f = np.array(X_train_scaled[y_train==0])
data_t = np.array(X_train_scaled[y_train==1])

In [16]:
data_t = data_t+0.00001*np.random.rand((data_t.shape)[0],(data_t.shape)[1])

In [17]:
x_mu = data_t - np.mean(data_t)
cov = np.cov(data_t.T)
inv_covmat = np.linalg.inv(cov)
left = np.dot(x_mu, inv_covmat)
mahal = np.dot(left, x_mu.T).diagonal()

In [26]:
d= mahal

NameError: name 'mahal' is not defined

In [None]:
d = pd.DataFrame (mahal,columns=['Malhabonis Distance'])
d = d.reset_index(drop=False)
d = d.values.tolist()

In [21]:
md.sort(key=lambda x: x[1], reverse=True)

In [22]:
d=md

In [24]:
k = len(d)
k

72594

In [26]:
d_index = [d[i][0] for i in range(k)]

In [27]:
d_index = [ int(d) for d in d_index ]

In [28]:
data_t_sorted = [data_t[i] for i in d_index]

In [25]:
mid = int(k/2)

In [29]:
bin1 = [data_t_sorted[i] for i in range(0, mid)]
bin2 = [data_t_sorted[i] for i in range(mid, k)]

In [148]:
k = len(d)
d_index = [d[i][0] for i in range(k)]
d_index = [ int(d) for d in d_index ]
data_t_sorted = [data_t[i] for i in d_index]
mid = int(k/2)
bin1 = [data_t_sorted[i] for i in range(0, mid)]
bin2 = [data_t_sorted[i] for i in range(mid, k)]

In [122]:
def fit_sample(data, label):
        # data: sample array containing metric information
        # label: sample label array
        data_t, data_f, label_t, label_f = [], [], [], []
        # Divide the data set according to positive examples and negative examples
        for i in range(label.shape[0]):
            if label[i] == 1:
                data_t.append(data[i])
                label_t.append(label[i])
            if label[i] == 0:
                data_f.append(data[i])
                label_f.append(label[i])
        data_t = np.array(data_t)
        print('sorted data')
        # Calculate the Mahalanobis distance
        d = mahalanobis_distance(data_t)
        print(d)
        print('calculated distance')
        print(type(d))
        # Descending order
        d.sort(key=lambda x: x[1], reverse=True)
        print(d)
        print(type(data_t))
        

In [127]:
X_train_scaledw, X_test_scaledw, y_trainw, y_testw = random_under_minority(WineRLvH, 'high', 3)

In [128]:
X_train_scaledw = X_train_scaledw.values
y_trainw = y_trainw.values

In [132]:
d = fit_sample(X_train_scaledw, y_trainw )

sorted data
[(0, 2.6809826641406698), (1, 0.8095923412214483), (2, 1.2052161935969954), (3, 0.9058553549091795), (4, 1.1019685282270388), (5, 0.9408875271837776), (6, 0.8884113402518116), (7, 0.7949815004522905)]
calculated distance
<class 'list'>
[(0, 2.6809826641406698), (2, 1.2052161935969954), (4, 1.1019685282270388), (5, 0.9408875271837776), (3, 0.9058553549091795), (6, 0.8884113402518116), (1, 0.8095923412214483), (7, 0.7949815004522905)]
<class 'numpy.ndarray'>


In [130]:
def mahalanobis_distance(x):
        # x: array
        mu = np.mean(x, axis=0)  # Mean
        d = []
        for i in range(x.shape[0]):
            x_mu = np.atleast_2d(x[i] - mu)
            s = cov(x)
            d_squre = np.dot(np.dot(x_mu, np.linalg.inv(s)), np.transpose(x_mu))[0][0]
            d_tuple = (i, d_squre)
            d.append(d_tuple)
        return d


In [131]:

    def cov(x):
        # x: array
        s = np.zeros((x.shape[1], x.shape[1]))
        mu = np.mean(x, axis=0)  # Mean
        for i in range(x.shape[0]):
            x_xbr = np.atleast_2d(x - mu)
            s_i = np.dot(np.transpose(x_xbr), x_xbr)
            s = s + s_i
        return np.divide(s, x.shape[0])