In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
from random import randint 
import time
import random 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, plot_confusion_matrix
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn import metrics

In [None]:
#from orion.contrib.envs import load_env
#load_env()
import boto3
#from orion.sources import S3Source
aws_bucket = 'kilimanjaro-prod-datalake'
s3 = boto3.client('s3')

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
object_ = 'masters/datascience/emma/retail.csv'
retail = s3.get_object(Bucket=aws_bucket, Key=object_)
retail = pd.read_csv(retail['Body'], index_col=0)

In [None]:
retail = retail.rename(columns={'class':'cluster'})
retail.head(1)

In [None]:
wine_white = pd.read_csv('winequality-white.csv', delimiter=';')
wine_red = pd.read_csv('winequality-red.csv', delimiter=';')

In [None]:
# Red Wine Quality Low vs High
WineRLvH = deepcopy(wine_red)
WineRLvH = WineRLvH[(WineRLvH['quality'] <5)| (WineRLvH['quality'] >6)]
WineRLvH.loc[WineRLvH.quality >= 7, "class"] = 0
WineRLvH.loc[WineRLvH.quality <= 4, "class"] = 1
WineRLvH['class'] = WineRLvH['class'].astype("int")
WineRLvH['class'] = WineRLvH['class'].astype("category")
WineRLvH = WineRLvH.drop(columns=['quality'])
print(WineRLvH['class'].value_counts())
WineRLvH.name ='D12 - Wine_red_LvH'

In [None]:
# Low Quality Red Wine (3,4)
WineT = deepcopy(wine_red)
WineT["class"] = ""
WineT.loc[WineT.quality >= 7, "class"] = 1
WineT.loc[WineT.quality == 6, "class"] = 0
WineT.loc[WineT.quality == 5, "class"] = 0
WineT.loc[WineT.quality <= 4, "class"] = 2
WineT['class'] = WineT['class'].astype("category")
WineT = WineT.drop(columns=['quality'])
WineT.name = 'Wine test'
WineT['class'].value_counts()

In [None]:
# White Wine Quality Low vs High
WineWLvH = deepcopy(wine_white)
WineWLvH = WineWLvH[(WineWLvH['quality'] <5)| (WineWLvH['quality'] >6)]
WineWLvH.loc[WineWLvH.quality >= 7, "class"] = 0
WineWLvH.loc[WineWLvH.quality <= 4, "class"] = 1
WineWLvH['class'] = WineWLvH['class'].astype("int")
WineWLvH['class'] = WineWLvH['class'].astype("category")
WineWLvH = WineWLvH.drop(columns=['quality'])
print(WineWLvH['class'].value_counts())
WineWLvH.name = 'D10 - Wine_white_LvH'

In [None]:
# Cust Summary focusing on singular minority
cust_sum2 = deepcopy(retail)
cust_sum2.loc[cust_sum2.cluster ==0, "class"] = 0
cust_sum2.loc[cust_sum2.cluster >= 1, "class"] = 1
cust_sum2['class'] = cust_sum2['class'].astype("int")
cust_sum2['class'] = cust_sum2['class'].astype("category")
cust_sum2 = cust_sum2.drop(columns=['cluster'])
print(cust_sum2['class'].value_counts())
cust_sum2.name ='Retail Data -2'

In [70]:
# train test/ scaling
def data_prep (data, seed):
  X= data.drop('class',axis=1).copy()
  y = data['class'].copy()
  y = y.astype('category')
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed, shuffle=y, stratify=y) # add ssed

  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
  X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [72]:
def random_under_minority (data, imbalance_level, seed):

  random.seed(seed)
  
  X_train_scaled, X_test_scaled, y_train, y_test = data_prep(data, seed)
  y_train = y_train.reset_index(drop=True)
  X_train_scaled['class'] = y_train

  class_counts = X_train_scaled['class'].value_counts().to_frame()
  maj_count = class_counts.iloc[0,0]
  min_count = class_counts.iloc[1,0]

  majority = X_train_scaled[(X_train_scaled['class'] == 0)]
  minority = X_train_scaled[(X_train_scaled['class']== 1)]

  if imbalance_level == 'absolute':
    downsample = 6
  
  else:
    if imbalance_level == 'high':
      imbalance = 0.05
      
    elif imbalance_level == 'extreme':
      imbalance = 0.01

    downsample = (maj_count * imbalance).round().astype('int')
    
    if imbalance_level == 'extreme' and downsample < 8:
      downsample = 8
 
  if downsample >= min_count:
    minority_sample = minority
      
  else:
    minority_sample = minority.sample(n= downsample)

  final = pd.concat([majority, minority_sample])
  final = shuffle(final)

  X_train_scaled = final.drop('class',axis=1).copy()
  y_train = final['class'].copy()
  y_train = y_train.astype('category')

  X_train_scaled = X_train_scaled
  X_test_scaled = X_test_scaled
  y_train = y_train.reset_index(drop=True)
  y_test = y_test.reset_index(drop=True)

  return X_train_scaled, X_test_scaled, y_train, y_test

In [393]:
X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(WineRLvH,'high', 2)

In [366]:
y_train = y_train.reset_index(drop=True)

In [367]:
y_train.value_counts()

0    989
1    163
2     47
Name: class, dtype: int64

In [345]:
X_train_scaled

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.493186,-0.200761,-0.422799,-0.436785,-0.243628,-0.562582,-0.077561,-0.576077,0.536405,-0.686870,-0.667028
1,1.064921,-1.043492,1.066376,-0.303840,-0.747991,-1.028679,-1.151054,-1.178037,-0.693574,0.168507,1.604767
2,0.949505,-0.987310,1.015025,-0.370312,-0.558855,-0.935459,-1.089712,-0.495458,-0.499367,-0.800920,1.510109
3,0.199306,0.838608,-0.884957,-0.636203,0.050584,-1.121898,-1.089712,0.364485,0.018519,-0.059593,-0.809015
4,0.949505,-0.200761,-0.371448,5.612233,-0.348704,-0.282924,0.106467,2.541215,-0.952517,0.168507,-1.140319
...,...,...,...,...,...,...,...,...,...,...,...
1194,0.141598,-0.200761,0.039359,-0.436785,0.470886,0.369612,2.744194,0.230119,-2.441438,7.353672,-0.477712
1195,-0.089233,1.203791,-0.936307,-0.370312,-0.432764,-1.028679,-1.120383,-1.393023,-0.175688,-0.515794,1.320793
1196,0.026183,0.529606,-0.782255,-0.503258,-0.327689,2.047561,-0.016218,-0.930804,0.471669,1.309009,1.320793
1197,2.334489,-0.594035,1.117727,1.357979,-0.306674,0.276393,-0.108232,0.552597,-1.858817,-0.287694,-0.856344


In [134]:
data = X_train_scaled

In [136]:
data = data.reset_index(drop=True)
            
label_f = np.array(label[label==0])
label_t = np.array(label[label== 1])
            
data_f = np.array(data[label==0])
data_t = np.array(data[label== 1])

In [60]:
label = y_train

In [388]:
class SingularMatrixException(Exception):
    def __init__(self):
        Exception.__init__(self,"Singular data matrix... use subspace") 

class MAHAKIL(object):
    def __init__(self, pfp=0.5):
        self.data_t = None  # Save the initial defect sample
        self.pfp = pfp  # Proportion of expected defect samples
        self.T = 0  # Number of defect samples to be generated
        self.new = []  # Store newly generated samples

    # Core method
    # return : data_new, label_new
    def fit_sample(self, data, label):
        
        for i in range(1, 3):
            print(type(data))
            
            label_f = np.array(label[label==0])
            label_t = np.array(label[label== i])
            
            data_f = np.array(data[label==0])
            data_t = np.array(data[label== i])
            
            data_t = data_t+0.00001*np.random.rand((data_t.shape)[0],(data_t.shape)[1])
            self.T = int(len(data_f) / (1 - self.pfp) - len(data_f))
            self.data_t = np.array(data_t)
            
            # Calculate the Mahalanobis distance
            d = 0
            d = self.mahalanobis_distance(self.data_t)
        
            d = pd.DataFrame (d,columns=['Malhabonis Distance'])
            d = d.reset_index(drop=False)
            d = d.values.tolist()
        

            # Descending order
            d.sort(key=lambda x: x[1], reverse=True)
            # Divide the set of positive examples into two
            k = len(d)
            d_index = [d[i][0] for i in range(k)]
            d_index = [ int(d) for d in d_index ]
            data_t_sorted = [data_t[i] for i in d_index]
            
            mid = int(k/2)
            bin1 = [data_t_sorted[i] for i in range(0, mid)]
            bin2 = [data_t_sorted[i] for i in range(mid, k)]
            # Loop iteration to generate new samples
            l_ = len(bin1)
            print("l", l_)
            mark = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535, 131071, 262143, 524287]
            p = self.T / (l_ +0.0)
            is_full = True
            g = mark.index([m for m in mark if m > p][0]) + 1
            cluster = 2 ** (g - 1)  # Number of children of the last generation
            if (self.T - mark[g-2]*l_) < cluster:
                # Explain that adding more generations is better than keeping a few
                is_full = False
                g -= 1
                k = 0
            else:
                k = l_ - round((self.T - mark[g-2]*l_)/cluster)
            self.generate_new_sample(bin1, bin2, g, l_, k, is_full)
            # Return data and labels
            label_new = np.ones(len(self.new))+ (i-1)
            data = np.append(data, self.new, axis=0)
            data = pd.DataFrame(data)
            label = np.append(label, label_new, axis=0)
        return data, label 

    def mahalanobis_distance(self, x):
        x_mu = x - np.mean(x)
        cov = np.cov(x.T)
        inv_covmat = np.linalg.inv(cov)
        left = np.dot(x_mu, inv_covmat)
        mahal = np.dot(left, x_mu.T).diagonal()
        return mahal


    # Generate new samples
    def generate_new_sample(self, bin1, bin2, g, l, k, is_full):
        # bin1, bin2 are arrays
        # g Hereditary remaining algebra
        # l bin1 number of items
        # k The number of each node to be cropped in the last generation
        # is_full whether it overflows, that is, the last generation is counted, whether it exceeds T, or is not full
        assert len(bin1) <= len(bin2)
        if g >= 2 or (g == 1 and is_full is False):
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            self.new.extend(lv_0)
            self.generate_new_sample(lv_0, bin1, g-1, l, k, is_full)
            self.generate_new_sample(lv_0, bin2, g-1, l, k, is_full)
        if g == 1 and is_full:
            lv_0 = []  # Offspring
            for i in range(l):
                # Generate children
                lv_0.append(np.mean(np.append(np.atleast_2d(bin1[i]), np.atleast_2d(bin2[i]), axis=0), axis=0))
            del lv_0[-1: (-k-1): -1]
            self.new.extend(lv_0)


In [389]:
mk = MAHAKIL(pfp=0.5)

In [390]:
X_train_scaled = X_train_scaled.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'

In [391]:
y_train = y_train.reset_index(drop=True)

AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'

In [386]:
X_train_scaled, y_train = mk.fit_sample(X_train_scaled, y_train)

<class 'pandas.core.frame.DataFrame'>
l 81
p 12.209876543209877
T 989
<class 'pandas.core.frame.DataFrame'>
l 23
p 43.0
T 989


In [387]:
pd.DataFrame(y_train).value_counts()

2.0    2039
1.0    1154
0.0     989
dtype: int64

In [360]:
model_base = GaussianNB()
model_base.fit(X_train_scaled, y_train)

y_pred = model_base.predict(X_test_scaled)
score_f1 = f1_score(y_test, y_pred, average="macro")
GMS = geometric_mean_score(y_test, y_pred, average = 'multiclass')
conf = confusion_matrix(y_test, y_pred)

In [361]:
score_f1

0.37150976885616244

In [362]:
conf

array([[121,  69, 140],
       [  6,  40,   8],
       [  6,   1,   9]])

In [319]:
y_train = y_train

In [320]:
X_train_scaled = pd.DataFrame(X_train_scaled)

In [321]:
X_train_scaled = X_train_scaled.reset_index(drop=True)

In [322]:
y_train

array([1., 0., 0., ..., 1., 1., 1.])

In [323]:
X_train_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.007213,-0.874946,0.347464,-0.104422,-0.054492,-1.028679,-0.905684,0.031257,-0.887781,-0.059593,0.184895
1,0.083890,-0.874946,-0.371448,0.161469,0.050584,0.183173,0.351837,0.660090,0.212726,0.225532,-0.761686
2,-0.666309,2.664525,-1.398465,0.294415,0.302765,-0.842240,-0.936356,-0.490083,0.795348,-1.143071,-0.383054
3,-0.320063,-0.762581,0.758271,-0.370312,-0.033477,-0.655801,-0.782999,-0.812561,0.536405,-0.002568,1.131476
4,0.083890,0.726244,1.117727,2.355069,0.723067,1.674683,3.204263,2.272483,0.018519,2.734638,-1.045660
...,...,...,...,...,...,...,...,...,...,...,...
2033,0.456294,-1.317378,0.330618,-0.314745,-0.515837,-0.291659,-0.593693,-0.822296,-0.309204,1.002509,1.245367
2034,1.057714,-0.220065,1.828622,0.222758,-0.327029,-0.568408,-0.633471,0.010096,-0.948464,0.712928,1.976007
2035,-0.589662,0.111753,-1.305383,-0.497024,0.107393,-0.660166,-0.965587,-0.868154,0.262297,-0.409761,0.578321
2036,-1.136077,0.241682,-0.789468,-0.176083,0.656091,0.840084,-0.298483,-0.887047,1.057332,0.292365,0.887439


In [324]:
label_f = np.array(y_train[y_train==0])
label_t = np.array(y_train[y_train==2])

In [325]:
label_t

array([1., 1., 1., ..., 1., 1., 1.])

In [326]:
data_f = np.array(X_train_scaled[y_train==0])
data_t = np.array(X_train_scaled[y_train==2])

In [327]:
data_t = data_t+0.00001*np.random.rand((data_t.shape)[0],(data_t.shape)[1])

In [328]:
T = len(data_f) / (1 - 0.5) - len(data_f)
T

989.0

In [329]:
x_mu = data_t - np.mean(data_t)
cov = np.cov(data_t.T)
inv_covmat = np.linalg.inv(cov)
left = np.dot(x_mu, inv_covmat)
mahal = np.dot(left, x_mu.T).diagonal()

In [330]:
d= mahal

In [331]:
d = pd.DataFrame (mahal,columns=['Malhabonis Distance'])
d = d.reset_index(drop=False)
d = d.values.tolist()

In [332]:
d.sort(key=lambda x: x[1], reverse=True)

In [333]:
k = len(d)
k

1049

In [334]:
d_index = [d[i][0] for i in range(k)]

In [335]:
d_index = [ int(d) for d in d_index ]

In [336]:
data_t_sorted = [data_t[i] for i in d_index]

In [337]:
mid = int(k/2)

In [338]:
bin1 = [data_t_sorted[i] for i in range(0, mid)]
bin2 = [data_t_sorted[i] for i in range(mid, k)]

In [339]:
l_ = len(bin1)+0.0
l_

524.0

In [226]:
p = T / l_
p

41.208333333333336

In [227]:
mark = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047]

In [228]:
mark = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535, 131071, 262143, 524287]

In [229]:
g = mark.index([m for m in mark if m > p][0]) + 1

In [230]:
k = len(d)
d_index = [d[i][0] for i in range(k)]
d_index = [ int(d) for d in d_index ]
data_t_sorted = [data_t[i] for i in d_index]
mid = int(k/2)
bin1 = [data_t_sorted[i] for i in range(0, mid)]
bin2 = [data_t_sorted[i] for i in range(mid, k)]

In [231]:
bin1

[array([-0.2623491 , -1.26821614,  2.09339858,  3.81747467, -0.79001501,
         2.00096004,  7.0995162 , -1.94123033, -1.92354793, -0.85793975,
         1.79408844]),
 array([-0.20464138, -1.2120378 ,  0.91232444, -0.30383143,  2.69849001,
        -1.02867464, -0.93634795, -1.71549285, -1.01724635,  0.85280875,
         1.98340789]),
 array([ 3.83489186, -1.77385609,  0.8609774 , -0.23736331, -0.26464301,
        -0.56257503, -0.69097569,  1.7618978 , -1.53512997,  1.02389236,
        -1.14031359]),
 array([ 2.10366277, -0.87494165,  2.50420308,  1.09209538, -0.45377606,
        -0.84223776, -0.26158549,  1.95000852, -0.56409331, -0.34471557,
         2.4566996 ]),
 array([ 0.89180036, -0.70639334,  1.32313395,  2.75391449,  0.19769494,
        -0.93545298, -0.84433623,  0.98256891, -0.24042109,  0.90984357,
         1.2261406 ]),
 array([ 0.14160377, -1.71767446,  0.45017228, -0.43678094, -0.49580432,
         3.44585627,  0.93459985, -0.39333941,  1.05429494,  1.19496871,
         

In [None]:
X_train_scaled, X_test_scaled, y_train, y_test = random_under_minority(WineRLvH,'high', 2)

In [395]:
minClass = np.argmin(np.bincount(y_train.astype(int)))
minClass

1