## Imports

### Explore data

In [1]:
## Imports
from sklearn.neighbors import NearestNeighbors
import numpy as np 
import pandas as pd 

import math
import scipy  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

## Exploratory Data Analysis

### Import data

In [2]:
def read_Dataset(y, test_size=0.2, random_state=44):
    """Load a CSV file and split it into training and testing frames.

    Parameters
    ----------
    y : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to ``train_test_split``).
    random_state : int, default 44
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        The full dataset, the training split and the testing split.
    """
    df = pd.read_csv(y)
    # str.format (rather than "+" concatenation) also accepts pathlib.Path inputs;
    # the printed text is unchanged for plain strings.
    print("The dataset: {} has {} credit record".format(y, len(df)))
    training_data, testing_data = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return df, training_data, testing_data

### Data Visualization

In [3]:
def visualise(df):
    """Plot the class distribution of ``df``'s last column.

    Draws a pie chart (left) and a bar chart (right) of the value counts of
    the last column and shows the figure. Returns nothing.
    """
    # Count each class once and reuse the result for both charts.
    class_counts = df[df.columns[-1]].value_counts()
    palette = ['pink', 'skyblue']

    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]])

    pie = go.Pie(labels=class_counts.index, values=class_counts.values)
    fig.add_trace(pie, 1, 1)

    # This styling call runs while the pie is the only trace in the figure;
    # the bar trace is added afterwards, so keep this ordering.
    pie_marker = dict(colors=palette, line=dict(color='#000000', width=2))
    fig.update_traces(hoverinfo='label+percent', textfont_size=20, marker=pie_marker)

    bars = go.Bar(x=class_counts.index, y=class_counts.values, marker_color=palette)
    fig.add_trace(bars, 1, 2)

    fig.show()

## Model development

In [4]:
def get_labels(df):
    """Return ``(minority_label, majority_label)`` from the last column of ``df``.

    Labels are ranked by frequency with ``value_counts``. The most frequent
    one is the majority label and is returned as ``int``. If exactly one other
    label exists it is returned as ``int``; otherwise the remaining labels are
    returned as a pandas Index (possibly empty).
    """
    ranked_labels = df[df.columns[-1]].value_counts().index
    remaining = ranked_labels[1:]
    # Binary case: unwrap the single minority label to a plain int.
    minority = int(remaining[0]) if len(remaining) == 1 else remaining
    return minority, int(ranked_labels[0])

In [5]:
def findNeighbours(maj,df):
    """Attach the five nearest neighbours to every majority-class instance.

    A 6-nearest-neighbour search is fitted on all columns of ``df`` and run on
    ``df`` itself. The first hit of each row is stored under ``'index'`` and
    dropped; presumably it is the row itself (not guaranteed when ``df``
    contains duplicate rows). The other five become ``n1`` .. ``n5``.

    Parameters
    ----------
    maj : DataFrame
        Majority-class rows; its index labels must be a subset of ``df.index``.
    df : DataFrame
        Dataset in which neighbours are searched.

    Returns
    -------
    DataFrame
        ``maj`` joined (on index) with columns ``n1`` .. ``n5`` holding the
        index labels of each row's neighbours in ``df``.
    """
    # NOTE(review): every column of df is used as a feature, including the
    # label column if the caller passes it -- confirm this is intended.
    neigh = NearestNeighbors(n_neighbors=6)
    neigh.fit(df)
    neighbour_positions = neigh.kneighbors(df, return_distance=False)

    # kneighbors returns positional row numbers. Translate them to df's index
    # labels so the result is also correct when df does not carry a default
    # 0..n-1 index (for a default RangeIndex this is a no-op).
    neighbour_labels = df.index.to_numpy()[neighbour_positions]
    NNAllDataset = pd.DataFrame(
        neighbour_labels,
        index=df.index,
        columns=['index', 'n1', 'n2', 'n3', 'n4', 'n5'],
    )

    # Keep only the majority-class instances and join their neighbour columns.
    NNMAJ = NNAllDataset[NNAllDataset.index.isin(maj.index)]
    NNMAJ = NNMAJ.drop(['index'], axis=1)
    NNMAJ = pd.merge(maj, NNMAJ, left_index=True, right_index=True)
    return NNMAJ
    

In [6]:
def calculateweight(df,maj):
    """Score each row by the share of its neighbours that are majority-class.

    For every neighbour column ``n1`` .. ``n5`` a flag column ``n1W`` ..
    ``n5W`` is added: 1 if the neighbour's label is in ``maj.index``, else 0.
    ``weight`` is the mean of the five flags.

    The input frame is left untouched (the work is done on a copy), so the
    function can be called repeatedly on the same ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``n1`` .. ``n5``.
    maj : DataFrame
        Majority-class rows; only its index is used.

    Returns
    -------
    DataFrame
        Copy of ``df`` with the flag columns and ``weight`` appended, sorted
        by ``weight`` in descending order.
    """
    neighbour_cols = ["n1", "n2", "n3", "n4", "n5"]
    weighted = df.copy()

    flag_cols = []
    for col in neighbour_cols:
        flag = col + "W"
        weighted[flag] = np.where(weighted[col].isin(maj.index), 1, 0)
        flag_cols.append(flag)

    weighted["weight"] = weighted[flag_cols].mean(axis=1)

    return weighted.sort_values(by=['weight'], ascending=False)

In [7]:
#def repPoints(maj,df,n):
 #   resultfinal= df.head(n)
  #  return maj[maj.index.isin(resultfinal.index)]

In [8]:

def calculate(df,minlabel, majlabel):
    """Compute the number of majority-class representatives to keep.

    Two candidate sizes are derived from the last (label) column of ``df``:

    * ``n1 = P**2 / N`` where ``P`` / ``N`` are the minority / majority counts.
    * ``n2`` from a formula using ``N``, the z critical value ``ppf(0.05)``,
      a tolerance of ``1e-4`` and the variance of the majority-class rows.

    The ratio ``n2 / n1`` selects the result: below 1 -> ``n1``; above 1.5 ->
    ``1.5 * n1``; otherwise ``n2``. The value is returned unrounded.
    """
    label_column = df[df.columns[-1]]
    minority_count = (label_column == minlabel).sum()
    majority_count = (label_column == majlabel).sum()

    n1 = minority_count**2/majority_count

    majority_rows = df[df[df.columns[-1]] == majlabel]
    # NOTE(review): this is the variance over *all* cells of the majority rows
    # (label column included), yet it is used as "sigma" below -- confirm
    # whether a standard deviation was intended.
    sigma = np.var(majority_rows.to_numpy())
    # NOTE(review): ppf(0.05) is negative (lower tail) -- confirm the sign.
    z_alpha = scipy.stats.norm.ppf(.05)
    tolerance = pow(10,-4)

    numerator = majority_count*z_alpha*tolerance*sigma
    denominator = (majority_count*tolerance**2)+ (z_alpha*tolerance*sigma**2)
    n2 = numerator/denominator

    ratio = n2/n1
    upper_ratio = 1.5

    if ratio < 1:
        return n1
    if ratio > upper_ratio:
        return n1*upper_ratio
    return n2
    

In [9]:
def concatinate(df,ds):
    """Pair every row of ``df`` (minus its last column) with every row of ``ds``.

    For each row of ``ds``, in order, all rows of ``df`` without their last
    column are cross-joined with that single row. Column names present on
    both sides receive pandas' default ``_x`` (from ``df``) and ``_y`` (from
    ``ds``) suffixes; the last column of ``ds`` is carried over.

    Parameters
    ----------
    df : DataFrame
        Left-hand rows; the last column (the label) is dropped.
    ds : DataFrame
        Right-hand rows, kept whole.

    Returns
    -------
    DataFrame
        ``len(df) * len(ds)`` rows with a fresh 0..n-1 index. An empty
        ``DataFrame()`` is returned when ``ds`` has no rows.
    """
    left = df.iloc[:, :-1].assign(key=1)

    # Collect the per-row blocks and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    blocks = []
    for pos in range(len(ds.index)):
        right = ds.iloc[[pos]].assign(key=1)
        blocks.append(left.merge(right, on='key').drop('key', axis=1))

    if not blocks:
        return pd.DataFrame()
    return pd.concat(blocks, ignore_index=True)

In [10]:
#figure out the subset
def representeticepointssel(df,c):
    """Select up to ``c`` well-scattered rows of ``df`` (farthest-point selection).

    The first pick is the row farthest (Euclidean distance) from the
    column-wise mean of ``df``. Every following pick is the row whose minimum
    distance to the rows already picked is largest. Ties go to the earliest
    row, and a row is never picked twice.

    Parameters
    ----------
    df : DataFrame
        Numeric rows to choose from; all columns enter the distance.
    c : int
        Number of rows wanted.

    Returns
    -------
    DataFrame
        ``df`` itself when it has at most ``c`` rows, otherwise the ``c``
        selected rows in the order they were picked.
    """
    # Compare against the number of rows (df.size would be rows * columns).
    if len(df.index) <= c:
        return df

    points = df.to_numpy(dtype=float)

    # Before anything is picked, rank rows by distance to the centroid.
    spread = np.linalg.norm(points - points.mean(axis=0), axis=1)

    chosen = []
    for _ in range(c):
        pick = int(np.argmax(spread))
        chosen.append(pick)

        to_pick = np.linalg.norm(points - points[pick], axis=1)
        # After the first pick, track each row's distance to its nearest
        # already-chosen row.
        spread = to_pick if len(chosen) == 1 else np.minimum(spread, to_pick)
        # Distances are >= 0, so -1 keeps chosen rows out of later rounds.
        spread[pick] = -1.0

    return df.iloc[chosen]

In [11]:
def concattest(df,finalMaj):
    """Reshape a test frame to the paired-feature column layout of ``finalMaj``.

    The columns of ``df`` without the last one are placed side by side with
    the complete ``df`` (so the features appear twice, followed by the last
    column), and the result takes the column names of ``finalMaj``. The row
    index of ``df`` is kept and ``df`` itself is not modified.
    """
    features_only = df.iloc[: , :-1]
    paired = pd.concat([features_only, df], axis=1)
    paired.columns = list(finalMaj.columns)
    return paired

In [12]:
def repPoints(neighboursds,df,n):
    """Pick the ``n`` highest-weighted rows of ``df``.

    ``neighboursds`` must hold a ``weight`` column and be sorted by it in
    descending order; its index labels identify rows of ``df``. The weight of
    its ``n``-th row is the cut-off. All rows above the cut-off are kept, and
    the remaining places are filled with rows whose weight equals the
    cut-off, taken in ``df`` order.

    Parameters
    ----------
    neighboursds : DataFrame
        Weighted neighbour table, sorted by ``weight`` descending.
    df : DataFrame
        Frame the selected rows are taken from.
    n : int
        Number of rows wanted.

    Returns
    -------
    DataFrame
        At most ``n`` rows of ``df``: rows above the cut-off first, then tied
        rows. Empty when ``n <= 0`` or ``neighboursds`` is empty.
    """
    if n <= 0 or len(neighboursds.index) == 0:
        return df.head(0)

    weights = neighboursds["weight"]
    cutoff = weights.head(n).iloc[-1]

    above_idx = neighboursds[weights > cutoff].index
    tied_idx = neighboursds[weights == cutoff].index

    above = df[df.index.isin(above_idx)]
    tied = df[df.index.isin(tied_idx)]

    # Fill the remaining places from the tied rows only. Previously the first
    # rows of df were taken regardless of weight, which could also duplicate
    # rows already selected above the cut-off.
    # TODO: a scatter-based choice among the tied rows could replace head().
    remaining = max(n - len(above.index), 0)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([above, tied.head(remaining)])


In [18]:
def con(dataset):
    """Run the whole resampling pipeline for one CSV file under ``datasets/``.

    Steps: load and split the data, plot the class distribution, weight every
    majority training instance by its neighbourhood, choose representative
    majority points, then build paired train and test frames.

    Returns
    -------
    tuple
        ``(neighboursds, finaldf, finaltest)``: the weighted neighbour table,
        the paired training frame (majority pairs followed by minority pairs)
        and the test frame in the same column layout.
    """
    full_df, train_df, test_df = read_Dataset("datasets/"+dataset)
    visualise(full_df)

    minlabel, majlabel = get_labels(train_df)
    majority_train = train_df[train_df[train_df.columns[-1]] == majlabel]
    minority_train = train_df[train_df[train_df.columns[-1]] == minlabel]

    # Imbalance ratio before resampling; computed but neither shown nor returned.
    imbalance_before = len(majority_train.index)/len(minority_train.index)

    # NOTE(review): neighbours are searched in the full dataset (test rows and
    # label column included) -- confirm this is intended.
    weighted_neighbours = findNeighbours(majority_train, full_df)
    weighted_neighbours = calculateweight(weighted_neighbours, majority_train)

    target_size = calculate(train_df, minlabel, majlabel)

    # The size is rounded up to a whole number of representative points.
    representatives = repPoints(weighted_neighbours, majority_train, math.ceil(target_size))

    paired_majority = concatinate(majority_train, representatives)
    paired_minority = concatinate(minority_train, minority_train)

    # Imbalance ratio after pairing; computed but neither shown nor returned.
    imbalance_after = len(paired_majority.index)/len(paired_minority.index)

    paired_train = pd.concat([paired_majority, paired_minority])
    paired_test = concattest(test_df, paired_majority)

    return weighted_neighbours, paired_train, paired_test

In [19]:
# Run the pipeline on the Pima Indians diabetes CSV; con() prefixes the name with "datasets/".
neighboursds,finaldf, finaltest = con("pima-indians-diabetes.csv")

The dataset: datasets/pima-indians-diabetes.csv has 768 credit record


In [20]:
# Display the first value returned by con(): the neighbour table with its weight column.
neighboursds

Unnamed: 0,1,2,3,4,5,6,7,8,label,n1,n2,n3,n4,n5,n1W,n2W,n3W,n4W,n5W,weight
173,1,79,60,42,48,43.5,0.678,23,1,534,747,234,462,532,1,1,1,1,1,1.0
334,1,95,60,18,58,23.9,0.260,22,1,383,224,32,288,563,1,1,1,1,1,1.0
340,1,130,70,13,105,25.9,0.472,22,1,384,530,527,465,765,1,1,1,1,1,1.0
350,4,92,80,0,0,42.2,0.237,29,1,354,564,226,758,583,1,1,1,1,1,1.0
396,3,96,56,34,115,24.7,0.944,39,1,147,567,277,454,593,1,1,1,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,6,165,68,26,168,33.6,0.631,49,1,14,114,130,259,427,0,0,0,0,0,0.0
725,4,112,78,40,0,39.4,0.236,38,1,314,141,590,386,37,0,0,0,0,0,0.0
260,3,191,68,15,130,30.9,0.299,34,1,696,110,646,175,498,0,0,0,0,0,0.0
335,0,165,76,43,255,47.9,0.259,26,1,31,608,215,425,485,0,0,0,0,0,0.0


In [15]:
# Display the second value returned by con(): the paired training frame.
finaldf

Unnamed: 0,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,label
0,0,125,68,0,0,24.7,0.206,21,3,78,70,0,0,32.5,0.270,39,1
1,0,93,60,25,92,28.7,0.532,22,3,78,70,0,0,32.5,0.270,39,1
2,3,78,70,0,0,32.5,0.270,39,3,78,70,0,0,32.5,0.270,39,1
3,0,137,84,27,0,27.3,0.231,59,3,78,70,0,0,32.5,0.270,39,1
4,4,137,84,0,0,31.2,0.252,30,3,78,70,0,0,32.5,0.270,39,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41611,2,197,70,99,0,34.7,0.575,62,3,129,64,29,115,26.4,0.219,28,0
41612,0,162,76,56,100,53.2,0.759,25,3,129,64,29,115,26.4,0.219,28,0
41613,5,137,108,0,0,48.8,0.227,37,3,129,64,29,115,26.4,0.219,28,0
41614,0,181,88,44,510,43.3,0.222,26,3,129,64,29,115,26.4,0.219,28,0


In [16]:
# Display the third value returned by con(): the test frame in the paired column layout.
finaltest

Unnamed: 0,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,label
452,0,91,68,32,210,39.9,0.381,25,0,91,68,32,210,39.9,0.381,25,1
370,3,173,82,48,465,38.4,2.137,25,3,173,82,48,465,38.4,2.137,25,0
746,1,147,94,41,0,49.3,0.358,27,1,147,94,41,0,49.3,0.358,27,0
122,2,107,74,30,100,33.6,0.404,23,2,107,74,30,100,33.6,0.404,23,1
614,11,138,74,26,144,36.1,0.557,50,11,138,74,26,144,36.1,0.557,50,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,9,122,56,0,0,33.3,1.114,33,9,122,56,0,0,33.3,1.114,33,0
238,9,164,84,21,0,30.8,0.831,32,9,164,84,21,0,30.8,0.831,32,0
634,10,92,62,0,0,25.9,0.167,31,10,92,62,0,0,25.9,0.167,31,1
731,8,120,86,0,0,28.4,0.259,22,8,120,86,0,0,28.4,0.259,22,0
