## Imports

### Explore data

In [1]:
## Imports
from sklearn.neighbors import NearestNeighbors
import numpy as np 
import pandas as pd 

import math
import scipy  

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot## Exploratory Data Analysis

import warnings
warnings.filterwarnings('ignore')

## Exploratory Data Analysis

### Import data

In [2]:
def read_Dataset(y):
    df = pd.read_csv(y)
    print("The dataset: "+y+ " has {} credit record".format(len(df)))
    training_data, testing_data = train_test_split(df, test_size=0.2, random_state=44)    
    return df,training_data, testing_data

### Data Visualization

In [3]:
def visualise(df):
    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]])
    colors = ['pink', 'skyblue'] 
    fig.add_trace(go.Pie(labels=df[df.columns[-1]].value_counts().index,
                                 values=df[df.columns[-1]].value_counts().values), 1, 1)

    fig.update_traces(hoverinfo='label+percent', textfont_size=20,
                      marker=dict(colors=colors, line=dict(color='#000000', width=2)))

    fig.add_trace(go.Bar(x=df[df.columns[-1]].value_counts().index, y=df[df.columns[-1]].value_counts().values, marker_color = colors), 1,2)

    fig.show()

## Model developement

In [4]:
def get_labels(df):
    valuecounts=df[df.columns[-1]].value_counts().index
    majlabel=valuecounts[0]
    minlabel=valuecounts[1:]
    if len(minlabel)==1:
        minlabel=int(minlabel[0])
    return minlabel,int(majlabel)

In [5]:
def findNeighbours(maj,df):
    #find neighbours for each instance in the whole dataset 
    neigh = NearestNeighbors(n_neighbors=6)
    neigh.fit(df)
   # NearestNeighbors(n_neighbors=5) change it to six 
    NNAllDatasetar = neigh.kneighbors(df, return_distance=False)
    NNAllDataset = pd.DataFrame(NNAllDatasetar, columns = ['index','n1','n2','n3','n4','n5'])
    
    #find the subset of the instances that belong to the majority class
    NNMAJ = NNAllDataset[NNAllDataset.index.isin(maj.index)]
    NNMAJ = NNMAJ.drop(['index'], axis=1)
    NNMAJ =pd.merge(maj, NNMAJ, left_index=True, right_index=True)
    return(NNMAJ)
    

In [6]:
def calculateweight(df,maj):
    #  add  a column in which for each neighbooring point by 1 if it belongs to the majority class and 0 if it belongs to the minority class
    df.insert(len(df.columns) ,"n1W", np.where(df["n1"].isin(maj.index) , 1, 0))
    df.insert(len(df.columns) ,"n2W", np.where(df["n2"].isin(maj.index) , 1, 0))
    df.insert(len(df.columns) ,"n3W", np.where(df["n3"].isin(maj.index) , 1, 0))
    df.insert(len(df.columns) ,"n4W", np.where(df["n4"].isin(maj.index) , 1, 0))
    df.insert(len(df.columns) ,"n5W", np.where(df["n5"].isin(maj.index) , 1, 0))
    df
    
    #  Calculate the weight 
    df.insert( len(df.columns) ,"weight", df[['n1W', 'n2W','n3W', 'n4W', 'n5W']].mean(axis=1),True)
    
    #  Sort the dataframe by the weight 
    df=df.sort_values(by=['weight'] ,ascending=False)
    
    return df

In [7]:
#def repPoints(maj,df,n):
 #   resultfinal= df.head(n)
  #  return maj[maj.index.isin(resultfinal.index)]

In [8]:

def calculate(df,minlabel, majlabel):
    """
    
    """
    P = (df[df.columns[-1]] == minlabel).sum() #Number of instances in the minority class
    N = (df[df.columns[-1]] == majlabel).sum() #Number of instances in the majority class
   
    n1 = P**2/N
    
    majclass = df[df[df.columns[-1]] == majlabel]
    sigma = np.var(majclass.to_numpy()) #Strandard deviation of the majority class  # square root variance formula
    Zalpha = scipy.stats.norm.ppf(.05) #the critical value of the Z test at the significance level α
    epsilone =  pow(10,-4) # acceptable tolerance error that can be adjusted as required 10 power -4
    
    n2= (N*Zalpha*epsilone*sigma)/((N*epsilone**2)+ (Zalpha*epsilone*sigma**2))
    
    pr = n2/n1
    
    M = 1.5
    
    if pr < 1:
         size = n1
    elif pr > M:
         size = n1*M
    else:
         size = n2
    return size
    

In [9]:
def concatinate(df,ds):
    ds1 = df 
    ds1 = ds1.iloc[: , :-1]
    ds2 = ds
   # ds1 = ds1.drop(ds1.index[0]) #if the same instance can't be concatinated with itself 
    
    mainds= pd.DataFrame()
    m=len(ds2.index)
    for p in range(m):
        mainds = mainds.append((ds1.assign(key=1).merge(ds2.head(1).assign(key=1), on='key').drop('key',axis=1)), ignore_index=True)
        ds2 = ds2.iloc[1: , :] 
    return (mainds)

In [10]:
#figure out the subset
def representeticepointssel(df,c):
    if df.size<=c: #number of rows
        tmpSet = df
    else:
        tmpSet = []
        vecB= np.mean(df.to_numpy())
        for i in range(c):
            maxDist = 0
            for p in df.iterrows():  # iterate it by rows not columns
                if i==0:
                    minDist = scipy.spatial.distance.pdist((p, vecB),'euclidean') #vecA is p vecB mean value of all the rows in df
                else:
                    # for a given p, if p's min distance to any q in tmpset is biggest, then p is next representative point 
                    minDist = np.min([scipy.spatial.distance.pdist((p, q),'euclidean') for q in tmpSet])
                if minDist >= maxDist:
                    maxPoint = p
                    maxDist = minDist
            tmpSet.append(maxPoint)
        return(tmpSet)

In [11]:
def concattest(df,finalMaj):
    ds1 = df.iloc[: , :-1]
    df = pd.concat([ds1, df], axis=1)
    df.columns = list(finalMaj.columns)
    return df

In [12]:
def repPoints(neighboursds,df,n):

    rep1 = neighboursds[neighboursds["weight"] > neighboursds.head(n)["weight"].iloc[-1]]
    rep2 = neighboursds[neighboursds["weight"] == neighboursds.head(n)["weight"].iloc[-1]]

    rep1 =  df[df.index.isin(rep1.index)]
    rep2 =  df[df.index.isin(rep2.index)]

    if  (neighboursds.head(n)["weight"].iloc[-1] == neighboursds.head(n+1)["weight"].iloc[-1]):
        if (len(rep1.index)>0):
            m=n-len(rep1.index)
            rep2= df.head(m) #representeticepointssel(rep2,m)
            resultfinal= rep1.append(rep2)
        else :
            rep2= df.head(n) #representeticepointssel(rep2,n)
            resultfinal= rep1.append(rep2)   

    else :
        resultfinal= df.head(n) 
    return resultfinal


In [13]:
def con(dataset,compare):
    df, dftrain , dftest = read_Dataset("datasets/"+dataset) # use a dataset that they are using 
    #visualise(df)


    minlabel, majlabel= get_labels(dftrain)
    majds = dftrain[dftrain[dftrain.columns[-1]] == majlabel]
    minds = dftrain[dftrain[dftrain.columns[-1]] == minlabel]

    IRO= len(majds.index)/len(minds.index)

    neighboursds = findNeighbours(majds,df)
    neighboursds = calculateweight(neighboursds,majds)

    size = calculate(dftrain,minlabel, majlabel)

    reppoints=repPoints(neighboursds,majds,math.ceil(size)) # use math.ceil(size)  or math math.floor(size)


    finalMaj=concatinate(majds,reppoints)
    finalMin=concatinate(minds,minds) 

    IRcon= len(finalMaj.index)/len(finalMin.index)

    finaldf=pd.concat([finalMaj,finalMin])
    finaltest =concattest(dftest,finalMaj)   

    compare.loc[len(compare.index)] = [dataset, len(dftrain.index), len(majds.index),len(minds.index),IRO,len(reppoints.index),len(finaldf.index),len(finalMaj.index),len(finalMin.index),IRcon] 
    return finaldf, finaltest , compare

In [14]:
datasets=["abalone9-18.csv","Breast.csv","ecoli-0-1_vs_2-3-5.csv","ecoli-0-1_vs_5.csv","ecoli-0-1-4-7_vs_5-6.csv",
"ecoli-0-2-3-4_vs_5.csv","ecoli-0-4-6_vs_5.csv","ecoli-0-6-7_vs_5.csv","ecoli2.csv","ecoli3.csv", "glass0123vs456.csv",
"glass0.csv","glass1.csv","glass6.csv","haberman.csv","iris1.csv","leaf.csv","new-thyroid1.csv","new-thyroid2.csv",
"parkinsons.csv","seeds.csv","spect.csv"]

In [15]:
def scores(y_test,predictions):
    accuracy= round(accuracy_score(y_test, predictions)*100, 2)
    F1 = round(f1_score(y_test, predictions, average='weighted')*100, 2)
    precision = round(precision_score(y_test, predictions, average='weighted')*100, 2)
    recall = round(recall_score(y_test, predictions, average='weighted')*100, 2)
    AUC = round(roc_auc_score(y_test, predictions, average='weighted')*100, 2)
    return accuracy,F1,precision,recall,AUC

In [16]:
def compsvc(datasets):
    classificationCompare = pd.DataFrame(columns=['Dataset','Version','Accuracy score %', "F1 Score %",'precision score %', "recall Score %", "AUC Score %"])
    compare = pd.DataFrame(columns=['Dataset','original count','original Majority','original Minority'," Imbalance ratio original",'representitive points','con count','con Majority','con Minority'," Imbalance ratio con"])

    for dataset in datasets:
        #classification on original dataset
        df = pd.read_csv("datasets/"+dataset)
        
        X = df.drop(df.columns[-1], axis=1)
        y = df[df.columns[-1]]
        
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)
        
        svc_or = SVC(kernel='linear')
        svc_or.fit(X_train, y_train)
        
        predictions_or = svc_or.predict(X_test)
        
        accuracy,F1,precision,recall,AUC =  scores(y_test,predictions_or)
        classificationCompare.loc[len(classificationCompare.index)]=[dataset,"original", accuracy,F1,precision,recall,AUC]
        #classification on concatinated dataset
        svc_con = SVC(kernel='linear')        
        finaldf, finaltest , compare=con(dataset,compare)
        X_train_con = finaldf.drop(finaldf.columns[-1], axis=1)
        X_test_con = finaltest.drop(finaltest.columns[-1], axis=1)

        y_train_con = finaldf[finaldf.columns[-1]]
        y_test_con = finaltest[finaltest.columns[-1]]
        
        svc_con.fit(X_train_con, y_train_con)
        predictions_con = svc_con.predict(X_test_con)
        accuracy,F1,precision,recall,AUC =  scores(y_test_con,predictions_con)
        classificationCompare.loc[len(classificationCompare.index)]=[dataset,"concatinated", accuracy,F1,precision,recall,AUC]

    return classificationCompare, compare

In [17]:
classificationCompare , compare= compsvc(datasets)


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The dataset: datasets/abalone9-18.csv has 731 credit record
The dataset: datasets/Breast.csv has 569 credit record
The dataset: datasets/ecoli-0-1_vs_2-3-5.csv has 244 credit record
The dataset: datasets/ecoli-0-1_vs_5.csv has 240 credit record
The dataset: datasets/ecoli-0-1-4-7_vs_5-6.csv has 332 credit record
The dataset: datasets/ecoli-0-2-3-4_vs_5.csv has 202 credit record
The dataset: datasets/ecoli-0-4-6_vs_5.csv has 203 credit record
The dataset: datasets/ecoli-0-6-7_vs_5.csv has 220 credit record
The dataset: datasets/ecoli2.csv has 336 credit record



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The dataset: datasets/ecoli3.csv has 336 credit record
The dataset: datasets/glass0123vs456.csv has 214 credit record
The dataset: datasets/glass0.csv has 214 credit record
The dataset: datasets/glass1.csv has 214 credit record
The dataset: datasets/glass6.csv has 214 credit record
The dataset: datasets/haberman.csv has 306 credit record



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The dataset: datasets/iris1.csv has 150 credit record



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The dataset: datasets/leaf.csv has 340 credit record
The dataset: datasets/new-thyroid1.csv has 215 credit record
The dataset: datasets/new-thyroid2.csv has 215 credit record
The dataset: datasets/parkinsons.csv has 195 credit record
The dataset: datasets/seeds.csv has 210 credit record
The dataset: datasets/spect.csv has 267 credit record


In [18]:
compare

Unnamed: 0,Dataset,original count,original Majority,original Minority,Imbalance ratio original,representitive points,con count,con Majority,con Minority,Imbalance ratio con
0,abalone9-18.csv,584,548,36,15.222222,4,3488,2192,1296,1.691358
1,Breast.csv,455,282,173,1.630058,107,60103,30174,29929,1.008186
2,ecoli-0-1_vs_2-3-5.csv,195,176,19,9.263158,3,889,528,361,1.462604
3,ecoli-0-1_vs_5.csv,192,177,15,11.8,2,579,354,225,1.573333
4,ecoli-0-1-4-7_vs_5-6.csv,265,249,16,15.5625,2,754,498,256,1.945312
5,ecoli-0-2-3-4_vs_5.csv,161,146,15,9.733333,2,517,292,225,1.297778
6,ecoli-0-4-6_vs_5.csv,162,147,15,9.8,2,519,294,225,1.306667
7,ecoli-0-6-7_vs_5.csv,176,157,19,8.263158,3,832,471,361,1.304709
8,ecoli2.csv,268,227,41,5.536585,8,3497,1816,1681,1.080309
9,ecoli3.csv,268,240,28,8.571429,4,1744,960,784,1.22449


In [19]:
classificationCompare.head(50)

Unnamed: 0,Dataset,Version,Accuracy score %,F1 Score %,precision score %,recall Score %,AUC Score %
0,abalone9-18.csv,original,95.92,93.92,92.0,95.92,50.0
1,abalone9-18.csv,concatinated,40.82,53.62,96.18,40.82,69.15
2,Breast.csv,original,98.25,98.23,98.29,98.25,97.44
3,Breast.csv,concatinated,98.25,98.25,98.25,98.25,98.05
4,ecoli-0-1_vs_2-3-5.csv,original,93.88,92.67,94.27,93.88,70.0
5,ecoli-0-1_vs_2-3-5.csv,concatinated,51.02,60.14,83.54,51.02,55.0
6,ecoli-0-1_vs_5.csv,original,97.92,97.81,97.96,97.92,90.0
7,ecoli-0-1_vs_5.csv,concatinated,70.83,76.5,92.32,70.83,83.72
8,ecoli-0-1-4-7_vs_5-6.csv,original,94.03,93.71,93.75,94.03,82.47
9,ecoli-0-1-4-7_vs_5-6.csv,concatinated,67.16,72.36,85.72,67.16,71.65
