# Sampling_Assignment


In [169]:
import numpy as np
import pandas as pd
import random 
import math
from random import randint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.power import TTestIndPower
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier

## Loading Dataset

In [170]:
data = pd.read_csv('Creditcard_data.csv')
df = data.copy()
y = df['Class']
X = df.drop('Class',axis = 1)
s = StandardScaler()
X['Amount'] = s.fit_transform(X['Amount'].values.reshape(-1,1))

## Using 

In [171]:
ros = RandomOverSampler(random_state=42)
X,y = ros.fit_resample(X,y)
new_df = pd.concat([X,y],axis = 1)
# print(new_df)

In [172]:
def Systematic(df,step):
    ind = np.arange(0,len(df),step = len(df)/step)
    Syst = df.iloc[ind]
    # print(Syst)
    new_y = Syst['Class']
    new_x = Syst.drop('Class',axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.3,random_state=42)
    model1 = XGBClassifier()
    model2 = DecisionTreeClassifier()
    model3 = LGBMClassifier()
    model4 = SVC()
    model5 = GaussianNB()
    m = [model1,model2,model3,model4,model5]
    acc = []
    for i in range(0,5):
        new_model = m[i]
        new_model.fit(x_train,y_train)
        y_pred = new_model.predict(x_test)
        acc.append(accuracy_score(y_pred,y_test))
    return acc 

In [173]:
def SimpleRandom(df):
    np.random.seed(0)
    z = 1.96
    p = 0.5
    e = 0.05
    s_size = math.floor((pow(z,2)*p*(1-p))/pow(e,2))
    # print(s_size)
    ran_sam = df.sample(n = s_size,random_state = 0)
    # print(ran_sam)
    new_y = ran_sam['Class']
    new_x = ran_sam.drop('Class',axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.25,random_state=42)
    model1 = XGBClassifier()
    model2 = DecisionTreeClassifier()
    model3 = LGBMClassifier()
    model4 = SVC()
    model5 = GaussianNB()
    m = [model1,model2,model3,model4,model5]
    acc = []
    for i in range(0,5):
        new_model = m[i]
        new_model.fit(x_train,y_train)
        y_pred = new_model.predict(x_test)
        acc.append(accuracy_score(y_pred,y_test))
    return acc 
    

In [174]:
def Stratified(df):
    z = 1.96
    p = 0.5
    e = 0.05
    s_size = (math.floor(((pow(z,2))*p*(1-p))/pow(e/2,2)))
    Strat_df = df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n = int(s_size/4)))
    # print(Strat_df)
    new_y = Strat_df['Class']
    new_x = Strat_df.drop('Class',axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.25,random_state=42)
    model1 = XGBClassifier()
    model2 = DecisionTreeClassifier()
    model3 = LGBMClassifier()
    model4 = SVC()
    model5 = GaussianNB()
    m = [model1,model2,model3,model4,model5]
    acc = []
    for i in range(0,5):
        new_model = m[i]
        new_model.fit(x_train,y_train)
        y_pred = new_model.predict(x_test)
        acc.append(accuracy_score(y_pred,y_test))
    return acc 

In [175]:
def get_clustered_Sample(df):
    n_per_cluster = 100
    num_select_clusters = 5
    K = 10
    # print(K)

    samplex=pd.DataFrame()
    clusters=[]
    for i in range(K):
        clusters.append(df.sample(n_per_cluster))
    
    # while (samplex.shape[0])!=num_select_clusters:
    for i in range(num_select_clusters):
        h=np.random.randint(1,10)
        samplex=pd.concat([samplex,clusters[h]],axis=0)

    new_y = samplex['Class']
    new_x = samplex.drop('Class',axis = 1)
    x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.25,random_state=42)
    model1 = XGBClassifier()
    model2 = DecisionTreeClassifier()
    model3 = LGBMClassifier()
    model4 = SVC()
    model5 = GaussianNB()
    m = [model1,model2,model3,model4,model5]
    acc = []
    for i in range(0,5):
        new_model = m[i]
        new_model.fit(x_train,y_train)
        y_pred = new_model.predict(x_test)
        acc.append(accuracy_score(y_pred,y_test))
    return acc 

In [176]:
def Multistage(df):
   z = 1.96
   p = 0.5
   e = 0.05
   s_size = math.floor((pow(z,2)*p*(1-p))/pow(e,2))
   s = [0,1]
   a = np.random.choice(s,384)
   sam = pd.DataFrame()
   for i in a:
      sam = df[df['Class'].isin(a)].sample(n = s_size,replace = False)
   new_y = sam['Class']
   new_x = sam.drop('Class',axis = 1)
   x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.25,random_state=42)
   model1 = XGBClassifier()
   model2 = DecisionTreeClassifier()
   model3 = LGBMClassifier()
   model4 = SVC()
   model5 = GaussianNB()
   m = [model1,model2,model3,model4,model5]
   acc = []
   for i in range(0,5):
      new_model = m[i]
      new_model.fit(x_train,y_train)
      y_pred = new_model.predict(x_test)
      acc.append(accuracy_score(y_pred,y_test))
   return acc 
   
   

In [177]:
n = 1526
e = 0.05
step = math.floor(n/(1+(n*pow(e,2))))
acc1 = Systematic(new_df,step)
acc2 = SimpleRandom(new_df)
acc3 = Stratified(new_df)
acc4 = get_clustered_Sample(new_df)
acc5 = Multistage(new_df)


In [178]:
final_df = pd.DataFrame(list(zip(acc1,acc2,acc3,acc4,acc5)),columns = ['Systematic','Simple_Random','Stratified','Cluster','Multi-Stage'])
final_df.index = ['XGBClassifier','DecisionTreeClassifier','LGBMClassifier','SVM','Gaussian_Naive_Bayes']
print(final_df)

                        Systematic  Simple_Random  Stratified  Cluster  \
XGBClassifier             0.968421       0.989583    0.994792    0.968   
DecisionTreeClassifier    0.968421       0.947917    0.979167    0.960   
LGBMClassifier            0.989474       0.979167    0.994792    0.976   
SVM                       0.684211       0.656250    0.671875    0.640   
Gaussian_Naive_Bayes      0.757895       0.781250    0.781250    0.712   

                        Multi-Stage  
XGBClassifier              0.979167  
DecisionTreeClassifier     0.958333  
LGBMClassifier             0.989583  
SVM                        0.572917  
Gaussian_Naive_Bayes       0.531250  
