In [2]:
import pandas as pd
import numpy as np
import csv

In [3]:
f=pd.read_csv('data.csv',names=['balance','var1','var2','var3','var4'])




In [4]:
f.head()
#R for right-heavy, when var3*var4>var1*var2
#L for left-heavy, when var3*var4<var1*var2
#B for balanced, when var3*var4=var1*var2

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [5]:
f['balance'].value_counts()

R    288
L    288
B     49
Name: balance, dtype: int64

In [6]:
#balance=1, imbalance=0
f['balance']=[1 if b=='B'else 0 for b in f['balance']]

In [7]:
f['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

In [8]:
#logistic regression with default
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
y=f.balance
x=f.drop('balance',axis=1)

In [10]:
##############################
#drop the entire row
a=f.drop([0])
a.head()


Unnamed: 0,balance,var1,var2,var3,var4
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5
5,0,1,1,2,1


In [11]:
################################
#drop the column
b=f.drop('var3',axis=1)
b.head()

Unnamed: 0,balance,var1,var2,var4
0,1,1,1,1
1,0,1,1,2
2,0,1,1,3
3,0,1,1,4
4,0,1,1,5


In [12]:
#train the model
clf_0=LogisticRegression().fit(x,y)
#predict on training set
pred_y_0=clf_0.predict(x)

In [13]:
#how's the accuracy
print(accuracy_score(pred_y_0,y))

0.9216


# Up-sample minority class

In [14]:
from sklearn.utils import resample

Next, we'll create a new DataFrame with an up-sampled minority class. Here are the steps:

#First, we'll separate observations from each class into different DataFrames.
#Next, we'll resample the minority class with  **replacement**, setting the number of samples to match that of the majority class.
#Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.

In [15]:
df_majority=f[f.balance==0]
df_minority=f[f.balance==1]

In [16]:
#upsampling the minority
df_minority_upsample=resample(df_minority,
                              replace=True,     #sample with replacement
                              n_samples=576,     #to match majority class
                              random_state=123)  #reproducible results

In [17]:
#combine majority class with unsampled minority class
df_upsampled=pd.concat([df_majority,df_minority_upsample])

In [18]:
#display new class counts
df_upsampled['balance'].value_counts()

1    576
0    576
Name: balance, dtype: int64

In [19]:
#train the model again
y=df_upsampled['balance']
x=df_upsampled.drop('balance',axis=1)
clf_1=LogisticRegression().fit(x,y)

#predict on training set
pred_y_1=clf_1.predict(x)

#accuracy
print(accuracy_score(pred_y_1,y))

0.513888888889


# Down-sample majority class

The process is similar to that of up-sampling. Here are the steps:

#First, we'll separate observations from each class into different DataFrames.
#Next, we'll resample the majority class **without replacement**, setting the number of samples to match that of the minority class.
#Finally, we'll combine the down-sampled majority class DataFrame with the original minority class DataFrame.

In [20]:
df_majority=f[f.balance==0]
df_minority=f[f.balance==1]
df_majority_downsampled=resample(df_majority,
                                replace=False,
                                n_samples=49,
                                random_state=123)

In [21]:
#combine two dfs
df_downsampled=pd.concat([df_majority_downsampled,df_minority])

df_downsampled['balance'].value_counts()

1    49
0    49
Name: balance, dtype: int64

In [22]:
y=df_downsampled['balance']
x=df_downsampled.drop('balance',axis=1)
#train the model again
clf_2=LogisticRegression().fit(x,y)

#predict on the training set
pred_y_2=clf_2.predict(x)

#accuracy
print(accuracy_score(pred_y_2,y))

0.581632653061


# changing performance metric
#AUC

In [23]:
from sklearn.metrics import roc_auc_score

#To calculate AUROC, you'll need predicted class probabilities instead of just the predicted classes. You can get them using the .predict_proba()  function like so:

In [24]:
#predict class probabilities
prob_y_2=clf_2.predict_proba(x)

In [25]:
#keep only the positive class
prob_y_2= [p[1]for p in prob_y_2]

In [26]:
print(roc_auc_score(y,prob_y_2))

0.568096626406


In [27]:
#how does this compare to the original model
prob_y_0=clf_0.predict_proba(x)
prob_y_0=[p[1] for p in prob_y_0]
print(roc_auc_score(y,prob_y_0))

0.474802165764


### so in here, if use accuracy, clf_0 performs better. But use AUC, clf_2 performs better

# Penalized algorithm (SVM)

#### use penalized algorithm that increase the cost of classification mistakes on the minority class

In [28]:
from sklearn.svm import SVC

During training, we can use the argument **class_weight='balanced'**  to penalize mistakes on the minority class by an amount proportional to how under-represented it is.

We also want to include the argument **probability=True**  if we want to enable probability estimates for SVM algorithms.

In [29]:
y=f.balance
x=f.drop('balance',axis=1)

In [31]:
#Train the model
clf_3=SVC(kernel='linear',
         class_weight='balanced', #penalized
         probability=True)
clf_3.fit(x,y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
#predict on training set
pred_y_3=clf_3.predict(x)
#the accuracy of the model
print(accuracy_score(y,pred_y_3))

0.688


In [33]:
#the AUC
prob_y_3=clf_3.predict_proba(x)
prob_y_3=[p[1]for p in prob_y_3]
print(roc_auc_score(y,prob_y_3))

0.4694763322


# Use Tree-based algorithm

The final tactic we'll consider is using tree-based algorithms. Decision trees often perform well on imbalanced datasets because their hierarchical structure allows them to learn signals from both classes.

In modern applied machine learning, tree ensembles (Random Forests, Gradient Boosted Trees, etc.) almost always outperform singular decision trees, so we'll jump right into those:

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
#seperate the dataset
y=f.balance
x=f.drop('balance',axis=1)

In [36]:
#train the model
clf_4=RandomForestClassifier()
clf_4.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [38]:
#predict on the training set
pred_y_4=clf_4.predict(x)
#accuracy
print(accuracy_score(y,pred_y_4))

0.9824


In [41]:
#the AUC
prob_y_4=clf_4.predict_proba(x)
prob_y_4=[p[1] for p in prob_y_4]
print(roc_auc_score(y,prob_y_4))

0.999114229025
