In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as sklearn_lm
import matplotlib.pyplot as mp_plt
import seaborn as sns
import sklearn.decomposition as sklearn_dcmp
import sklearn.preprocessing as sklearn_pproc
import imblearn.over_sampling as imblearn_osmpl
import sklearn.metrics as sklearn_mt
import pdb

%matplotlib inline

### Load Data

In [2]:
# generate training and test set
def split_train_test(data, NUM_SAMPLES, dr=0.8):
  NUM_TRAIN = int(NUM_SAMPLES * dr) # dr = data ratio
  NUM_TEST = NUM_SAMPLES - NUM_TRAIN
#   pdb.set_trace()
  train_indexes = np.random.choice(NUM_SAMPLES, NUM_TRAIN, replace=False)
  test_indexes = np.setdiff1d(np.arange(0,NUM_SAMPLES), train_indexes, assume_unique=True)
  train_data = data[train_indexes]
  test_data = data[test_indexes]
  
  X_train = train_data[:,:-1]
  y_train = train_data[:,-1]
  X_test = test_data[:,:-1]
  y_test = test_data[:,-1]
  
  return {'train':[NUM_TRAIN, X_train, y_train], 'test':[NUM_TEST, X_test, y_test]}

# load data
data = pd.read_csv('german.txt', delimiter=',', header=None)
data = data.values
NUM_SAMPLES = data.shape[0]

# define constants
NUM_CLASSES = 2
NUM_ITER = 50


### Functions for Data Analysis

In [3]:
def get_corr(X):
  # compute covariance matrix
  X_cov = np.dot(X.T, X) / X.shape[0]
  
  mp_plt.figure(figsize=(8,6))
  ax = mp_plt.gca()
  sns.heatmap(X_cov, xticklabels=np.arange(1,25), yticklabels=np.arange(1,25))
  ax.set_title('Feature Correlation')
  

In [4]:
# compute ROC curve and ROC area for each class
def make_roc(y_test, y_score):  
  y_test = 2- y_test
  mp_plt.figure(figsize=(12,12))
  mp_plt.subplots_adjust(hspace=0.5)
  fpr, tpr, _ = sklearn_mt.roc_curve(y_test, y_score[:, 0], pos_label=1)
  roc_auc = sklearn_mt.auc(fpr, tpr)

  mp_plt.plot(fpr, tpr, color='darkorange', 
              linewidth=2, label='ROC curve (area = %0.2f)' % roc_auc)
  mp_plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
  mp_plt.xlim([0.0, 1.0])
  mp_plt.ylim([0.0, 1.05])
  mp_plt.xlabel('False Positive Rate')
  mp_plt.ylabel('True Positive Rate')
  mp_plt.title('Receiver operating characteristic for Class: {} (Good Credit Score)'.format(1))
  mp_plt.legend(loc="lower right")


In [5]:
# make boxplots
def make_boxplots(acc, recall, precision):
  mp_plt.figure(figsize=(12,12))
  mp_plt.subplots_adjust(wspace=0.5)
  mp_plt.subplot(1,3,1)
  mp_plt.boxplot(acc)
  mp_plt.ylim(acc.min()-0.05, acc.max()+0.05)
  mp_plt.title('Boxplot for accuracy')
  mp_plt.subplot(1,3,2)
  mp_plt.boxplot(recall)
  mp_plt.ylim(acc.min()-0.05, acc.max()+0.05)
  mp_plt.title('Boxplot for recall')
  mp_plt.subplot(1,3,3)
  mp_plt.boxplot(precision)
  mp_plt.ylim(acc.min()-0.05, acc.max()+0.05)
  mp_plt.title('Boxplot for precision')

### Classification with Subsampling

In [6]:
def get_accuracy(y_test, y_pred):
  return np.float64(sum(y_test == y_pred)) / np.float64(y_test.size)

def linear_classifier(data, NUM_SAMPLES, NUM_ITER):
  # initialise
  acc = np.empty((NUM_ITER, 1))
  recall = np.empty((NUM_ITER, 1))
  precision = np.empty((NUM_ITER, 1))
  
  for i in range(NUM_ITER):
    # random subsampling
    l1_idx = np.asarray([i for i in range(NUM_SAMPLES) if data[i,-1] == 1])
    l2_idx = np.asarray([i for i in range(NUM_SAMPLES) if data[i,-1] == 2])
    l1_idx = np.random.choice(l1_idx, l2_idx.size, replace=False)
    data_subsampled = np.concatenate((data[l1_idx], data[l2_idx]), axis=0)
    np.random.shuffle(data_subsampled)
    
    # generate training and test data
    data_dict = split_train_test(data_subsampled, data_subsampled.shape[0])
    NUM_TRAIN, NUM_TEST = data_dict['train'][0], data_dict['test'][0]
    X_train, X_test = data_dict['train'][1], data_dict['test'][1]
    y_train, y_test = data_dict['train'][2], data_dict['test'][2]

    # normalise data
    standardscaler = sklearn_pproc.StandardScaler()
    X_train = standardscaler.fit_transform(X_train.astype('float64'))
    X_test = standardscaler.transform(X_test.astype('float64'))
    
    # data correlation
#     get_corr(X_train)
    
    # dimensionality reduction
    pca = sklearn_dcmp.PCA(n_components=11, whiten=True)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    # randomly sample data from both classes
#     clf = sklearn_lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=10)
    clf = sklearn_lm.LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_score_pred = clf.decision_function(X_test)
    y_score_pred = clf.predict_proba(X_test)
    
    # compute accuracy, recall and precision
    y_test = 2 - y_test
    y_pred = 2 - y_pred
    acc[i] = sklearn_mt.accuracy_score(y_test, y_pred)
    recall[i] = sklearn_mt.recall_score(y_test, y_pred)
    precision[i] = sklearn_mt.precision_score(y_test, y_pred)

  return [acc, recall, precision, y_test, y_score_pred]

acc, recall, precision, y_test, y_score = linear_classifier(data, NUM_SAMPLES, 100)
print(np.mean(acc), np.var(acc))
# make_roc(y_test, y_score)
# make_boxplots(acc, recall, precision)

0.7215833333333334 0.0014390208333333332


### Classification with SMOTE

In [7]:
def get_accuracy(y_test, y_pred):
  return np.float64(sum(y_test == y_pred)) / np.float64(y_test.size)

def linear_classifier(data, NUM_ITER):
  # initialise
  acc = np.empty((NUM_ITER, 1))
  recall = np.empty((NUM_ITER, 1))
  precision = np.empty((NUM_ITER, 1))
  
  for i in range(NUM_ITER):
    # random oversampling
    smote_sample = imblearn_osmpl.SMOTE(ratio='auto')
    X_res, y_res = smote_sample.fit_sample(data[:,:-1], data[:,-1])
    data_oversampled = np.concatenate((X_res, np.expand_dims(y_res, axis=-1)), axis=-1)
    np.random.shuffle(data_oversampled)

    # generate training and test data
    data_dict = split_train_test(data_oversampled, data_oversampled.shape[0])
    NUM_TRAIN, NUM_TEST = data_dict['train'][0], data_dict['test'][0]
    X_train, X_test = data_dict['train'][1], data_dict['test'][1]
    y_train, y_test = data_dict['train'][2], data_dict['test'][2]

    # normalise data
    standardscaler = sklearn_pproc.StandardScaler()
    X_train = standardscaler.fit_transform(X_train)
    X_test = standardscaler.transform(X_test)
    
    # data correlation
#     get_corr(X_train)
    
    # dimensionality reduction
#     pca = sklearn_dcmp.PCA(n_components=2, whiten=True)
#     X_train = pca.fit_transform(X_train)
#     X_test = pca.transform(X_test)
    
    # randomly sample data from both classes
#     clf = sklearn_lm.SGDClassifier(loss='log', penalty='l2', max_iter=10)
    clf = sklearn_lm.LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_score_pred = clf.predict_proba(X_test)
    
    # compute accuracy, recall and precision
    y_test = 2 - y_test
    y_pred = 2 - y_pred
    acc[i] = sklearn_mt.accuracy_score(y_test, y_pred)
    recall[i] = sklearn_mt.recall_score(y_test, y_pred)
    precision[i] = sklearn_mt.precision_score(y_test, y_pred)

  return [acc, recall, precision, y_test, y_score_pred]

acc, recall, precision, y_test, y_score = linear_classifier(data, 100)
print(np.mean(acc), np.var(acc))
# make_roc(y_test, y_score)
# make_boxplots(acc, recall, precision)

0.7444999999999999 0.0005569948979591839


In [8]:
import sklearn.model_selection as smd

def get_accuracy(y_test, y_pred):
  return np.float64(sum(y_test == y_pred)) / np.float64(y_test.size)

def linear_classifier(data, NUM_ITER):
  # initialise
  acc = np.empty((NUM_ITER, 1))
  recall = np.empty((NUM_ITER, 1))
  precision = np.empty((NUM_ITER, 1))
  
  for i in range(NUM_ITER):
    # random oversampling
    smote_sample = imblearn_osmpl.SMOTE(ratio='auto')
    X_res, y_res = smote_sample.fit_sample(data[:,:-1], data[:,-1])
    data_oversampled = np.concatenate((X_res, np.expand_dims(y_res, axis=-1)), axis=-1)
    np.random.shuffle(data_oversampled)

    # generate training and test data
    data_dict = split_train_test(data_oversampled, data_oversampled.shape[0])
    NUM_TRAIN, NUM_TEST = data_dict['train'][0], data_dict['test'][0]
    X_train, X_test = data_dict['train'][1], data_dict['test'][1]
    y_train, y_test = data_dict['train'][2], data_dict['test'][2]

    # normalise data
    standardscaler = sklearn_pproc.StandardScaler()
    X_train = standardscaler.fit_transform(X_train)
    X_test = standardscaler.transform(X_test)
    
    # data correlation
#     get_corr(X_train)
    
    # dimensionality reduction
    pca = sklearn_dcmp.PCA(n_components=11, whiten=True)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    # randomly sample data from both classes
#     clf = sklearn_lm.SGDClassifier(loss='log', penalty='l2', max_iter=10)
    clf = sklearn_lm.LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # compute accuracy, recall and precision
    y_test = 2 - y_test
    y_pred = 2 - y_pred
    acc[i] = sklearn_mt.accuracy_score(y_test, y_pred)
    recall[i] = sklearn_mt.recall_score(y_test, y_pred)
    precision[i] = sklearn_mt.precision_score(y_test, y_pred)

  return [acc, recall, precision]

acc, recall, precision = linear_classifier(data, 100)
print(np.mean(acc), np.var(acc))

0.7421428571428572 0.000546173469387755
