In [1]:
import pandas as pd
import numpy as np

In [2]:
import copy

def dummy_encode_categorical_columns(data):
    """One-hot encode every column of ``data``.

    Despite the name, no dtype filtering happens: each column, whatever its
    dtype, is replaced by its ``pd.get_dummies`` indicator columns, named
    ``"<column>: <value>"``. Indicator groups appear in the same order as the
    source columns. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all to be encoded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only indicator columns (or an empty copy of
        ``data`` when it has no columns).
    """
    # Encode each column once and concatenate a single time, instead of
    # re-concatenating (and copying) the growing frame on every iteration.
    encoded_parts = [
        pd.get_dummies(data[column], prefix=column, prefix_sep=': ')
        for column in data.columns
    ]
    if not encoded_parts:
        # pd.concat rejects an empty list; nothing to encode anyway.
        return data.copy()
    return pd.concat(encoded_parts, axis=1)

In [3]:
def parse_file(name):
    """Load a CSV and split it into a one-hot feature matrix and labels.

    Every cell equal to ``'positive'`` becomes 1 and every cell equal to
    ``'negative'`` becomes 0 (the replacement is applied to the whole frame,
    not only to the label column). Column ``'V10'`` is taken as the label
    vector; all remaining columns are one-hot encoded with
    ``dummy_encode_categorical_columns``.

    Parameters
    ----------
    name : path or file-like accepted by ``pd.read_csv``

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Integer feature matrix and the label array.
    """
    frame = pd.read_csv(name, sep=',')
    # Same order as before: 'positive' first, then 'negative'.
    for label, code in (('positive', 1), ('negative', 0)):
        frame = frame.replace(to_replace=label, value=code)
    y = np.array(frame['V10'])
    features = frame.drop('V10', axis=1)
    encoded = dummy_encode_categorical_columns(features)
    return np.array(encoded).astype(int), y
    

In [4]:
# Raw frames of the first split; the other cells shown here re-read the
# files through parse_file rather than using these two variables.
df_test, df_train = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../test1.csv', '../train1.csv')
)

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [13]:
def _matching_share(test_obj, train_objs, part):
    """Fraction of rows in ``train_objs`` that agree with ``test_obj`` in
    more than ``int(n_features * part)`` positions.

    Returns 0.0 when ``train_objs`` has no rows, so a class that is absent
    from the training split gets zero support instead of a division by zero.
    """
    if len(train_objs) == 0:
        return 0.0
    threshold = int(train_objs.shape[1] * part)
    # Row-wise count of positions whose value equals the test object's.
    agreements = np.sum(train_objs == test_obj, axis=1)
    return np.sum(agreements > threshold) / float(len(train_objs))


def pred_data(i, part):
    """Classify the ``i``-th test split by similarity voting and print metrics.

    Reads ``'../train<i>.csv'`` and ``'../test<i>.csv'`` through
    ``parse_file``. For each test row, computes the share of positive
    (label 1) and of negative (label 0) training rows that match it in more
    than ``int(n_features * part)`` encoded positions. The row is predicted
    1 only when the positive share is strictly larger; ties go to 0.

    Prints accuracy, precision and recall of the predictions.

    Parameters
    ----------
    i : int
        Index of the split, used to build the file names.
    part : float
        Fraction of the encoded feature count that the number of matching
        positions must exceed for a training row to count as a match.

    Raises
    ------
    ValueError
        If the encoded train and test matrices have different widths.
    """
    X_train, y_train = parse_file('../train' + str(i) + '.csv')
    X_test, y_test = parse_file('../test' + str(i) + '.csv')

    # Each file is one-hot encoded independently, so a category missing from
    # one split yields matrices of different width; element-wise comparison
    # would then be meaningless. (Equal widths do not by themselves guarantee
    # identical column meaning.)
    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError(
            "train/test feature count mismatch for dataset {}: {} vs {}".format(
                i, X_train.shape[1], X_test.shape[1]))

    X_train_pos = X_train[y_train == 1]
    X_train_neg = X_train[y_train == 0]

    y_pred = np.array([
        1 if (_matching_share(test_obj, X_train_pos, part)
              > _matching_share(test_obj, X_train_neg, part)) else 0
        for test_obj in X_test
    ])

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Dataset {}".format(i))
    print("Accuracy: {}\nPrecision: {}\nRecall: {}".format(acc, prec, rec))
    print("===========")

In [14]:
# Evaluate splits 1..10 with a match threshold of half the encoded features.
for i in range(10):
    pred_data(i + 1, 0.5)

Dataset 1
Accuracy: 0.47311827957
Precision: 0.666666666667
Recall: 0.393442622951
Dataset 2
Accuracy: 0.51724137931
Precision: 0.595744680851
Recall: 0.549019607843
Dataset 3
Accuracy: 0.61
Precision: 0.770833333333
Recall: 0.569230769231
Dataset 4
Accuracy: 0.516853932584
Precision: 0.690476190476
Recall: 0.491525423729
Dataset 5
Accuracy: 0.52808988764
Precision: 0.717391304348
Recall: 0.532258064516
Dataset 6
Accuracy: 0.6
Precision: 0.75
Recall: 0.589285714286
Dataset 7
Accuracy: 0.570175438596
Precision: 0.677966101695
Recall: 0.571428571429
Dataset 8
Accuracy: 0.588785046729
Precision: 0.737704918033
Recall: 0.616438356164
Dataset 9
Accuracy: 0.660194174757
Precision: 0.786885245902
Recall: 0.685714285714
Dataset 10
Accuracy: 0.516483516484
Precision: 0.659574468085
Recall: 0.525423728814


In [15]:
# Evaluate splits 1..10 with part = 0.4.
for i in range(10):
    pred_data(i + 1, 0.4)

Dataset 1
Accuracy: 0.881720430108
Precision: 1.0
Recall: 0.819672131148
Dataset 2
Accuracy: 0.885057471264
Precision: 0.918367346939
Recall: 0.882352941176
Dataset 3
Accuracy: 0.91
Precision: 0.98275862069
Recall: 0.876923076923
Dataset 4
Accuracy: 0.876404494382
Precision: 0.961538461538
Recall: 0.847457627119
Dataset 5
Accuracy: 0.932584269663
Precision: 1.0
Recall: 0.903225806452
Dataset 6
Accuracy: 0.870588235294
Precision: 0.941176470588
Recall: 0.857142857143
Dataset 7
Accuracy: 0.850877192982
Precision: 0.964912280702
Recall: 0.785714285714
Dataset 8
Accuracy: 0.88785046729
Precision: 0.955223880597
Recall: 0.876712328767
Dataset 9
Accuracy: 0.854368932039
Precision: 0.936507936508
Recall: 0.842857142857
Dataset 10
Accuracy: 0.89010989011
Precision: 0.980392156863
Recall: 0.847457627119


In [16]:
# Evaluate splits 1..10 with part = 0.3.
for i in range(10):
    pred_data(i + 1, 0.3)

Dataset 1
Accuracy: 0.344086021505
Precision: 0.0
Recall: 0.0
Dataset 2
Accuracy: 0.413793103448
Precision: 0.0
Recall: 0.0
Dataset 3
Accuracy: 0.35
Precision: 0.0
Recall: 0.0
Dataset 4
Accuracy: 0.337078651685
Precision: 0.0
Recall: 0.0
Dataset 5
Accuracy: 0.303370786517
Precision: 0.0
Recall: 0.0
Dataset 6
Accuracy: 0.341176470588
Precision: 0.0
Recall: 0.0
Dataset 7
Accuracy: 0.385964912281
Precision: 0.0
Recall: 0.0
Dataset 8
Accuracy: 0.317757009346
Precision: 0.0
Recall: 0.0
Dataset 9
Accuracy: 0.320388349515
Precision: 0.0
Recall: 0.0
Dataset 10
Accuracy: 0.351648351648
Precision: 0.0
Recall: 0.0


  'precision', 'predicted', average, warn_for)


In [17]:
# Evaluate splits 1..10 with part = 0.6.
for i in range(10):
    pred_data(i + 1, 0.6)

Dataset 1
Accuracy: 0.451612903226
Precision: 0.631578947368
Recall: 0.393442622951
Dataset 2
Accuracy: 0.48275862069
Precision: 0.5625
Recall: 0.529411764706
Dataset 3
Accuracy: 0.54
Precision: 0.702127659574
Recall: 0.507692307692
Dataset 4
Accuracy: 0.483146067416
Precision: 0.658536585366
Recall: 0.457627118644
Dataset 5
Accuracy: 0.494382022472
Precision: 0.68085106383
Recall: 0.516129032258
Dataset 6
Accuracy: 0.552941176471
Precision: 0.714285714286
Recall: 0.535714285714
Dataset 7
Accuracy: 0.526315789474
Precision: 0.637931034483
Recall: 0.528571428571
Dataset 8
Accuracy: 0.551401869159
Precision: 0.71186440678
Recall: 0.575342465753
Dataset 9
Accuracy: 0.572815533981
Precision: 0.709677419355
Recall: 0.628571428571
Dataset 10
Accuracy: 0.417582417582
Precision: 0.568181818182
Recall: 0.423728813559
