In [1]:
import numpy as np
import pandas as pd
import time
from itertools import combinations
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,NuSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
import sys
#import tensorflow as tf
#from ELM.elm import ELM
import os
import argparse
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Check if the data is eligible
def If_Elig(data):
    return (data>=5).astype(int)

# Find the position of the data
def find_pos(data,value, order):
    max_num = np.count_nonzero(data[:,0]==value)
    if order > max_num:
        return -1
    else:
        return np.nonzero(data[:,0]==value)[0][order-1]

# Model Prediction
def model_predict(model,x):
    return model.predict(x)

# Hand Written Soft Voting
def Vote(Classifier,Prediction,Score):
    leng = len(Classifier)
    sum_score = sum(Score)
    Weighted = 0
    for i in range(leng):
        Weighted += Prediction[i]*Score[i]/sum_score
    return Weighted

In [3]:
# The following def are packages training, which can be searched on sklearn

def KNN(n,x,y):
    knn = KNeighborsClassifier(n_neighbors=n, n_jobs=-1)
    knn.fit(x,y)
    return knn


def SVM(x,y):
    svc = SVC()
    svc.fit(x,y)
    return svc


def NuSVM(x,y):
    nus = [_ / 10 for _ in range(1, 11, 1)]
    for nu in nus:
        nusvc = NuSVC(nu=nu)
        try:
            nusvc.fit(x,y)
            return nusvc
        except ValueError as e:
            print("nu {} not feasible".format(nu))

def Bayesian(x,y):
    bayesian = GaussianNB()
    bayesian.fit(x,y)
    return bayesian


def DecisionTree(x,y,criterion='gini',max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None):
    Tree = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth,min_samples_split= \
                                               min_samples_split,min_samples_leaf=min_samples_leaf,
                                               max_features=max_features)
    Tree.fit(x,y)
    return Tree


def RandomForest(x,y,n_estimator=10,criterion='gini'):
    Forest = RandomForestClassifier(n_estimators=n_estimator,criterion=criterion)
    Forest.fit(x,y)
    return Forest


def LinearC(x,y):
    linear = SGDClassifier()
    linear.fit(x,y)
    return linear


def MLP(x,y,hidden_layer_size=(100,),activation='relu',solver='adam',learning_rate_init=0.01, learning_rate='adaptive'):
    if solver=='adam':
        MLP = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation=activation, solver= \
                                               solver, learning_rate_init=learning_rate_init, learning_rate=learning_rate)
    elif solver=='sgd':
        MLP = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation=activation, solver= \
            solver, learning_rate_init=learning_rate_init)
    else:
        MLP = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation=activation, solver= \
            solver)
    MLP.fit(x,y)
    return MLP

In [4]:
# Read Data
np.set_printoptions(threshold=sys.maxsize)
cleaned_filepath = 'ConcernLv_RF.csv'
riskfactor_filepath = 'Questions_ReconstructedFactor.csv'
df = pd.read_csv(cleaned_filepath)
df_rf = pd.read_csv(riskfactor_filepath)

In [5]:
dfr = df.loc[:, 'UniqueID':'Section_t']
dft = df_rf.loc[:,"ReconstructedFactor"]
d = dfr.convert_objects(convert_numeric=True)
dt = dft.convert_objects(convert_numeric=True)
xr = d.values
y = dt.values
x = xr[~np.isnan(xr).any(axis=1)]
y = y[~np.isnan(xr).any(axis=1)]

# make new dataframe with the same time period
x_revise = np.empty((0,8))
y_revise = np.empty((0,))

for i in range(x.shape[0]):
    if i == 0 and x[i,0] == x[i+1,0]:
        x_revise = np.vstack([x_revise,[x[i,0:8]]])
        y_revise = np.append(y_revise,y[i])
    elif i == x.shape[0]-1 and x[i,0] == x[i-1,0]:
        x_revise = np.vstack([x_revise,[x[i,0:8]]])
        y_revise = np.append(y_revise,y[i])
    elif i == x.shape[0]-1:
        continue
    elif x[i,0]==x[i+1,0] or x[i,0]==x[i-1,0]:
        x_revise = np.vstack([x_revise,[x[i,0:8]]])
        y_revise = np.append(y_revise,y[i])

In [14]:
y_revise.shape

(18077,)

In [15]:
real_x = np.empty((0,x_revise.shape[1]-1))
real_y = np.empty((0,))
for j in range(1,4):
    for i in range(1000,int(max(x_revise[:,0]))):
        pos1 = find_pos(x_revise,i,j)
        pos2 = find_pos(x_revise,i,j+1)
        if pos1 != -1 and pos2 != -1:
            real_x = np.vstack([real_x,[x_revise[pos1,1:]]])
            real_y = np.append(real_y,y_revise[pos2])

In [13]:
unique, counts = np.unique(y_revise, return_counts=True)
print(unique,counts)

[0. 1. 2. 3. 4. 5. 6. 7. 8.] [4179 2852 1942 1772 1953 2113 1583 1032  651]


In [9]:
# Balance the dataset
for i in range(9):
    print("Number of Instance %d is %d" % (i,np.count_nonzero(real_y==i)))
zero = 0
for i in range(real_y.shape[0]):
    if real_y[i] == 0 and zero<1000:
        real_x = np.vstack([real_x,[real_x[i,:]]])
        real_y = np.append(real_y,real_y[i])
        zero += 1
    if real_y[i] == 1:
        real_x = np.vstack([real_x,[real_x[i,:]]])
        real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 2:
        for _ in range(2):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 3:
        for _ in range(3):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 4 :
        for _ in range(3):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 5:
        for _ in range(4):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 6:
        for _ in range(5):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 7:
        for _ in range(9):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
    elif real_y[i] == 8:
        for _ in range(18):
            real_x = np.vstack([real_x,[real_x[i,:]]])
            real_y = np.append(real_y,real_y[i])
for i in range(9):
    print("Number of Instance %d is %d" % (i,np.count_nonzero(real_y==i)))

Number of Instance 0 is 3076
Number of Instance 1 is 1959
Number of Instance 2 is 1244
Number of Instance 3 is 1017
Number of Instance 4 is 917
Number of Instance 5 is 861
Number of Instance 6 is 622
Number of Instance 7 is 370
Number of Instance 8 is 197
Number of Instance 9 is 0
Number of Instance 0 is 4076
Number of Instance 1 is 3918
Number of Instance 2 is 3732
Number of Instance 3 is 4068
Number of Instance 4 is 3668
Number of Instance 5 is 4305
Number of Instance 6 is 3732
Number of Instance 7 is 3700
Number of Instance 8 is 3743
Number of Instance 9 is 0


In [10]:
# Train Test Split
x_train,x_test,y_train,y_test = train_test_split(real_x,real_y,test_size=0.2)
model1 = RandomForestClassifier()
model1.fit(x_train,y_train)
model2 = RandomForestClassifier(criterion='entropy')
model2.fit(x_train,y_train)
model3 = RandomForestClassifier(bootstrap=False)
model3.fit(x_train,y_train)
bag = VotingClassifier(estimators=[('RF1',model1),('RF2',model2),('RF3',model3)],voting='soft')
boost = AdaBoostClassifier(base_estimator=model1,n_estimators=5)
bag.fit(x_train,y_train)
boost.fit(x_train,y_train)
y_pred1 = model1.predict(x_test)
y_pred2 = bag.predict(x_test)
y_pred3 = boost.predict(x_test)

In [11]:
# Prediction
print(classification_report(y_pred1,y_test))
print(classification_report(y_pred2,y_test))
print(classification_report(y_pred3,y_test))

             precision    recall  f1-score   support

        0.0       0.55      0.37      0.44      1258
        1.0       0.39      0.48      0.43       658
        2.0       0.55      0.63      0.59       660
        3.0       0.58      0.71      0.64       658
        4.0       0.62      0.69      0.65       669
        5.0       0.70      0.71      0.71       824
        6.0       0.71      0.76      0.74       692
        7.0       0.88      0.81      0.84       770
        8.0       0.96      0.88      0.92       800

avg / total       0.66      0.65      0.65      6989

             precision    recall  f1-score   support

        0.0       0.52      0.38      0.44      1140
        1.0       0.42      0.45      0.43       769
        2.0       0.55      0.65      0.59       648
        3.0       0.58      0.72      0.64       652
        4.0       0.62      0.70      0.66       656
        5.0       0.72      0.71      0.71       837
        6.0       0.72      0.75      0.74