In [None]:
#Import libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

#Preprocessing
from sklearn.preprocessing import StandardScaler

#Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#Model Validations
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# FEATURE SELECTION #

Input Parameter:
(1) train_data: Training samples (one column per feature)

Output:
(1) selected_columns: Column labels of the features to keep for both the training and testing samples

In [None]:
def corr_feature_selection(train_data):
    """Drop the later feature of every highly correlated feature pair.

    Ref: https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training samples; the pairwise correlation is computed between its
        columns with ``DataFrame.corr()``.

    Returns
    -------
    pandas.Index
        Labels of the columns that are kept. A column is discarded when its
        correlation with any column positioned before it is >= 0.9.
    """
    corr_values = train_data.corr().values

    # Only the strict upper triangle matters: entry (i, j) with i < j marks
    # column j as redundant with respect to the earlier column i.
    redundant_pairs = np.triu(corr_values >= 0.9, k=1)
    keep_mask = ~redundant_pairs.any(axis=0)

    return train_data.columns[keep_mask]

# CLASSIFIER #

Input Parameters:
(1) xtrain = Training samples;
(2) ytrain = Training labels;
(3) xtest = Testing samples;
(4) k = number of nearest neighbors to use (KNN only), or n_trees = number of decision trees (RF only);
(5) num_k_fold = number of folds for the k-fold cross-validation

Output:
(1) scores = ROC-AUC score of each cross-validation fold;
(2) predicted class probabilities for the testing samples

In [None]:
def knn_implement(xtrain, ytrain, xtest, k, num_k_fold):
    """Cross-validate a K-nearest-neighbors classifier and predict on the test set.

    Parameters
    ----------
    xtrain : training samples
    ytrain : training labels
    xtest : testing samples
    k : number of neighbors passed to ``KNeighborsClassifier``
    num_k_fold : value passed as ``cv`` to ``cross_val_score``

    Returns
    -------
    tuple
        ``(cv_scores, test_probs)``: the ROC-AUC cross-validation scores and
        the output of ``predict_proba`` on ``xtest``.
    """
    knn_model = KNeighborsClassifier(n_neighbors=k)

    # Cross-validated ROC-AUC on the training data.
    cv_scores = cross_val_score(
        knn_model, xtrain, ytrain, scoring='roc_auc', cv=num_k_fold
    )

    # Train on the whole training set, then score the test samples.
    knn_model.fit(xtrain, ytrain)
    test_probs = knn_model.predict_proba(xtest)

    return cv_scores, test_probs

def rf_implement(xtrain, ytrain, xtest, n_trees, num_k_fold, random_state=None):
    """Cross-validate a Random Forest classifier and predict on the test set.

    Parameters
    ----------
    xtrain : training samples
    ytrain : training labels
    xtest : testing samples
    n_trees : number of decision trees (``n_estimators``)
    num_k_fold : value passed as ``cv`` to ``cross_val_score``
    random_state : optional seed forwarded to ``RandomForestClassifier``;
        the default ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    tuple
        ``(scores, results_RF)``: the ROC-AUC cross-validation scores and the
        output of ``predict_proba`` on ``xtest``.
    """
    clf_RF = RandomForestClassifier(n_estimators=n_trees,
                                    bootstrap=True,
                                    max_features='sqrt',
                                    random_state=random_state)

    # Cross-validated ROC-AUC on the training data.
    scores = cross_val_score(clf_RF, xtrain, ytrain, cv=num_k_fold, scoring='roc_auc')

    # Fit on the training data passed in by the caller. The previous code
    # fitted on an undefined `X` and the notebook-global `train_label`.
    clf_RF.fit(xtrain, ytrain)
    results_RF = clf_RF.predict_proba(xtest)

    return scores, results_RF

# OTHER FUNCTIONS #

In [None]:
def convert_rt_mat(xnum, ynum, results):
    """Turn per-class probability results into a (sample x class) matrix.

    Uses the same approach as the notebook provided in the competition.

    Parameters
    ----------
    xnum : number of rows of the output matrix
    ynum : number of columns of the output matrix
    results : sequence with one entry per output column; each entry is itself
        a sequence with one item per output row, and element ``[1]`` of that
        item is the value stored in the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(xnum, ynum)`` where ``mat[row, col]`` equals
        ``results[col][row][1]``; cells not covered by ``results`` stay 0.
    """
    mat = np.zeros(shape=(xnum, ynum))

    for col, per_class in enumerate(results):
        for row, prob_pair in enumerate(per_class):
            # Keep only the second element of each pair.
            mat[row, col] = prob_pair[1]

    return mat

# Main Code #

In [None]:
#### Load datasets #####

# Training data (the first CSV column is used as the row index)
train_data = pd.read_csv("X_train.csv", index_col=0)
train_label = pd.read_csv("y_train.csv", index_col=0)

# Testing data
X_test = pd.read_csv("X_test.csv", index_col=0)

# Data preprocessing
# Standardize the training and testing data sets. The scaler is fitted on the
# training data only and then applied unchanged to the test data, so both sets
# are scaled with the same (training) statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data)
X_test = scaler.transform(X_test)

# Total number of training samples (rows) and features (columns)
train_samp = train_data.shape[0]
train_feat = train_data.shape[1]

In [None]:
# k for KNN is the square root of the number of training samples.
# `train_samp` is used here because `new_samp_count` only exists inside the
# balancing helpers and is undefined at notebook scope.
k = round(math.sqrt(train_samp))
#print("The k for K-Nearest-neighbor: ", k)
k_fold = 5
trees = 100

# Implement a KNN classifier
scores_knn, results_knn = knn_implement(X_train, train_label, X_test, k, k_fold)
print(scores_knn)

# Implement Random Forest model with 100 trees
scores_RF, results_RF = rf_implement(X_train, train_label, X_test, trees, k_fold)
print(scores_RF)

In [None]:
# Output matrix dimensions: one row per test sample and one column per label
# column of the training labels (instead of a hard-coded 200).
xnum = X_test.shape[0]
ynum = train_label.shape[1]

mat_knn = convert_rt_mat(xnum, ynum, results_knn)
mat_RF = convert_rt_mat(xnum, ynum, results_RF)

# Ensemble: average the KNN and Random Forest probabilities
mat = (mat_knn + mat_RF)/2 

# Obtained from the Kaggle notebook.
# Extract the sample and class names from the test submission
y_test_sample = pd.read_csv("y_test_sample.csv", index_col=0)

# Build the dataframe with proper index and column names
df_results = pd.DataFrame(data=mat, index=y_test_sample.index, columns=y_test_sample.columns)

# Save to a file for submission
df_results.to_csv("ind_knn_results.csv")

# FUTURE WORK RELATED #

In [None]:
def sample_data_distribution(train_label):
    """Visualize how unbalanced the training labels are for each class.

    For each GO term (i.e. train label/class) the number of samples labelled 1
    (column sum) and the number of remaining samples are computed and drawn as
    two line plots.

    Parameters
    ----------
    train_label : pandas.DataFrame
        Training labels, one column per class.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    label1_count = train_label.sum()
    sample_size = train_label.shape[0]
    label0_count = sample_size - label1_count

    # Plot the classes at positions 1..99.
    # NOTE(review): the window starts at 1, so the first class is left out -
    # kept from the original code; confirm this is intended.
    label1_window = label1_count.tolist()[1:100]
    label0_window = label0_count.tolist()[1:100]

    # Size the x axis from the data so fewer than 100 classes still plot.
    num_list = list(range(len(label1_window)))

    plt.plot(num_list, label1_window, label='Train Label = 1')
    plt.plot(num_list, label0_window, label='Train Label = 0')
    plt.xlabel("Gene Annotation (Class)")
    plt.ylabel("Number of training sample for each GO class")
    plt.title('Unbalanced Training Dataset')

    plt.legend()
    plt.show()

In [None]:
def balance_train_data(y_train, GO_term, total_samp, random_state=None):
    """Balance the labels of one GO term by down-sampling the larger class.

    The class (label 0 or label 1) with fewer samples is kept entirely and the
    same number of samples is drawn at random from the other class.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Training labels, one column per GO term.
    GO_term : column label of the term to balance.
    total_samp : int
        Total number of training samples; the number of label-0 samples is
        taken as ``total_samp`` minus the number of label-1 samples.
    random_state : optional seed forwarded to ``Series.sample``; the default
        ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    tuple
        ``(bal_train_label, new_samp_count)``: a Series holding the balanced
        labels (label-0 rows first, then label-1 rows, original index kept)
        and the number of samples in the balanced set (twice the smaller
        class size).
    """
    # Read the labels from the `y_train` argument, not a notebook global.
    term_labels = y_train[GO_term]
    positives = term_labels[term_labels == 1]
    negatives = term_labels[term_labels == 0]

    total_1 = int((term_labels == 1).sum())
    total_0 = total_samp - total_1

    # The balanced set holds the smaller class plus an equally sized sample of
    # the larger one, so its size is twice the smaller count.
    minority_count = min(total_1, total_0)
    new_samp_count = minority_count * 2

    if total_1 > total_0:
        positives = positives.sample(n=minority_count, random_state=random_state)
    else:
        negatives = negatives.sample(n=minority_count, random_state=random_state)

    bal_train_label = pd.concat([negatives, positives])

    return bal_train_label, new_samp_count

In [None]:
def ind_knn_implement(xtrain, ytrain, xtest):
    """Train one KNN classifier per GO term on a class-balanced training subset.

    Parameters
    ----------
    xtrain : pandas.DataFrame or array-like
        Training samples. A DataFrame is selected by index label; any other
        array-like is selected by row position and is assumed to have its rows
        in the same order as ``ytrain``.
    ytrain : pandas.DataFrame
        Training labels, one column per GO term.
    xtest : testing samples (must expose ``shape``).

    Returns
    -------
    tuple
        ``(all_scores, mat)``: the list of cross-validation scores (one entry
        per GO term) and an array of shape ``(n_test_samples, n_GO_terms)``
        holding element ``[1]`` of each ``predict_proba`` row.
    """
    # One classifier per label column of the `ytrain` argument.
    all_GO_terms = list(ytrain.columns)

    # store prediction results
    mat = np.zeros(shape=(xtest.shape[0], len(all_GO_terms)))
    # all five fold cross-validation scores for each individual class
    all_scores = []
    # total number of training samples
    train_samp = ytrain.shape[0]

    for go_ix, GO_term in enumerate(all_GO_terms):

        bal_train_label, _ = balance_train_data(ytrain, GO_term, train_samp)

        # Pick the training rows that belong to the balanced label set. The
        # `xtrain` argument itself is left untouched so every GO term starts
        # from the full training data.
        if hasattr(xtrain, "loc"):
            xtrain_bal = xtrain.loc[bal_train_label.index]
        else:
            row_positions = ytrain.index.get_indexer(bal_train_label.index)
            xtrain_bal = np.asarray(xtrain)[row_positions]

        # k = sqrt(sample size), computed from the rows actually handed to the
        # classifier; at least 1 neighbor is required.
        k = max(1, round(math.sqrt(len(bal_train_label))))

        scores, results = knn_implement(xtrain_bal, bal_train_label, xtest, k, 5)
        all_scores.append(scores)

        # Column 1 of predict_proba is stored as this term's prediction.
        mat[:, go_ix] = np.asarray(results)[:, 1]

    return all_scores, mat

In [None]:
# Per-GO-term KNN on balanced data: collect the cross-validation scores and
# the prediction matrix for the test samples.
all_scores, mat = ind_knn_implement(
    xtrain=X_train,
    ytrain=train_label,
    xtest=X_test,
)