In [1]:
import numpy as np
import pandas as pd

## Getting the Data ##

In [2]:
# Load the tab-separated training file; it has no header row.
train_data = pd.read_csv('train.dat', sep='\t', header=None)

# Column 0 holds the labels and column 1 the feature text. Both slices are
# taken from the freshly loaded frame before `train_data` is rebound.
train_labels, train_data = train_data.iloc[:, 0], train_data.iloc[:, 1]

# The test file is kept as the full DataFrame returned by read_csv.
test_data = pd.read_csv('test.dat', sep='\t', header=None)

## c-mer function ##

In [3]:
def cmer(row, c):
    """Return the overlapping c-mers (length-``c`` slices) of ``row``.

    The window slides one position at a time, so a row of length ``n >= c``
    yields ``n - c + 1`` slices in left-to-right order.

    Parameters
    ----------
    row : sequence (e.g. str)
        The sequence to split; anything supporting ``len`` and slicing.
    c : int
        Number of elements per c-mer.

    Returns
    -------
    list
        The list of c-mers. When ``row`` is shorter than ``c`` the result is
        a one-element list containing ``row`` itself.
    """
    length = len(row)
    if length >= c:
        return [row[start:start + c] for start in range(length - c + 1)]
    # Too short to hold a single full window: keep the row as its only "c-mer".
    return [row]

## Function to build sparse matrix ##

In [4]:
from scipy.sparse import csr_matrix
from collections import Counter

def build_matrix(data, num):
    """Build a sparse matrix of c-mer counts, one row per sequence in ``data``.

    Every sequence is split into overlapping c-mers with ``cmer(row, num)``.
    Each distinct c-mer gets a column id in order of first appearance, and
    entry ``(i, j)`` is the number of times c-mer ``j`` occurs in sequence ``i``.

    Parameters
    ----------
    data : iterable
        The sequences to featurize (anything ``cmer`` accepts, e.g. strings).
    num : int
        Length of the c-mers, passed straight to ``cmer``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(number of sequences, number of distinct c-mers)``
        with dtype ``np.double`` and sorted column indices.
    """
    # Count whole c-mers per sequence. Indexing a c-mer with [0] would keep
    # only its first character and collapse the features to single-character
    # counts, making ``num`` nearly irrelevant.
    counts = [Counter(cmer(row, num)) for row in data]
    nrows = len(counts)

    # Assign column ids in order of first appearance across all sequences.
    dictionary = {}
    for count in counts:
        for w in count:
            if w not in dictionary:
                dictionary[w] = len(dictionary)
    ncols = len(dictionary)

    # Total stored entries = number of distinct c-mers summed over the rows.
    nnz = sum(len(count) for count in counts)

    # set up memory for the CSR triplet (column indices, values, row pointers)
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)

    # n is the running non-zero counter; ptr[i + 1] marks where row i ends.
    n = 0
    for i, count in enumerate(counts):
        for w, freq in count.items():
            ind[n] = dictionary[w]
            val[n] = freq
            n += 1
        ptr[i + 1] = n

    matrix = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # Column indices were written in per-row insertion order; put them in
    # ascending order within each row.
    matrix.sort_indices()
    return matrix

## Creating sparse matrices for training and test data ##

In [5]:
# Featurize train and test rows together so both matrices share the same
# c-mer columns. np.append flattens its inputs into one 1-D array.
# NOTE(review): this presumes test_data has a single column, so that
# flattening yields one entry per test row -- confirm against test.dat.
complete_data = np.append(train_data, test_data)

# change c value to change length of cmers
c = 3

# Build the frequency matrix once and slice it, rather than running the
# whole featurization a second time for the test rows.
complete_matrix = build_matrix(complete_data, c)

# Split at the actual number of training rows instead of a hard-coded count,
# so the split stays aligned with train_labels if the data file changes.
n_train = len(train_data)
train_matrix = complete_matrix[:n_train, :]
test_matrix = complete_matrix[n_train:, :]

## Classifying using various classifiers ##

In [6]:
# Random Forest Classifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

# classifier_1 = RandomForestClassifier()
# classifier_1.fit(train_matrix, train_labels)
# train_predict_1 = classifier_1.predict(train_matrix)
# print("Random Forest MCC:", matthews_corrcoef(train_labels, train_predict_1))

In [7]:
# # AdaBoost Classifier
# from sklearn.ensemble import AdaBoostClassifier

# classifier_2 = AdaBoostClassifier()
# classifier_2.fit(train_matrix, train_labels)
# train_predict_2 = classifier_2.predict(train_matrix)
# print("AdaBoost MCC:", matthews_corrcoef(train_labels, train_predict_2))

In [8]:
# # Complement Naive Bayes
# from sklearn.naive_bayes import ComplementNB

# classifier_3 = ComplementNB()
# classifier_3.fit(train_matrix, train_labels)
# train_predict_3 = classifier_3.predict(train_matrix)
# print("Complement Naive Bayes MCC:", matthews_corrcoef(train_labels, train_predict_3))

In [9]:
# # SVM
# from sklearn.svm import SVC

# classifier_4 = SVC(random_state = 0)
# classifier_4.fit(train_matrix, train_labels)
# train_predict_4 = classifier_4.predict(train_matrix)
# print("SVM MCC:", matthews_corrcoef(train_labels, train_predict_4))

In [10]:
# test_predictions = classifier_4.predict(test_matrix)
# test_predictions_file = open('svm_output.txt', 'w+')
# pd.Series(test_predictions).to_csv('svm_output.txt', index = False, header = None)
# test_predictions_file.close()

In [11]:
# # Extra-Trees Classifier
# from sklearn.ensemble import ExtraTreesClassifier

# classifier_5 = ExtraTreesClassifier(n_estimators = 1000, max_features = None, )
# classifier_5.fit(train_matrix, train_labels)
# train_predict_5 = classifier_5.predict(train_matrix)
# print("Extra-Trees MCC:", matthews_corrcoef(train_labels, train_predict_5))

In [12]:
# test_predictions = classifier_5.predict(test_matrix)
# test_predictions_file = open('ExtraTrees_output.txt', 'w+')
# pd.Series(test_predictions).to_csv('ExtraTrees_output.txt', index = False, header = None)
# test_predictions_file.close()

In [13]:
# # SGDClassifier
# from sklearn.linear_model import SGDClassifier

# classifier_6 = SGDClassifier()
# classifier_6.fit(train_matrix, train_labels)
# train_predict_6 = classifier_6.predict(train_matrix)
# print("SGD Classifier MCC:", matthews_corrcoef(train_labels, train_predict_6))

In [24]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Fit the gradient-boosted ensemble; `fit` returns the estimator itself, so
# construction and training are chained into one expression.
classifier_7 = GradientBoostingClassifier(
    n_estimators=10000,
    learning_rate=0.1,
    max_depth=3,
    random_state=0,
).fit(train_matrix, train_labels)

# The score below is computed on the same rows the model was fitted on, so it
# measures fit to the training set rather than performance on unseen data.
train_predict_7 = classifier_7.predict(train_matrix)
train_mcc_7 = matthews_corrcoef(train_labels, train_predict_7)
print("Gradient Boosting Classifier MCC:", train_mcc_7)

Gradient Boosting Classifier MCC: 1.0


In [25]:
# Predict a label for every test row.
test_predictions = classifier_7.predict(test_matrix)

# Write one prediction per line with no index column and no header row.
# to_csv opens and closes the path itself, so no separate open()/close() pair
# is needed; a manually opened handle would never be written through and
# would stay open if to_csv raised.
pd.Series(test_predictions).to_csv('GradientBoosting_output.txt', index=False, header=False)

In [16]:
#!pip install xgboost

In [17]:
# from xgboost import XGBClassifier
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()
# train_labels = le.fit_transform(train_labels)
# classifier_8 = XGBClassifier(n_estimators = 100, learning_rate = 0.5, max_depth = 3, random_state = 0)
# classifier_8.fit(train_matrix, train_labels)
# train_predict_8 = classifier_8.predict(train_matrix)
# print("XGBoost MCC: ", matthews_corrcoef(train_labels, train_predict_8))

In [18]:
# test_predictions = classifier_8.predict(test_matrix)
# test_predictions_file = open('xgboost_output.txt', 'w+')
# pd.Series(test_predictions).to_csv('xgboost_output.txt', index = False, header = None)
# test_predictions_file.close()

## Predicting on test data ##

In [19]:
# test_predictions = classifier_5.predict(test_matrix)

## Writing predictions to file ##

In [20]:
# test_predictions_file = open('output_8.txt', 'w+')
# pd.Series(test_predictions).to_csv('output_8.txt', index = False, header = None)
# test_predictions_file.close()

In [21]:
# output - random forest
# output2 - adaboost
# output3 - complement naive bayes
# output4 - svm
# output5 - extra trees
# output6 - SGDClassifier
# output7 - GradientBoostingClassifier
# output8 - extra trees 10000