In [21]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit

from sklearn.metrics import classification_report, accuracy_score

import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [None]:
# when reading from Hacker Rank

# import fileinput 

# temp = []  
    
# for f in fileinput.input(): 
#     temp.append(f)
    
# df = pd.DataFrame(temp)
# df[0] = df[0].str.replace('\n', '')

In [2]:
# when reading locally
df = pd.read_csv('input00.txt', header= None)
print(df.head())

                                                   0
0                                            4500 23
1  Nt8FJ +1 1:12087620705283 2:4.797982 3:1 4:4.9...
2  VCaTF +1 1:282114466020 2:3.151926 3:1 4:3.737...
3  gParY +1 1:173284955 2:1.785813 3:1 4:1.791759...
4  DtWDw +1 1:4708728355523 2:2.394989 3:1 4:3.09...


In [3]:
first_line = df[0][0]
feature_num = int(first_line.split(' ')[1])
train_cnt = int(first_line.split(' ')[0])
input_cnt = df[0][train_cnt+1]

In [4]:
df_train_validation = pd.DataFrame(df[1:train_cnt+1][0].str.split(" ").tolist())

df_train, df_validation = train_test_split(df_train_validation, test_size=.2)

In [5]:
def process_training_validation(data):
    y = data[1].astype(float)
    y = y.replace(-1, 0)
    X = data.drop(data.columns[[0, 1]], axis=1)
    X = X.T.reset_index(drop=True).T
    
    for i in range(feature_num):
        X[i] = X[i].str.split(":", expand = True)[1]
        
    return X, y

In [6]:
X_train, y_train = process_training_validation(df_train)
X_validation, y_validation = process_training_validation(df_validation)

In [7]:
df_input = pd.DataFrame(df[train_cnt+2:][0].str.split(" ").tolist())

input_features = df_input.drop(df_input.columns[0], axis=1)

input_features = input_features.T.reset_index(drop=True).T


for i in range(feature_num):
    input_features[i] = input_features[i].str.split(":", expand = True)[1]


In [8]:
def evaluate_model(method):
    model = method
    model.fit(X_train, y_train)
    y_validation_pred = model.predict(X_validation)
    print (classification_report(y_validation, y_validation_pred))
    print ("The accuracy score is {:.2%}".format(accuracy_score(y_validation, y_validation_pred)))
    return model

In [9]:
model_LogisticRegression = evaluate_model(LogisticRegression())

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       439
         1.0       0.51      1.00      0.68       461

   micro avg       0.51      0.51      0.51       900
   macro avg       0.26      0.50      0.34       900
weighted avg       0.26      0.51      0.35       900

The accuracy score is 51.22%


In [10]:
model_RandomForest = evaluate_model(RandomForestClassifier())

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.83       439
         1.0       0.85      0.82      0.83       461

   micro avg       0.83      0.83      0.83       900
   macro avg       0.83      0.83      0.83       900
weighted avg       0.83      0.83      0.83       900

The accuracy score is 83.11%


In [11]:
model_SGDC = evaluate_model(SGDClassifier())

              precision    recall  f1-score   support

         0.0       0.49      1.00      0.66       439
         1.0       0.00      0.00      0.00       461

   micro avg       0.49      0.49      0.49       900
   macro avg       0.24      0.50      0.33       900
weighted avg       0.24      0.49      0.32       900

The accuracy score is 48.78%


In [12]:
model_SVC = evaluate_model(LinearSVC(C=100))

              precision    recall  f1-score   support

         0.0       0.49      1.00      0.66       439
         1.0       0.00      0.00      0.00       461

   micro avg       0.49      0.49      0.49       900
   macro avg       0.24      0.50      0.33       900
weighted avg       0.24      0.49      0.32       900

The accuracy score is 48.78%


In [None]:
#SVC is too slow
# model_SVC = evaluate_model(SVC(kernel = 'linear', C=100))

In [13]:
model_MultinomialNB = evaluate_model(MultinomialNB())

              precision    recall  f1-score   support

         0.0       0.59      0.46      0.52       439
         1.0       0.57      0.69      0.63       461

   micro avg       0.58      0.58      0.58       900
   macro avg       0.58      0.58      0.57       900
weighted avg       0.58      0.58      0.57       900

The accuracy score is 58.00%


In [14]:
model_GaussianNB = evaluate_model(GaussianNB())

              precision    recall  f1-score   support

         0.0       0.43      0.13      0.19       439
         1.0       0.50      0.84      0.63       461

   micro avg       0.49      0.49      0.49       900
   macro avg       0.47      0.48      0.41       900
weighted avg       0.47      0.49      0.42       900

The accuracy score is 49.33%


In [16]:
model_SGDC = evaluate_model(SGDClassifier( loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=8, tol=None ))

              precision    recall  f1-score   support

         0.0       0.49      1.00      0.66       439
         1.0       0.00      0.00      0.00       461

   micro avg       0.49      0.49      0.49       900
   macro avg       0.24      0.50      0.33       900
weighted avg       0.24      0.49      0.32       900

The accuracy score is 48.78%


In [20]:
model_GradientBoosting = evaluate_model(GradientBoostingClassifier( n_estimators=200, max_depth=3, learning_rate=0.3 ) )

              precision    recall  f1-score   support

         0.0       0.83      0.84      0.84       439
         1.0       0.84      0.84      0.84       461

   micro avg       0.84      0.84      0.84       900
   macro avg       0.84      0.84      0.84       900
weighted avg       0.84      0.84      0.84       900

The accuracy score is 83.89%


In [26]:
name_list = [i[0] for i in df[train_cnt+2:][0].str.split(" ")]

prediction_list = model_RandomForest.predict(input_features)

final_prediction_list = []


for i in range(len(prediction_list)):
    if prediction_list[i] == 1:
        final_prediction_list.append(name_list[i] + " " + '+1')
#         print(name_list[i] + " " + '+1')
    else:
        final_prediction_list.append(name_list[i] + " " + '-1')
#         print(name_list[i] + " " + '-1')
    
print(final_prediction_list[:20])

['3rCWr -1', 'snInN -1', 'ibfT7 +1', 'IcbKR +1', 'SIXmF +1', 'dLCdh +1', 'ziFJ8 -1', '1WtTD -1', '9uIKh +1', 'df4Mc -1', '3nxpY -1', 'aesmq +1', 'MyTDz +1', 'TDMhx +1', 'Y0rW3 +1', 'KCcKf +1', '2cz5M -1', 'kqIJj -1', 'C1Sg2 +1', 'VAmIt -1']
