In [237]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re 

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [276]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [239]:
columnes = []
hexidecimal_list = "0123456789ABCDEF"
for i in hexidecimal_list:
     for j in hexidecimal_list:
            columnes.append(i+j)

In [259]:
X_train = pd.read_csv("./data/df_train.csv")
X_test_real = pd.read_csv("./data/df_test.csv")

In [260]:
X_train_split = pd.DataFrame(np.swapaxes(np.concatenate([np.array(x)[:,None] for x in X_train["1-gram_features"].apply(eval)], axis=1),0,1), columns = columnes)
y_train = X_train["Labels"]

In [261]:
X_test_split = pd.DataFrame(np.swapaxes(np.concatenate([np.array(x)[:,None] for x in X_test_real["1-gram_features"].apply(eval)], axis=1),0,1), columns = columnes)

In [262]:
y_train.shape

(8147,)

In [263]:
X_train, X_test, y_train, y_test = train_test_split(X_train_split, y_train, test_size=0.05, random_state=0)

In [264]:
y_train.unique()

array([3, 1, 2, 4, 9, 5, 8, 6, 7])

In [265]:
class NaiveBayesClassifier():
    '''
    Bayes Theorem form
    P(y|X) = P(X|y) * P(y) / P(X)
    '''
    def calc_prior(self, features, target):
        '''
        prior probability P(y)
        calculate prior probabilities
        '''
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
        '''
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        '''
        calculate probability from gaussian density function (normally distributed)
        we will assume that probability of specific target value given specific class is normally distributed 
        
        probability density function derived from wikipedia:
        (1/√2pi*σ) * exp((-1/2)*((x-μ)^2)/(2*σ²)), where μ is mean, σ² is variance, σ is quare root of variance (standard deviation)
        '''
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
#         numerator = np.exp(-((x-mean)**2 / (2 * var)))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for i in range(self.count):
            prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
            posterior = prior + conditional
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

    def visualize(self, y_true, y_pred, target):
        
        tr = pd.DataFrame(data=y_true, columns=[target])
        pr = pd.DataFrame(data=y_pred, columns=[target])
        
        
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))
        
        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)
        

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()


In [266]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7739, 256) (7739,)
(408, 256) (408,)


### Random Forest

In [267]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [268]:
# Use the forest's predict method on the test data
predictions = clf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 2.39 degrees.


In [269]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 64.7 %.


In [271]:
predictions_test = clf.predict(X_test_split)

In [274]:
predictions_test

array([1, 1, 2, ..., 1, 2, 1])

In [275]:
with open("y_test.txt", "w") as txt_file:
    for line in predictions_test:
        txt_file.write(str(line) + "\n") # works with any number of elements in a line

#### GDBT

In [None]:
gdbt = GradientBoostingClassifier(n_estimators=500, learning_rate=1.0,
    max_depth=2, random_state=0).fit(X_train, y_train)
gdbt.score(X_test, y_test)

In [278]:
predictions_test = gdbt.predict(X_test_split)

In [279]:
with open("y_test.txt", "w") as txt_file:
    for line in predictions_test:
        txt_file.write(str(line) + "\n") # works with any number of elements in a line

#### Naive Bayes

In [216]:
# train the model
x = NaiveBayesClassifier()
x.fit(X_train, y_train)

In [217]:
x.classes, x.feature_nums, x.rows, x.count

(array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 256, 303, 9)

In [228]:
predictions = x.predict(X_train)



In [229]:
x.accuracy(y_train, predictions)

0.006600660066006601

In [230]:
x.calc_prior(X_train, y_train)

array([0.12871287, 0.23762376, 0.28712871, 0.04620462, 0.00660066,
       0.05610561, 0.04290429, 0.0990099 , 0.09570957])

In [231]:
x.prior

array([0.12871287, 0.23762376, 0.28712871, 0.04620462, 0.00660066,
       0.05610561, 0.04290429, 0.0990099 , 0.09570957])

In [232]:
x.calc_statistics(X_train, y_train)

(array([[7.22156667e+04, 4.38784615e+03, 2.23441026e+03, ...,
         1.13302564e+03, 1.54164103e+03, 1.37507949e+04],
        [6.92538056e+04, 2.01003889e+04, 3.04675694e+04, ...,
         5.80527778e+03, 6.99333333e+03, 4.01564722e+04],
        [1.00953333e+04, 5.08804598e+03, 3.30741379e+03, ...,
         3.26737931e+03, 3.27252874e+03, 6.22340230e+03],
        ...,
        [5.40623077e+03, 4.31153846e+02, 5.67846154e+02, ...,
         2.48461538e+01, 2.80384615e+02, 8.04307692e+02],
        [2.49716667e+04, 8.96300000e+02, 4.26033333e+02, ...,
         4.09566667e+02, 4.63200000e+02, 4.14893333e+03],
        [1.35016207e+05, 4.79910345e+03, 2.30617241e+03, ...,
         1.72975862e+03, 2.27162069e+03, 1.83510690e+04]]),
 array([[5.36561999e+09, 3.31901597e+07, 6.09683470e+06, ...,
         2.02274044e+06, 3.26849808e+06, 1.80390746e+08],
        [7.63959999e+09, 4.57229238e+08, 5.32137460e+09, ...,
         7.13801611e+08, 1.75549547e+07, 4.08382200e+08],
        [4.02980739e+06, 

In [233]:
x.mean

array([[7.22156667e+04, 4.38784615e+03, 2.23441026e+03, ...,
        1.13302564e+03, 1.54164103e+03, 1.37507949e+04],
       [6.92538056e+04, 2.01003889e+04, 3.04675694e+04, ...,
        5.80527778e+03, 6.99333333e+03, 4.01564722e+04],
       [1.00953333e+04, 5.08804598e+03, 3.30741379e+03, ...,
        3.26737931e+03, 3.27252874e+03, 6.22340230e+03],
       ...,
       [5.40623077e+03, 4.31153846e+02, 5.67846154e+02, ...,
        2.48461538e+01, 2.80384615e+02, 8.04307692e+02],
       [2.49716667e+04, 8.96300000e+02, 4.26033333e+02, ...,
        4.09566667e+02, 4.63200000e+02, 4.14893333e+03],
       [1.35016207e+05, 4.79910345e+03, 2.30617241e+03, ...,
        1.72975862e+03, 2.27162069e+03, 1.83510690e+04]])

In [234]:
x.var

array([[5.36561999e+09, 3.31901597e+07, 6.09683470e+06, ...,
        2.02274044e+06, 3.26849808e+06, 1.80390746e+08],
       [7.63959999e+09, 4.57229238e+08, 5.32137460e+09, ...,
        7.13801611e+08, 1.75549547e+07, 4.08382200e+08],
       [4.02980739e+06, 1.20944283e+06, 2.91391047e+05, ...,
        2.86756994e+05, 3.20104433e+05, 2.20477902e+06],
       ...,
       [6.62724793e+05, 1.53012644e+06, 3.39024951e+06, ...,
        9.47976331e+02, 7.61923621e+05, 1.18009598e+05],
       [2.78957761e+08, 5.37312143e+05, 2.48281366e+05, ...,
        9.93170456e+04, 2.37012093e+05, 2.26019349e+07],
       [2.16169904e+10, 1.91946911e+07, 4.92950828e+06, ...,
        2.22048956e+06, 4.90218527e+06, 3.97806139e+08]])

In [235]:
X_train

Unnamed: 0,00,01,02,03,04,05,06,07,08,09,...,F6,F7,F8,F9,FA,FB,FC,FD,FE,FF
8,10581,5543,3297,3204,3224,3303,3158,3355,3295,3256,...,3152,3240,3252,3223,3142,3132,3180,3209,3193,6728
190,19864,1329,895,880,986,535,442,617,2150,675,...,331,10242,1342,234,973,935,644,498,1503,7408
303,6798,3034,3118,3172,3060,3085,3196,3061,3098,3020,...,3108,3194,3142,3089,3218,3114,3021,3088,3156,3588
237,5368,72,27,26,35,28,9,8,17,8,...,20,6,6,6,4,7,7,14,28,1574
329,74323,11945,5057,6467,8194,1972,2011,2137,4668,1250,...,1403,1481,2275,2867,1495,1080,1531,2593,2283,12050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,19621,714,292,441,576,384,262,223,362,222,...,477,824,951,344,236,273,632,230,231,2193
192,36571,1341,743,659,1652,1061,645,811,1228,446,...,739,406,855,172,687,491,514,327,784,6851
117,5762,61,20,36,30,24,6,3,108,4,...,6,20,10,10,6,7,11,7,28,630
47,3835,771,868,722,659,657,688,636,696,660,...,696,674,664,665,640,646,690,614,674,1028


In [236]:
y_train

8      3
190    2
303    3
237    7
329    1
      ..
323    8
192    9
117    7
47     6
172    3
Name: Labels, Length: 303, dtype: int64