In [88]:
# Importing modules

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob

In [89]:
# Reading training dataset

# Lift the column-display cap so wide frames render every column, then load
# the training set and show it as the cell output.
pd.options.display.max_columns = None
df = pd.read_csv('dataset/train/training-data.csv')
df

Unnamed: 0,time,f1_c,f1_a,f1_s,f1_d,f2_c,f2_a,f2_s,f2_d,prg_c,prg_a,prg_s,prg_d,prd_c,prd_a,prd_s,prd_d,pr_s,pr_d,lq_s,lq_d,cmp_a_s,cmp_a_d,cmp_b_s,cmp_b_d,cmp_c_s,cmp_c_d
0,0.0000,0,0,0,0,1851,1831,155,0,0,0,0,0,50522,51872,21637,0,53281,0,28700,0,30781,0,1186,0,33567,0
1,0.1083,0,0,0,0,1851,1831,155,0,0,0,0,0,50522,51872,21637,0,53281,0,28700,0,30781,0,1186,0,33567,0
2,0.2103,0,0,0,0,1851,1831,155,0,0,0,0,0,50522,51872,21637,0,53281,0,28700,0,30781,0,1186,0,33567,0
3,0.3121,0,0,0,0,1851,1831,155,0,0,0,0,0,50522,51872,21637,0,53281,0,28700,0,30781,0,1186,0,33567,0
4,0.4141,0,0,0,0,1851,1831,155,0,0,0,0,0,50522,51872,21637,0,53281,0,28700,0,30781,0,1186,0,33567,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9780,999.1758,53824,61269,65535,0,1896,1923,163,0,0,0,0,0,51424,49824,21149,0,55103,0,28995,0,30828,0,980,0,33725,0
9781,999.2778,53824,61269,65535,0,1896,1923,163,0,0,0,0,0,51424,49824,21149,0,55103,0,28995,0,30828,0,980,0,33725,0
9782,999.3796,42919,53824,65535,0,1868,1895,161,-2,0,0,0,0,52984,51424,21841,692,55163,60,28991,-4,30828,0,980,0,33725,0
9783,999.4818,42919,53824,65535,0,1868,1895,161,0,0,0,0,0,52984,51424,21841,0,55163,0,28991,0,30828,0,980,0,33725,0


In [94]:
# Computing the mean vector and covariance matrix for the multivariate Gaussian analysis

def compute_mu_and_covariance(df):
    """Return the column-wise mean and the covariance matrix of ``df``.

    Parameters
    ----------
    df : DataFrame or 2-D array
        One observation per row, one feature per column.

    Returns
    -------
    tuple
        ``(mu, sigma)`` where ``mu`` is the mean taken along axis 0 and
        ``sigma`` is ``np.cov`` of the transposed data.
    """
    # np.cov treats each *row* as a variable, so hand it the transposed data.
    features_as_rows = df.T
    return np.mean(df, axis=0), np.cov(features_as_rows)

# Fit the Gaussian parameters on the training frame loaded above.
# NOTE(review): `df` still contains the `time` column at this point, whereas the
# test loop further down drops `time` before fitting — confirm whether `time`
# is really meant to be a model feature.
mu,sigma = compute_mu_and_covariance(df)

# Applying the multivariate Gaussian model to obtain the per-row probabilities

def est_mult_gaus(dataset, mu, sigma):
    """Evaluate the multivariate Gaussian density at every row of ``dataset``.

    For each row ``x`` this computes

        p(x) = exp(-0.5 * (x - mu)^T  sigma^+  (x - mu))
               / sqrt((2 * pi)^k * det(sigma))

    where ``k`` is the number of features (columns) and ``sigma^+`` is the
    Moore-Penrose pseudo-inverse of ``sigma``.

    Parameters
    ----------
    dataset : array-like, shape (n_samples, n_features)
        Observations to score (DataFrame or ndarray). A single 1-D sample is
        treated as one row.
    mu : array-like, shape (n_features,)
        Mean vector, in the same column order as ``dataset``.
    sigma : array-like, shape (n_features, n_features)
        Covariance matrix.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        One density value per row. If ``det(sigma)`` is zero or negative the
        normalising constant is not finite, so the returned values are
        ``inf``/``nan`` (NumPy emits a RuntimeWarning).
    """
    samples = np.atleast_2d(np.asarray(dataset, dtype=float))
    mean = np.asarray(mu, dtype=float)
    cov = np.atleast_2d(np.asarray(sigma, dtype=float))

    # The exponent of (2*pi) is the dimensionality of one sample (number of
    # columns), not the number of rows in the dataset.
    n_features = samples.shape[1]
    eq_part_one = 1 / np.sqrt(np.power(2 * np.pi, n_features) * np.linalg.det(cov))

    centered = samples - mean
    # Pseudo-inverse: constant columns make the covariance singular, which
    # np.linalg.inv cannot handle.
    precision = np.linalg.pinv(cov)
    # Row-wise quadratic form (x - mu)^T P (x - mu); a plain matmul of two
    # (n_samples, n_features) arrays would be a shape error.
    mahalanobis_sq = np.sum(np.matmul(centered, precision) * centered, axis=1)

    return eq_part_one * np.exp(-0.5 * mahalanobis_sq)

# Score every row of the training frame under the fitted Gaussian; the bare
# `p` on the last line makes the notebook display the result.
# NOTE(review): the saved output under this cell (printed covariance rows and a
# pinv-based warning line) does not correspond to the function as currently
# written — presumably it came from an earlier definition; re-run from a fresh
# kernel to confirm.
p = est_mult_gaus(df, mu, sigma)
p

[[ 8.33215422e+04  1.29032733e+05  1.46236434e+05  1.34228503e+05
  -9.71858748e+02  1.97304921e+03  2.24652179e+03  1.93266008e+02
  -2.41535653e+00  1.25125565e+05  1.33368672e+05  1.66667274e+05
  -8.25776134e+02  1.10980391e+04 -7.69392688e+03 -2.76542367e+03
   8.06003030e+02  7.79404690e+03 -3.61722570e+01  1.91062947e+03
   6.77747835e-01  2.74038201e+02  3.73328720e-01 -1.43266989e+04
   6.78258116e-01  1.40546454e+04 -1.04993008e+00]
 [ 1.29032733e+05  9.36874690e+08  9.31363977e+08  9.29972881e+08
   1.78022203e+06  9.43269817e+06  9.41431995e+06  7.99123213e+05
   3.59605539e+02  1.47822674e+08  1.43849774e+08  1.77154042e+08
   6.89086047e+05 -6.23570796e+08 -6.29943895e+08 -2.64776010e+08
   2.27476235e+05  9.13488864e+06  8.80338661e+04  4.27019177e+05
   1.53990747e+04 -1.84240680e+04  2.70170588e+03  7.68292589e+03
  -2.02090392e+03  1.07362147e+04 -6.90634379e+02]
 [ 1.46236434e+05  9.31363977e+08  9.38680009e+08  9.37349146e+08
   5.80976333e+05  9.43767168e+06  9.446

0       1.857403e-58
1       1.857403e-58
2       1.857403e-58
3       1.857403e-58
4       1.857403e-58
            ...     
9780    1.857403e-58
9781    1.857403e-58
9782    1.857403e-58
9783    1.857403e-58
9784    1.857403e-58
Length: 9785, dtype: float64

In [97]:
# After cross-validation tuning, which included using the best epsilon function
# created and also used for the last task, and after manual analysis and
# trial and error, we found the best epsilon to be 6000

eps = 6000

# Running the model against test dataset after getting best epsilon

test_dir = 'dataset/test'
# A forward slash is accepted by glob on every OS; the previous "\*.csv"
# pattern only matched on Windows and relied on an invalid escape sequence.
# Sorting (plain lexicographic order) makes the file -> submission-index
# mapping deterministic instead of depending on directory listing order.
test_filenames = sorted(glob.glob(test_dir + "/*.csv"))

# pred_list[k] holds one 0/1 flag per row of the k-th test file.
pred_list = []

for file_idx, test_path in enumerate(test_filenames):
    test_df = pd.read_csv(test_path).drop(['time'], axis=1)

    # NOTE(review): the Gaussian is re-fitted on each test file rather than
    # reusing the parameters fitted on the training set. This is kept as-is
    # because eps was tuned against this behaviour — confirm it is intended.
    # Separate names are used so the training `mu` / `sigma` are not clobbered.
    file_mu, file_sigma = compute_mu_and_covariance(test_df)
    scores = est_mult_gaus(test_df, file_mu, file_sigma)

    # 1 where the density falls below eps, otherwise 0. NaN compares False,
    # so it maps to 0 exactly as the element-by-element `if` did.
    curr_pred_list = np.where(np.asarray(scores) < eps, 1, 0).tolist()
    pred_list.append(curr_pred_list)
    print('Done with file ' + str(file_idx))
    

[[ 9.62284670e+08  9.55413287e+08  2.69904310e+05  2.69904310e+05
   9.94175000e+06  9.91346758e+06  8.41486591e+05  4.23795731e+02
   1.55673637e+08  1.46862529e+08  1.78624315e+08  1.02125205e+06
  -6.49882946e+08 -6.56268097e+08 -2.75798628e+08  2.46389051e+05
   9.72742487e+06  9.06531169e+04  4.09614967e+05  1.60047293e+04
  -2.84070236e+04  2.77320791e+03  8.45264202e+04 -2.06438407e+03
  -5.60244907e+04 -7.21799703e+02]
 [ 9.55413287e+08  9.61158082e+08  2.70313546e+05  2.70313546e+05
   9.92483559e+06  9.92364639e+06  8.42349218e+05  2.02247287e+02
   1.60881292e+08  1.53820827e+08  1.87399229e+08  8.83615232e+05
  -6.41451021e+08 -6.49352686e+08 -2.72722582e+08  3.21158109e+05
   1.05859383e+07  8.92484406e+04  5.60909078e+05  1.58805332e+04
  -1.34397421e+03  2.80790579e+03  6.33500500e+04 -2.09261313e+03
  -6.20080495e+04 -7.25638497e+02]
 [ 2.69904310e+05  2.70313546e+05  4.38382793e+05  4.38382793e+05
   4.50389746e+03  4.53718550e+03  3.86181004e+02 -6.14573640e-02
   4.4

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


Done with file 9
[[ 9.69472776e+08  9.63844370e+08  9.63540265e+08  1.79492334e+06
   9.94157738e+06  9.91443039e+06  8.42509217e+05  4.40851431e+02
   1.57213036e+08  1.56376933e+08  1.89251145e+08  2.48814622e+05
  -6.52819084e+08 -6.58734201e+08 -2.77232977e+08  2.04796412e+05
   8.36454334e+06  8.64657121e+04  3.82641295e+05  1.60731633e+04
  -2.72111355e+04  2.80955296e+03  1.22677346e+04 -1.18679226e+03
   1.51783695e+04 -1.62766060e+03]
 [ 9.63844370e+08  9.70674332e+08  9.70427944e+08  5.95503135e+05
   9.94783775e+06  9.94773078e+06  8.45371021e+05  2.15917150e+02
   1.57626054e+08  1.57003367e+08  1.89963943e+08  1.94140511e+05
  -6.46086133e+08 -6.53507828e+08 -2.74876803e+08  2.70537998e+05
   9.21103449e+06  8.84535396e+04  5.33596145e+05  1.59973013e+04
  -1.74385084e+02  2.85551784e+03  8.03069989e+02 -1.21034181e+03
  -4.56346807e+02 -1.65468114e+03]
 [ 9.63540265e+08  9.70427944e+08  9.74856424e+08  7.60529954e+05
   9.88047429e+06  9.88110103e+06  8.39689479e+05  2.09

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


[[ 9.63372012e+08  9.56445117e+08  8.33584004e+08  1.63579527e+06
   9.91262889e+06  9.88382870e+06  8.39268142e+05  4.28468247e+02
   1.54750190e+08  1.51193215e+08  1.85726367e+08  6.06699564e+05
  -6.50266284e+08 -6.56697532e+08 -2.76248107e+08  2.44033088e+05
   8.81020277e+06  8.84394582e+04  4.11899792e+05  1.60445398e+04
  -2.89279457e+04  2.79117045e+03  8.66374089e+04 -2.07134444e+03
  -5.80818541e+04 -7.23688689e+02]
 [ 9.56445117e+08  9.62114520e+08  8.39481835e+08  7.09116673e+05
   9.90384519e+06  9.90211063e+06  8.40827567e+05  2.09143286e+02
   1.57914494e+08  1.54641687e+08  1.90165222e+08  5.35002397e+05
  -6.41802580e+08 -6.49742803e+08 -2.73154971e+08  3.18505858e+05
   9.66237581e+06  8.86967688e+04  5.62499945e+05  1.59191708e+04
  -1.86199668e+03  2.83124804e+03  6.57271708e+04 -2.09829259e+03
  -6.42453044e+04 -7.31526304e+02]
 [ 8.33584004e+08  8.39481835e+08  7.38858290e+08  7.15246590e+05
   8.57777559e+06  8.57773163e+06  7.28357333e+05  1.70027419e+02
   1.3

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


[[ 9.62440319e+08  9.55741715e+08  9.54454598e+08  1.88078571e+06
   9.92223249e+06  9.89353388e+06  0.00000000e+00  0.00000000e+00
   1.44618098e+08  1.36427009e+08  1.66312214e+08  8.77519589e+05
  -6.50047350e+08 -6.56437601e+08 -2.75866423e+08  2.49920080e+05
   9.71536588e+06  9.55744366e+04  4.10633655e+05  1.60389726e+04
  -2.86577068e+04  2.78584768e+03  8.46257280e+04 -2.06386223e+03
  -5.57631048e+04 -7.21477584e+02]
 [ 9.55741715e+08  9.61621052e+08  9.61334159e+08  7.93415685e+05
   9.91433275e+06  9.91280654e+06  0.00000000e+00  0.00000000e+00
   1.51958665e+08  1.43345996e+08  1.75816978e+08  9.62972673e+05
  -6.41593221e+08 -6.49500601e+08 -2.72771665e+08  3.24308631e+05
   1.06175481e+07  9.45630648e+04  5.61726362e+05  1.58987719e+04
  -1.49978062e+03  2.82341049e+03  6.38050573e+04 -2.09045049e+03
  -6.20579188e+04 -7.29496751e+02]
 [ 9.54454598e+08  9.61334159e+08  9.65786681e+08  8.72212732e+05
   9.84118427e+06  9.84145354e+06  0.00000000e+00  0.00000000e+00
   1.5

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


Done with file 33
[[ 9.62893149e+08  9.56396083e+08  9.55032941e+08  1.91017485e+06
   9.91297552e+06  9.88440056e+06  8.39055841e+05  3.89446105e+02
   1.60000294e+08  1.54779578e+08  0.00000000e+00  0.00000000e+00
  -6.49824442e+08 -6.56275930e+08 -2.76126463e+08  2.52693691e+05
   8.73600072e+06  8.65440145e+04  4.16920146e+05  1.60143495e+04
  -2.84788245e+04  2.78025912e+03  8.60421675e+04 -2.06656875e+03
  -5.66502587e+04 -7.16480612e+02]
 [ 9.56396083e+08  9.62454026e+08  9.62114341e+08  7.89756624e+05
   9.90032145e+06  9.89858995e+06  8.40244907e+05  1.78839403e+02
   1.62099519e+08  1.56988686e+08  0.00000000e+00  0.00000000e+00
  -6.41322782e+08 -6.49275479e+08 -2.73019998e+08  3.25875562e+05
   9.55171390e+06  8.74964592e+04  5.66717701e+05  1.58673288e+04
  -1.63593878e+03  2.81341773e+03  6.52769857e+04 -2.09437367e+03
  -6.27846586e+04 -7.23397839e+02]
 [ 9.55032941e+08  9.62114341e+08  9.66606570e+08  8.81692791e+05
   9.82694426e+06  9.82733926e+06  8.34201353e+05  1.5

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


[[ 9.62284670e+08  9.55413287e+08  2.69904310e+05  2.69904310e+05
   9.94175000e+06  9.91346758e+06  8.41486591e+05  4.23795731e+02
   1.55673637e+08  1.46862529e+08  1.78624315e+08  1.02125205e+06
  -6.49882946e+08 -6.56268097e+08 -2.75798628e+08  2.46389051e+05
   9.72742487e+06  9.06531169e+04  4.09614967e+05  1.60047293e+04
  -2.84070236e+04  2.77320791e+03  8.45264202e+04 -2.06438407e+03
  -5.60244907e+04 -7.21799703e+02]
 [ 9.55413287e+08  9.61158082e+08  2.70313546e+05  2.70313546e+05
   9.92483559e+06  9.92364639e+06  8.42349218e+05  2.02247287e+02
   1.60881292e+08  1.53820827e+08  1.87399229e+08  8.83615232e+05
  -6.41451021e+08 -6.49352686e+08 -2.72722582e+08  3.21158109e+05
   1.05859383e+07  8.92484406e+04  5.60909078e+05  1.58805332e+04
  -1.34397421e+03  2.80790579e+03  6.33500500e+04 -2.09261313e+03
  -6.20080495e+04 -7.25638497e+02]
 [ 2.69904310e+05  2.70313546e+05  4.38382793e+05  4.38382793e+05
   4.50389746e+03  4.53718550e+03  3.86181004e+02 -6.14573640e-02
   4.4

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


[[ 9.63372012e+08  9.56445117e+08  8.33584004e+08  1.63579527e+06
   9.91262889e+06  9.88382870e+06  8.39268142e+05  4.28468247e+02
   1.54750190e+08  1.51193215e+08  1.85726367e+08  6.06699564e+05
  -6.50266284e+08 -6.56697532e+08 -2.76248107e+08  2.44033088e+05
   8.81020277e+06  8.84394582e+04  4.11899792e+05  1.60445398e+04
  -2.89279457e+04  2.79117045e+03  8.66374089e+04 -2.07134444e+03
  -5.80818541e+04 -7.23688689e+02]
 [ 9.56445117e+08  9.62114520e+08  8.39481835e+08  7.09116673e+05
   9.90384519e+06  9.90211063e+06  8.40827567e+05  2.09143286e+02
   1.57914494e+08  1.54641687e+08  1.90165222e+08  5.35002397e+05
  -6.41802580e+08 -6.49742803e+08 -2.73154971e+08  3.18505858e+05
   9.66237581e+06  8.86967688e+04  5.62499945e+05  1.59191708e+04
  -1.86199668e+03  2.83124804e+03  6.57271708e+04 -2.09829259e+03
  -6.42453044e+04 -7.31526304e+02]
 [ 8.33584004e+08  8.39481835e+08  7.38858290e+08  7.15246590e+05
   8.57777559e+06  8.57773163e+06  7.28357333e+05  1.70027419e+02
   1.3

  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))
  p = 1/((2*np.pi)**(m/2)*np.linalg.det(covariance)**(0.5))*np.exp(-0.5*np.sum(X.dot(np.linalg.pinv(covariance))*X,axis=1))


Done with file 56
[[ 9.62026052e+08  9.55555807e+08  9.54122780e+08  1.88323662e+06
   9.89246201e+06  9.86435222e+06  8.37509808e+05  4.11066524e+02
   1.51734832e+08  1.44509401e+08  1.55659399e+08  8.93512269e+05
  -6.50139383e+08 -6.56546168e+08 -2.75971891e+08  2.45583691e+05
   9.43955938e+06  9.20401008e+04  4.10180752e+05  1.60045041e+04
  -2.78350256e+04  2.78991987e+03  8.52497975e+04 -2.06117833e+03
  -5.71910075e+04 -7.17239548e+02]
 [ 9.55555807e+08  9.61624114e+08  9.61283629e+08  7.91794052e+05
   9.88802692e+06  9.88687793e+06  8.39398108e+05  1.90918190e+02
   1.56704581e+08  1.50656158e+08  1.62613139e+08  7.39577377e+05
  -6.41706383e+08 -6.49618055e+08 -2.72885557e+08  3.18946289e+05
   1.03192693e+07  9.11093196e+04  5.60358406e+05  1.58726441e+04
  -8.73903759e+02  2.82939098e+03  6.44649070e+04 -2.09004139e+03
  -6.32750366e+04 -7.28226801e+02]
 [ 9.54122780e+08  9.61283629e+08  9.65716867e+08  8.72621509e+05
   9.81443370e+06  9.81543194e+06  8.33345217e+05  1.6

In [98]:
# Writing the results to submission txt file and also printing the results here

import collections

# Build one "<file index> <label>" record per test file, echoing each record
# to the notebook output as it is produced.
records = []
for file_idx, file_preds in enumerate(pred_list):
    frequency = collections.Counter(file_preds)
    # NOTE(review): max() over a Counter iterates its *keys*, so this is the
    # largest label present (1 as soon as any row was flagged), not the most
    # frequent one. Behaviour is kept unchanged; if a majority vote was
    # intended, use frequency.most_common(1)[0][0] instead.
    file_label = max(frequency)
    record = str(file_idx) + ' ' + str(file_label)
    print(record)
    records.append(record)

# Terminate every record with a newline: previously the records were
# concatenated back to back, so the file was one run-together line that did
# not match the printed output.
content = ''.join(record + '\n' for record in records)

# The context manager closes the file even if the write fails.
with open('task3_submission.txt', 'w') as file:
    file.write(content)

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 0
10 1
11 1
12 1
13 0
14 1
15 0
16 0
17 0
18 1
19 1
20 1
21 1
22 1
23 1
24 0
25 1
26 1
27 0
28 0
29 0
30 1
31 0
32 0
33 1
34 0
35 1
36 0
37 0
38 1
39 0
40 1
41 1
42 0
43 1
44 0
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 0
53 1
54 1
55 0
56 1
57 1
