# Having a look at the data before running ML algorithms

In [1]:
import helpers as hlp
# Relative paths to the training and test CSV files.
path_train = './data/train.csv'
path_test = './data/test.csv'

# load_train yields two objects, unpacked here as X and y
# (presumably the feature table and the class labels -- confirm in helpers).
X, y = hlp.load_train(path_train)
# load_test yields a single object holding the test data.
test = hlp.load_test(path_test)

In [2]:
# Compare the .shape of the training features with the .shape of the test set.
print(f'Dims of train :{X.shape}, dims of test: {test.shape}')

Dims of train :(90847, 770), dims of test: (44747, 770)


The test set is almost half the size of the training set (44,747 vs. 90,847 rows).
There are a lot of features, too (770), but the sample size should be sufficient for this dimensionality.

In [3]:
# Let's look at the class distribution: count how often each label occurs in y.
# value_counts() lists the most frequent label first by default.
print(y.value_counts())

7     81399
4      2928
11     1220
5       954
6       822
1       631
9       563
0       531
8       505
10      428
12      308
3       252
13      180
2       126
Name: class_label, dtype: int64


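Ugh, that's some harsh class imbalance! Class 7 alone accounts for roughly 90% of the training samples. Our evaluation metric, weighted $F_1$, weights each class's $F_1$ score by its number of samples, so the classes above are listed in order of their influence on the metric.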

In [4]:
# Let's look at the head of the training features.
# The columns look like an embedding, so glancing at raw values offers little
# insight -- except that the last two features might be word counts /
# total character counts (an unconfirmed guess).
# Leaving the frame as the cell's last expression lets the notebook render it
# as a table instead of the line-wrapped plain text that print() produces.
X.head(10)

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.468241  -0.298164   0.232144   0.236039  -0.121149  -0.154046   
1   0.258637   0.299068  -0.513382   0.975493  -0.081143  -0.188614   
2   0.537912   0.173653   0.144184  -0.404364  -0.487603  -0.381795   
3  -0.034786   0.129126   0.832070  -0.008820  -0.345535  -0.799636   
4   0.245363  -0.316505   0.092601  -0.151767  -0.252607  -0.144336   
5   0.647218  -0.751298   0.767836   0.460903  -0.687291   0.220317   
6  -0.034786   0.129126   0.832070  -0.008820  -0.345535  -0.799636   
7   1.109431  -0.063879  -0.238730   0.544430  -0.049863   0.185409   
8   0.481202  -0.527403  -0.002329  -0.506448   0.018539  -0.832392   
9   0.149740   0.153304   0.019636  -0.021915  -0.618835  -0.617984   

   feature_6  feature_7  feature_8  feature_9     ...       feature_760  \
0  -0.316271   0.568260   0.215421  -0.330563     ...         -0.305612   
1  -0.963915   0.800069  -0.592174  -0.770610     ...         -0.102

In [5]:
# ...and the tail of the training features.
# The frame is the cell's last expression so the notebook renders it as a
# table rather than the line-wrapped plain text that print() produces.
X.tail(10)

       feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
90837   0.211207  -0.009690  -0.313616  -0.516867   0.004674  -0.907727   
90838   0.463171  -0.062059   0.313066  -0.310785  -0.208178  -0.404631   
90839   0.576993  -0.487787   1.701951   1.256900  -0.004793  -0.587477   
90840  -0.034786   0.129126   0.832070  -0.008820  -0.345535  -0.799636   
90841   0.402460   0.824382   0.247189   0.195181   0.060316   0.350839   
90842   0.291578  -0.080009   0.282752  -0.285583  -0.234645  -0.441086   
90843   0.731541   0.333620  -0.433959   0.681171  -0.235746  -0.788672   
90844   0.324741   0.337772  -0.089793   0.014846  -0.112630  -0.561022   
90845   0.531690   0.120806  -0.080768   0.250866  -0.081647  -0.227169   
90846   0.215008  -0.005489   0.399295  -0.702625   0.098100  -1.694497   

       feature_6  feature_7  feature_8  feature_9     ...       feature_760  \
90837  -0.807024   0.006060  -0.017045  -0.469801     ...         -0.282040   
90838  -0.246174

In [6]:
# Head of the test data, for a side-by-side sanity check against the
# training features. The frame is the cell's last expression so the notebook
# renders it as a table rather than line-wrapped plain text from print().
test.head(10)

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.291123  -0.253527   0.083138   0.135609  -0.076418  -0.633257   
1   0.163060   0.234249  -0.548482  -0.342477  -0.249341  -0.985170   
2   0.703451   0.098007   0.544434   0.014882   0.518644  -1.115419   
3   0.061817   0.186173   0.763211   1.427817  -0.001758   0.213115   
4   0.899542  -0.470085   0.829839  -0.233863  -0.282362  -0.952958   
5   0.279203  -0.042705   0.274041   0.080256  -0.395341  -0.154649   
6   0.088833   0.554278  -0.015305  -0.234385  -0.346961  -0.366256   
7  -0.118167   0.077198   0.109425   0.032560  -0.459996  -0.484399   
8   0.819473   0.205875   1.044265  -0.261501  -0.080032  -0.513911   
9   0.660235   0.118785  -0.341049   0.301321  -0.282937   0.761635   

   feature_6  feature_7  feature_8  feature_9     ...       feature_760  \
0  -0.246189  -0.042812   0.110341  -0.698179     ...         -0.180544   
1   0.162305  -0.148927   0.142731  -0.011801     ...          0.069