# Triage Data Set

In [None]:
import pandas as pd
import numpy as np

# google.colab exists only inside a Colab runtime. Import it defensively so
# the notebook still runs top-to-bottom on a plain Jupyter kernel, where
# train.csv and dev.csv are expected to already be in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

# In Colab this opens the upload prompt (the recorded run uploaded dev.csv and
# train.csv). Outside Colab there is nothing to upload, so fall back to an
# empty dict so that `uploaded` is always defined.
uploaded = files.upload() if files is not None else {}

Saving dev.csv to dev.csv
Saving train.csv to train.csv


In [None]:
# Read the pipe-delimited train and dev files into DataFrames; dev.csv serves
# as the test split throughout this notebook.
triage_train_data, triage_test_data = (
    pd.read_csv(csv_name, sep='|') for csv_name in ('train.csv', 'dev.csv')
)

# Pull the message text and its label out of each split as separate Series.
triage_train_text, triage_train_label = (
    triage_train_data['Text'],
    triage_train_data['Label'],
)
triage_test_text, triage_test_label = (
    triage_test_data['Text'],
    triage_test_data['Label'],
)


# Group Labels


In [None]:
# Class balance of the training split: number of rows for each label value.
label_groups = triage_train_data.groupby('Label')['Label']
print(label_groups.count())

Label
0    12361
1     8685
Name: Label, dtype: int64


In [None]:
# Class balance of the test (dev) split: number of rows for each label value.
# Note: this rebinds `label_groups`, which previously held the training groups.
label_groups = triage_test_data.groupby('Label')['Label']
print(label_groups.count())

Label
0    1525
1    1048
Name: Label, dtype: int64


In [None]:
# Overview of the training split, one item per line:
# the first 10 rows, the (rows, columns) shape, and the distinct label values.
print(
    triage_train_data.head(10),
    triage_train_data.shape,
    triage_train_data['Label'].unique(),
    sep='\n',
)

                                                Text  Label
0  i would like to know when the national archive...      0
1  the strongest wind force measured near the cen...      0
2  i need help because i lost my mother and my fa...      1
3  i would like to send some informations for me ...      0
4  in addition it intends to concentrate on promo...      0
5  while the total impact on grain production is ...      1
6  researchers from the university of oxford and ...      0
7  it will also construct the 1 000 cubic metre w...      0
8  an important effort to enhance protection part...      1
9  priority activities of the organisation includ...      0
(21046, 2)
[0 1]


In [None]:
# Overview of the test (dev) split, one item per line:
# the first 10 rows, the (rows, columns) shape, and the distinct label values.
print(
    triage_test_data.head(10),
    triage_test_data.shape,
    triage_test_data['Label'].unique(),
    sep='\n',
)

                                                Text  Label
0  the tremor measured 5 8 on the richter scale t...      0
1  in trincomalee where nearly 361 people died th...      1
2  i would like to know if someone did not go to ...      0
3         holy moley this storm is going to be scary      0
4  small grants programme is not eligible to fund...      0
5  an analysis of meteosat satellite imagery noaa...      0
6                   notes this message is incomplete      0
7  this has greatly improved the condition of cro...      0
8  i make you to know that now the sms service 46...      0
9  i am in the street i have a house all my cards...      0
(2573, 2)
[0 1]


# The data set is already split into train and test sets.



# Bag of Words (BOW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): train_test_split is not used in the visible cells; the data
# arrives pre-split. Confirm before removing the import.
from sklearn.model_selection import train_test_split


def _as_unicode_docs(text_series):
    """Return the Series' underlying values cast to a NumPy unicode ('U') array."""
    return text_series.values.astype('U')


# Bag-of-words counts. The vocabulary is fitted on the training text only and
# then reused as-is to encode the test text.
vectorizer = CountVectorizer()
Xtrain_data_vectorized = vectorizer.fit_transform(_as_unicode_docs(triage_train_text))
Xtest_data_vectorized = vectorizer.transform(_as_unicode_docs(triage_test_text))



# Neural Network Model

In [None]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron (hidden layers of 5 and 2 units) optimised with
# L-BFGS and trained on the bag-of-words count matrix.
clf = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=1000000,
    random_state=1,
)

# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(Xtrain_data_vectorized, triage_train_label)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), max_iter=1000000,
              random_state=1, solver='lbfgs')

In [None]:
from sklearn import metrics

# Predict labels for the bag-of-words test matrix with the fitted classifier.
predicted = clf.predict(Xtest_data_vectorized)

# Displayed in order: accuracy, confusion matrix, F1 score, recall.
display(
    metrics.accuracy_score(triage_test_label, predicted),
    metrics.confusion_matrix(triage_test_label, predicted),
    metrics.f1_score(triage_test_label, predicted),
    metrics.recall_score(triage_test_label, predicted),
)


0.6937427127866304

array([[1118,  407],
       [ 381,  667]])

0.6286522148916116

0.6364503816793893

# BOW and TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vectorizer's default parameters.
# Note: this rebinds `vectorizer`, replacing the CountVectorizer used earlier.
vectorizer = TfidfVectorizer()

# Fit on the training text only, then encode the test text with the same
# fitted vocabulary and weights.
Xtrain_data_tfidf = vectorizer.fit_transform(
    raw_documents=triage_train_text.values.astype('U')
)
Xtest_data_tfidf = vectorizer.transform(
    raw_documents=triage_test_text.values.astype('U')
)

# Shape is (number of training documents, size of the learned vocabulary).
print('tfidf train shape:', Xtrain_data_tfidf.shape)
print('tfidf train type:', Xtrain_data_tfidf.dtype)

# Text rendering of the sparse matrix, preceded by a blank line.
print('\n', Xtrain_data_tfidf)


tfidf train shape: (21046, 31211)
tfidf train type: float64

   (0, 30642)	0.2954894475651658
  (0, 3878)	0.42690764510139395
  (0, 30473)	0.22140206075187935
  (0, 2795)	0.5177012051739505
  (0, 18980)	0.32013914230668744
  (0, 27787)	0.09063139129711087
  (0, 30382)	0.2746355260857302
  (0, 15791)	0.25622105181925453
  (0, 28122)	0.21995840228161678
  (0, 16583)	0.23692939437884228
  (0, 30683)	0.2389731843195103
  (1, 1456)	0.17246060313592979
  (1, 12571)	0.23359158773504177
  (1, 30557)	0.0797152711905626
  (1, 13464)	0.1639933157677384
  (1, 20957)	0.15459738024329572
  (1, 15687)	0.17710863610337899
  (1, 770)	0.20858671840634693
  (1, 672)	0.1982499528434742
  (1, 24645)	0.31486584434424025
  (1, 3820)	0.5231983414873964
  (1, 3075)	0.08781602814358155
  (1, 30130)	0.09942666546452114
  (1, 19375)	0.11029734793874252
  (1, 8150)	0.20704117850532255
  :	:
  (21043, 14767)	0.21410632588611136
  (21044, 14328)	0.5126324608835391
  (21044, 13438)	0.43135771996754263
  (21044, 26461

# Neural Network Model with TF-IDF

In [None]:
from sklearn.neural_network import MLPClassifier

# Same MLP architecture as the bag-of-words model (hidden layers of 5 and 2
# units, L-BFGS solver), this time trained on the TF-IDF matrix.
# Note: this rebinds `clf`, replacing the bag-of-words classifier.
# NOTE(review): the bag-of-words model earlier in this notebook uses
# random_state=1 while this one uses 42, so the two scores differ by seed as
# well as by features. Confirm the different seed is intended.
clf = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=1000000,
    random_state=42,
)

# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(Xtrain_data_tfidf, triage_train_label)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), max_iter=1000000,
              random_state=42, solver='lbfgs')

In [None]:
from sklearn import metrics

# Predict labels for the TF-IDF test matrix with the fitted classifier.
predicted = clf.predict(Xtest_data_tfidf)

# Displayed in order: accuracy, confusion matrix, F1 score, recall.
display(
    metrics.accuracy_score(triage_test_label, predicted),
    metrics.confusion_matrix(triage_test_label, predicted),
    metrics.f1_score(triage_test_label, predicted),
    metrics.recall_score(triage_test_label, predicted),
)

0.7582588418188885

array([[1212,  313],
       [ 309,  739]])

0.7038095238095239

0.7051526717557252