# Triage Data Set

#### dev.csv is the test data, train.csv is the train data

In [3]:
import pandas as pd
import numpy as np
from google.colab import files
uploaded = files.upload()

Saving dev.csv to dev (1).csv
Saving train.csv to train (1).csv


In [4]:
triage_train_data = pd.read_csv('train.csv', sep='|')
triage_test_data = pd.read_csv('dev.csv', sep='|')


# group Labels

In [5]:
# train set label groups
label_groups = triage_train_data['Label'].groupby(triage_train_data['Label'])
print(label_groups.count())

Label
0    12361
1     8685
Name: Label, dtype: int64


In [6]:
# test set label groups
label_groups = triage_test_data['Label'].groupby(triage_test_data['Label'])
print(label_groups.count())

Label
0    1525
1    1048
Name: Label, dtype: int64


In [7]:
# print information about the train set
print(triage_train_data.head(10))
print(triage_train_data.shape)
print(triage_train_data['Label'].unique())

                                                Text  Label
0  i would like to know when the national archive...      0
1  the strongest wind force measured near the cen...      0
2  i need help because i lost my mother and my fa...      1
3  i would like to send some informations for me ...      0
4  in addition it intends to concentrate on promo...      0
5  while the total impact on grain production is ...      1
6  researchers from the university of oxford and ...      0
7  it will also construct the 1 000 cubic metre w...      0
8  an important effort to enhance protection part...      1
9  priority activities of the organisation includ...      0
(21046, 2)
[0 1]


In [8]:
# print information about the test set
print(triage_test_data.head(10))
print(triage_test_data.shape)
print(triage_test_data['Label'].unique())

                                                Text  Label
0  the tremor measured 5 8 on the richter scale t...      0
1  in trincomalee where nearly 361 people died th...      1
2  i would like to know if someone did not go to ...      0
3         holy moley this storm is going to be scary      0
4  small grants programme is not eligible to fund...      0
5  an analysis of meteosat satellite imagery noaa...      0
6                   notes this message is incomplete      0
7  this has greatly improved the condition of cro...      0
8  i make you to know that now the sms service 46...      0
9  i am in the street i have a house all my cards...      0
(2573, 2)
[0 1]


# Data set is already split betweem train and test sets.

# BOW and Tf-Idf

In [9]:
# Fit and transform X_train using Tfidf Vectorizer with default parameters
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train = triage_train_data['Text']
y_train = triage_train_data['Label']

X_test = triage_test_data['Text']
y_test = triage_test_data['Label']

X_train_tfidf = vectorizer.fit_transform(X_train.values.astype('U')).toarray() # form of BOW


print('tfidf train shape:', X_train_tfidf.shape)
# (# training docs, # of unique vocab)
print('tfidf train type:', X_train_tfidf.dtype)

print('\n', X_train_tfidf)

tfidf train shape: (21046, 31211)
tfidf train type: float64

 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
X_test_tfidf = vectorizer.transform(X_test.values.astype('U')).toarray()

print('tfidf train shape:', X_test_tfidf.shape)
# (# training docs, # of unique vocab)
print('tfidf train type:', X_test_tfidf.dtype)
print('\ntfidf test:', X_test_tfidf)

tfidf train shape: (2573, 31211)
tfidf train type: float64

tfidf test: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Model or Classifier

Classification: build a model (here using Naive Bayes) on train data and labels then predict the labels of the test set

In [11]:

# train = build the model (fit) on the train set
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)  # model building phase
    
# test results = predict on test set data, to get predicted labels
predicted = clf.predict(X_test_tfidf)
print(predicted)

[0 1 0 ... 0 1 0]


In [12]:

from sklearn import metrics

display (metrics.accuracy_score(y_test, predicted))

# display (metrics.confusion_matrix(y_test, predicted))

display (metrics.precision_score(y_test, predicted))

display (metrics.recall_score(y_test, predicted))


0.7516517683637777

0.7284916201117319

0.6221374045801527

# Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# use count vectorizer to get the BOW matrix with frequency of term per doc
count_vectorizer = CountVectorizer()

# train count vectorizer
x_train_count_vectorizer = count_vectorizer.fit_transform(X_train.values.astype('U'))

# test count vectorizer
x_test_count_vectorizer = count_vectorizer.transform(X_test.values.astype('U'))


print('tfidf train shape:', x_train_count_vectorizer.shape)
# (# training docs, # of unique vocab)
print('tfidf train type:', x_train_count_vectorizer.dtype)
print('\n', x_train_count_vectorizer.toarray())

print(x_test_count_vectorizer.toarray())

tfidf train shape: (21046, 31211)
tfidf train type: int64

 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Model or Classifier

Classification: build a model (here using Naive Bayes) on train data and labels then predict the labels of the test set

In [14]:
# train = build the model (fit) on the train set
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train_count_vectorizer, y_train)  # model building phase
    
# test results = predict on test set data, to get predicted labels
predicted = clf.predict(x_test_count_vectorizer)
print(predicted)

[0 1 0 ... 0 1 0]


In [15]:
from sklearn import metrics

display (metrics.accuracy_score(y_test, predicted))

display (metrics.confusion_matrix(y_test, predicted))

display (metrics.precision_score(y_test, predicted))

display (metrics.recall_score(y_test, predicted))

0.7396035755926934

array([[1096,  429],
       [ 241,  807]])

0.6529126213592233

0.7700381679389313