# Coronavirus Data Set

#### dev.csv is the test data; train.csv is the training data

In [1]:
import pandas as pd
import numpy as np
from google.colab import files
# Colab-only: open the browser upload widget and keep its return value in `uploaded`.
# The saved cell output shows dev.csv and train.csv being stored under those same names,
# which are the relative paths the next cell reads.
uploaded = files.upload()

Saving dev.csv to dev.csv
Saving train.csv to train.csv


In [2]:
# Both files are pipe-delimited; read the training file first, then the dev (test) file.
coronavirus_train_data, coronavirus_test_data = (
    pd.read_csv(csv_name, sep='|') for csv_name in ('train.csv', 'dev.csv')
)

# Group Labels

In [3]:
# Class balance of the training set: number of rows for each Label value
train_labels = coronavirus_train_data['Label']
label_groups = train_labels.groupby(train_labels)
print(label_groups.count())

Label
0    38837
1    41163
Name: Label, dtype: int64


In [4]:
# Class balance of the test (dev) set: number of rows for each Label value
test_labels = coronavirus_test_data['Label']
label_groups = test_labels.groupby(test_labels)
print(label_groups.count())

Label
0    4963
1    5037
Name: Label, dtype: int64


# Data set is already split between train and test sets.

# BOW and Tf-Idf

In [5]:
# Build TF-IDF features: fit a default TfidfVectorizer on the training text,
# then apply the same fitted vocabulary and IDF weights to the test text.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train = coronavirus_train_data['Text']
y_train = coronavirus_train_data['Label']

X_test = coronavirus_test_data['Text']
y_test = coronavirus_test_data['Label']

# astype('U') casts every entry to a unicode string, presumably to guard against
# non-string values (e.g. NaN) in the Text column -- TODO confirm.
#
# The vectorizer output is kept as a SciPy sparse matrix instead of calling
# .toarray(): a dense (documents x vocabulary) array is almost entirely zeros and
# can exhaust memory, while MultinomialNB.fit/predict accept sparse input directly.
# This also matches the CountVectorizer cell further down, which stays sparse.
X_train_tfidf = vectorizer.fit_transform(X_train.values.astype('U'))  # TF-IDF weighted BOW

# transform only (no fit) so the test set never influences the vocabulary or IDF weights
X_test_tfidf = vectorizer.transform(X_test.values.astype('U'))

# Model or Classifier

Classification: build a model (here, Naive Bayes) on the training data and labels, then predict the labels of the test set.

In [6]:

# Training phase: fit a multinomial Naive Bayes model on the TF-IDF training features.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)  # fit() returns the fitted estimator

# Testing phase: predict a label for every row of the TF-IDF test features.
predicted = clf.predict(X_test_tfidf)

In [7]:
from sklearn import metrics

# Score whatever `predicted` currently holds against the test labels (the TF-IDF
# model when the cells are run in order). Outputs appear in this order:
# accuracy, confusion matrix, precision, recall.
for metric_fn in (
    metrics.accuracy_score,
    metrics.confusion_matrix,
    metrics.precision_score,
    metrics.recall_score,
):
    display(metric_fn(y_test, predicted))


0.8036

array([[4069,  894],
       [1070, 3967]])

0.8160872248508537

0.787571967440937

# Count Vectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Plain bag-of-words: the matrix holds the frequency of each term per document.
count_vectorizer = CountVectorizer()

# Cast both text columns to unicode string arrays before vectorizing.
train_texts = X_train.values.astype('U')
test_texts = X_test.values.astype('U')

# Learn the vocabulary from the training text, then reuse it unchanged on the test text.
x_train_count_vectorizer = count_vectorizer.fit_transform(train_texts)
x_test_count_vectorizer = count_vectorizer.transform(test_texts)


# Model or Classifier

Classification: build a model (here, Naive Bayes) on the training data and labels, then predict the labels of the test set.

In [9]:
# Training phase: fit a multinomial Naive Bayes model on the term-count training features.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(x_train_count_vectorizer, y_train)  # fit() returns the fitted estimator

# Testing phase: predict a label for every row of the term-count test features and show them.
predicted = clf.predict(x_test_count_vectorizer)
print(predicted)

[1 0 0 ... 0 1 1]


In [10]:
from sklearn import metrics

# Score whatever `predicted` currently holds against the test labels (the
# CountVectorizer model when the cells are run in order). Outputs appear in this
# order: accuracy, confusion matrix, precision, recall.
for metric_fn in (
    metrics.accuracy_score,
    metrics.confusion_matrix,
    metrics.precision_score,
    metrics.recall_score,
):
    display(metric_fn(y_test, predicted))

0.7932

array([[4025,  938],
       [1130, 3907]])

0.8063983488132095

0.7756601151479054