<a href="https://colab.research.google.com/github/feliciahf/data_science_exam/blob/main/hippocorpus_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [28]:
import pandas as pd
import numpy as np

# Preprocessing
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Naive Bayes model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Accuracies
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support as score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import data

In [18]:
# import csv file as dataframe (from GitHub repo)
url = 'https://raw.githubusercontent.com/feliciahf/data_science_exam/main/hippoCorpusV2.csv'
df = pd.read_csv(url, encoding='latin1')

In [19]:
# remove columns with uninformative information (AssignmentId, WorkerId, recAgnPairId, recImgPairId)
uninformative_cols = ["AssignmentId", "WorkerId", "recAgnPairId", "recImgPairId"]
df = df.drop(columns=uninformative_cols)

In [20]:
# make labels column using numerical values
df.memType = pd.Categorical(df.memType)
df['label'] = df.memType.cat.codes

In [21]:
# story type corresponding to label
print(f"Label 0: {df.loc[df['label'] == 0,'memType'].unique()}")
print(f"Label 1: {df.loc[df['label'] == 1,'memType'].unique()}")
print(f"Label 2: {df.loc[df['label'] == 2,'memType'].unique()}")

Label 0: ['imagined']
Categories (1, object): ['imagined']
Label 1: ['recalled']
Categories (1, object): ['recalled']
Label 2: ['retold']
Categories (1, object): ['retold']


# Preprocessing

In [22]:
# case collapsing
df['story'] = df.story.map(lambda x: x.lower())
# remove punctuation
df['story'] = df.story.str.replace('[^\w\s]', '')
# tokenization
df['story'] = df['story'].apply(nltk.word_tokenize)

In [23]:
df['story']

0       [concerts, are, my, most, favorite, thing, and...
1       [the, day, started, perfectly, with, a, great,...
2       [it, seems, just, like, yesterday, but, today,...
3       [five, months, ago, my, niece, and, nephew, we...
4       [about, a, month, ago, i, went, to, burning, m...
                              ...                        
6849    [my, dog, was, diagnosed, with, lymphoma, a, y...
6850    [over, my, vacation, from, my, job, i, went, t...
6851    [this, event, was, a, birthday, party, for, my...
6852    [this, event, occurred, about, two, weeks, ago...
6853    [over, the, past, year, i, have, been, involve...
Name: story, Length: 6854, dtype: object

In [24]:
# transform data into occurrences
# This converts the list of words into space-separated strings
df['story'] = df['story'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['story'])

In [25]:
# tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

# Training NB model

In [None]:
# split data into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.2, random_state=69)

In [26]:
# Fit model
model = MultinomialNB().fit(X_train, y_train)
# Test model
predicted = model.predict(X_test)

# Evaluating NB model

In [29]:
# compute overall accuracy, precision, recall, f1 scores
print('Accuracy: ', accuracy_score(y_test, predicted))
print('Precision: ', precision_score(y_test, predicted, average='weighted', zero_division=1))
print('Recall: ', recall_score(y_test, predicted, average='weighted', zero_division=1))
print('F1:', f1_score(y_test, predicted, average='weighted'))

Accuracy:  0.5200583515681984
Precision:  0.6305268108008216
Recall:  0.5200583515681984
F1: 0.4632705065662839


In [30]:
# precision, recall, fscore, support separated by label
precision, recall, fscore, support = score(y_test, predicted)

df_acc = pd.DataFrame()
df_acc['precision']=pd.Series(precision)
df_acc['recall']=pd.Series(recall)
df_acc['fscore']=pd.Series(fscore)
df_acc['support']=pd.Series(support)

print(df_acc)

   precision    recall    fscore  support
0   0.613588  0.521661  0.563902      554
1   0.471111  0.766727  0.583620      553
2   0.000000  0.000000  0.000000      264


  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# examine class distribution (in test dataset)
y_test.value_counts()

0    554
1    553
2    264
Name: label, dtype: int64