<a href="https://colab.research.google.com/github/feliciahf/data_science_exam/blob/main/hippocorpus_NB_%2B_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [1]:
import pandas as pd
import numpy as np

# Preprocessing
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Naive Bayes model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Accuracies
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support as score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Import data

In [2]:
# import csv file as dataframe (from GitHub repo)
url = 'https://raw.githubusercontent.com/feliciahf/data_science_exam/main/hippoCorpusV2.csv'
df = pd.read_csv(url, encoding='latin1')

In [3]:
# drop retold label
df = df[df.memType != 'retold']

In [4]:
# make labels column using numerical values
df.memType = pd.Categorical(df.memType)
df['label'] = df.memType.cat.codes

# story type corresponding to label
print(f"Label 0: {df.loc[df['label'] == 0,'memType'].unique()}")
print(f"Label 1: {df.loc[df['label'] == 1,'memType'].unique()}")

Label 0: ['imagined']
Categories (1, object): ['imagined']
Label 1: ['recalled']
Categories (1, object): ['recalled']


# Preprocessing

In [5]:
# case collapsing
df['story'] = df.story.map(lambda x: x.lower())
# remove punctuation
df['story'] = df.story.str.replace('[^\w\s]', '')
# tokenization
df['story'] = df['story'].apply(nltk.word_tokenize)

In [6]:
# check whether preprocessing worked
df['story']

0       [concerts, are, my, most, favorite, thing, and...
1       [the, day, started, perfectly, with, a, great,...
2       [it, seems, just, like, yesterday, but, today,...
3       [five, months, ago, my, niece, and, nephew, we...
4       [about, a, month, ago, i, went, to, burning, m...
                              ...                        
6849    [my, dog, was, diagnosed, with, lymphoma, a, y...
6850    [over, my, vacation, from, my, job, i, went, t...
6851    [this, event, was, a, birthday, party, for, my...
6852    [this, event, occurred, about, two, weeks, ago...
6853    [over, the, past, year, i, have, been, involve...
Name: story, Length: 5535, dtype: object

In [7]:
# transform data into occurrences
# This converts the list of words into space-separated strings
df['story'] = df['story'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['story'])

In [8]:
# tf-idf
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

# Training NB model

In [9]:
# split data into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.2, random_state=69)

In [10]:
# Fit model
model = MultinomialNB().fit(X_train, y_train)
# Test model
predicted = model.predict(X_test)

# Evaluating NB model

In [11]:
# compute overall accuracy, precision, recall, f1 scores
print('Accuracy: ', accuracy_score(y_test, predicted))
print('Precision: ', precision_score(y_test, predicted, average='weighted', zero_division=1))
print('Recall: ', recall_score(y_test, predicted, average='weighted', zero_division=1))
print('F1:', f1_score(y_test, predicted, average='weighted'))

Accuracy:  0.6278229448961157
Precision:  0.6485532653841489
Recall:  0.6278229448961157
F1: 0.6150819361589105


In [12]:
# precision, recall, fscore, support (number of stories)
precision, recall, fscore, support = score(y_test, predicted)

# create dataframe with accuracies by category
df_acc = pd.DataFrame()
df_acc['precision']=pd.Series(precision)
df_acc['recall']=pd.Series(recall)
df_acc['fscore']=pd.Series(fscore)
df_acc['support']=pd.Series(support)
print(df_acc)

   precision    recall    fscore  support
0   0.704545  0.446043  0.546256      556
1   0.592053  0.811252  0.684533      551


In [13]:
# Matthews correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(y_test, predicted)

0.2762488425224322

##The SVM model

In [14]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

##Evaluate SVM model

In [18]:
# compute overall accuracy, precision, recall, f1 scores
print('Accuracy: ', accuracy_score(y_test,y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted', zero_division=1))
print('Recall: ', recall_score(y_test, y_pred, average='weighted', zero_division=1))
print('F1:', f1_score(y_test, y_pred, average='weighted'))

Accuracy:  0.7091237579042458
Precision:  0.7091441742201728
Recall:  0.7091237579042458
F1: 0.7091247073569915


In [15]:
# precision, recall, fscore, support (number of stories)
precision, recall, fscore, support = score(y_test, y_pred)

# create dataframe with accuracies by category
df_acc = pd.DataFrame()
df_acc['precision']=pd.Series(precision)
df_acc['recall']=pd.Series(recall)
df_acc['fscore']=pd.Series(fscore)
df_acc['support']=pd.Series(support)
print(df_acc)

   precision    recall    fscore  support
0   0.711957  0.706835  0.709386      556
1   0.706306  0.711434  0.708861      551


In [16]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7091237579042458


In [17]:
# Matthews correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(y_test, y_pred)

0.4182655586037596