In [12]:
import sys
import os

# Make the helper modules in ../scripts importable from this notebook.
# The relative path is resolved against the current working directory, so the
# notebook is expected to be run from its own folder.
SCRIPTS_DIR = os.path.abspath("../scripts")  # Adjust the path accordingly
# Guard the append so re-running this cell does not pile up duplicate entries.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [13]:
import pandas as pd
import numpy as np
from load_data import *
from preprocessing import *

# Load Data from File

In [14]:
# Locations of the tagged corpus and of the tag inventory, relative to the notebook.
corpus_path = "../data/brown-universal.txt"
tagset_path = "../data/tags-universal.txt"

# split=0.8 is handed to the loader, which returns the train/test pair.
train, test = load_tagged_sentences(corpus_path, split=0.8)
tags = load_tags(tagset_path)

In [15]:
# Report how many sentences landed in each split.
n_train_sentences = len(train)
n_test_sentences = len(test)
print("There are {} sentences in the training set.".format(n_train_sentences))
print("There are {} sentences in the testing set.".format(n_test_sentences))

There are 45872 sentences in the training set.
There are 11468 sentences in the testing set.


In [16]:
# Data used to build the count tables in the cells below.
# NOTE(review): this was meant to be a partition of `train` so that only a few
# samples are used for the initial probabilities, but no subsetting happens
# here yet -- `train_sample` is simply another name for the full training set.
train_sample = train

# Count data

In [17]:
# Build the per-tag and tag-to-tag transition count tables from the sample.
count_tables = create_count_dictionaries(train_sample)
tag_counts, tag_transition_counts = count_tables

In [18]:
# The helper function and the result it returns share the name
# `word_to_tag_counts`. After the first run the function is shadowed by its
# own result, so re-running this cell would try to call that result and fail.
# Stash the function under a private name while the public name is still
# callable; later runs reuse the stashed function.
if callable(word_to_tag_counts):
    _word_to_tag_counts_fn = word_to_tag_counts
words, word_to_tag_counts = _word_to_tag_counts_fn(train_sample)

## Make actual probability tables out of counts

In [19]:
# Turn the raw counts into the three tables consumed by the cells below.
probability_tables = create_probability_matrices(
    words,
    tags,
    tag_counts,
    tag_transition_counts,
    word_to_tag_counts,
)
transition_matrix, emission_matrix, initial_probs = probability_tables

Create a matrix for Tag -> Tag transitions

In [20]:
# Tag -> tag transition table, labelled with the tag names on both axes.
trans_matrix_df = pd.DataFrame(transition_matrix, index=list(tags), columns=list(tags))

# np.exp is applied for display only; trans_matrix_df keeps the values exactly
# as create_probability_matrices returned them. The exponentiated copy gets a
# transition-specific name rather than `ems_matrix_df`, which is the name of
# the emission table and would be misleading here.
trans_probs_df = np.exp(trans_matrix_df)
trans_probs_df

Unnamed: 0,.,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
.,0.106972,0.028205,0.065296,0.042948,0.069695,0.067517,0.083489,0.012013,0.046594,0.018388,0.07674,0.001306
ADJ,0.099533,0.05723,0.089233,0.009643,0.036974,0.005732,0.652337,0.007046,0.004,0.019883,0.017599,0.000448
ADP,0.00966,0.082837,0.020071,0.015129,0.001906,0.455123,0.258371,0.030396,0.070348,0.01437,0.041289,0.00044
ADV,0.168756,0.13579,0.142347,0.097136,0.016818,0.074073,0.032944,0.012981,0.048401,0.028996,0.241245,8.9e-05
CONJ,0.021076,0.109873,0.073325,0.091419,0.000262,0.152452,0.244788,0.019044,0.06772,0.025141,0.194277,0.000557
DET,0.012876,0.239609,0.00907,0.017393,0.000703,0.005986,0.626233,0.009746,0.010092,0.001944,0.064798,0.001433
NOUN,0.283738,0.013103,0.244528,0.026401,0.059648,0.015504,0.149549,0.008119,0.019727,0.017838,0.158225,0.000317
NUM,0.267672,0.060637,0.129899,0.020184,0.039866,0.013652,0.382077,0.020854,0.008459,0.005193,0.045896,0.000168
PRON,0.102803,0.009167,0.055582,0.053441,0.011157,0.01768,0.009167,0.001083,0.008613,0.024001,0.70718,2.5e-05
PRT,0.075859,0.018348,0.089568,0.036696,0.012121,0.082672,0.036111,0.005392,0.006645,0.012121,0.624133,1e-06


Create a matrix for Tag -> Word probabilities

In [21]:
# Tag -> word emission table: one row per tag, one column per word.
# The frame is built and passed through np.exp in a single expression.
ems_matrix_df = np.exp(
    pd.DataFrame(emission_matrix, index=list(tags), columns=list(words))
)
ems_matrix_df

Unnamed: 0,mr.,podger,had,thanked,him,gravely,",",and,now,he,...,$8.50,tab,pressed-paper,twotiming,racking,biologic,saloonkeeper,murrin,musn't,polyelectrolytes
.,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,0.395715,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
ADJ,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1.5e-05,1e-06,1e-06,1e-06,1e-06
ADP,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
ADV,1e-06,1e-06,1e-06,1e-06,1e-06,0.000134,1e-06,1e-06,0.022951,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
CONJ,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,0.756064,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
DET,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
NOUN,0.003153,8.6e-05,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,5e-06,5e-06,5e-06,1e-06,1e-06,1e-06,5e-06,5e-06,1e-06,5e-06
NUM,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
PRON,1e-06,1e-06,1e-06,1e-06,0.052787,1e-06,1e-06,1e-06,1e-06,0.192636,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
PRT,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06


Create initial probabilities matrix (the probability a sentence starts with a tag)

In [22]:
# Single-row table of sentence-start values, one column per tag.
# `list(tags)` matches how the transition and emission tables label their
# axes, and hands pandas an ordered sequence whatever container `tags` is.
initial_probs_df = pd.DataFrame([initial_probs], columns=list(tags))
# Pass the values through np.exp, as is done for the other tables.
initial_probs_df = np.exp(initial_probs_df)
initial_probs_df

Unnamed: 0,.,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
0,0.088834,0.034989,0.122188,0.090796,0.049464,0.211938,0.141437,0.016633,0.161362,0.036449,0.045409,0.000501


# Naive Bayes Model

In [None]:
class NaiveBayes:
    """Skeleton for the Naive Bayes model; it currently only stores its constructor argument."""

    def __init__(self, something):
        """Keep ``something`` on the instance.

        NOTE(review): ``something`` looks like a placeholder name -- rename it
        once the model's real inputs are decided.
        """
        self.something = something