<a href="https://colab.research.google.com/github/evilsizord/mscs-data-mining-hw2/blob/main/Data_Mining_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2: Naive Bayes Classifier
Daniel Evilsizor \
November 13, 2022

In [1]:
# Get Data from Kaggle.com

# NOTE: Requires you to have a Kaggle.com account. From your account you can generate an API key.
# It will be provided in kaggle.json. Upload the JSON file to this project BEFORE RUNNING.

# src: https://www.kaggle.com/general/74235

! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download gaveshjain/ford-sentence-classifiaction-dataset   # yes there is a typo in the actual URL
! mkdir sentence-dataset
! unzip ford-sentence-classifiaction-dataset.zip -d sentence-dataset

Downloading ford-sentence-classifiaction-dataset.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 126MB/s]
Archive:  ford-sentence-classifiaction-dataset.zip
  inflating: sentence-dataset/sample_submission.csv  
  inflating: sentence-dataset/test_data.csv  
  inflating: sentence-dataset/train_data.csv  


In [15]:
# load all necessary libraries

import os
import pandas
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np

nltk.download('punkt')  # i guess this is needed for the stemmer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
# class it up

class MyTokenizer:
  def __init__(self):
    self.stemmer = PorterStemmer()
    self.stop_words = set(stopwords.words('english'))

  def tokenize(self, doc):
    tokens = word_tokenize(doc)
    # remove punctuation and stop words
    # ref: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    # Also, convert to a set so we only get unique words per row (no duplicates)
    words = set()
    for tok in tokens:
      word = self.stemmer.stem(tok)
      if not word.lower() in self.stop_words:
        if not word in '.,':
          words.add(word)
    return words


class NaiveBayesClassifier:

  def __init__(self, vocab, categories, tokenizer):
    self.class_probs = {}     # p(class)
    self.categories = categories    # [class1, class2, ..]
    self.word_probs = {}      # p(w|class)
    self.category_counts = {}
    self.vocab = {}
    self.tokenize = tokenizer.tokenize

    for cat in categories:
      self.class_probs[cat] = 1
      self.category_counts[cat] = 0

    for word in vocab:
      self.vocab[word] = {'total_frequency': 0}
      for cat in categories:
        self.vocab[word][cat] = 0

  # return most likely class for each document
  def predict(self, dataset):
    # foreach document:
    # foreach class, calculate p(class|tokens) = p(tokens|class)*p(class) / p(tokens)
    # since p(tokens) is effectively a constant scaling factor when comparing these, we can ignore it
    y_hat = []
    for index, row in dataset.iterrows():
      words = self.tokenize(row['New_Sentence'])
      category = row['Type']
      probs = {}
      for cat in categories:
        p = self.class_probs[cat]
        for w in words:
          p2key = w + '_' + cat
          p2 = self.word_probs[p2key] if p2key in self.word_probs else 0
          p *= p2
        probs[cat] = p
      
      # predicted class is the one with max probability
      y_hat.append(max(probs))
      
    return y_hat

  def count_frequencies(self, dataset):
    for index, row in dataset.iterrows():
      words = self.tokenize(row['New_Sentence'])
      category = row['Type']

      # increment overall category counts
      self.dict_increment(self.category_counts, category)

      # count overall word frequency, and frequency per class (label)
      for word in words:
        self.dict_increment(self.vocab[word], 'total_frequency')
        if category not in self.vocab[word]:
          self.vocab[word][category] = 0
        self.dict_increment(self.vocab[word], category)

  # calculate probs from a training dataset
  def train(self, dataset):
    self.count_frequencies(dataset)

    # Calculate	Conditional probability of all words based on category
    for cat in self.categories:
      self.class_probs[cat] = self.categorory_counts[cat] / len(dataset)

      for word in self.vocab:
        #	P(word|cat)  = # of documents in category containing word / num of all documents in that category
        word_cat_freq = self.vocab[word][cat] if cat in self.vocab[word] else 0
        self.word_probs[word + '_' + cat] = word_cat_freq / self.category_counts[cat]

  # helper function
  def dict_increment(self, mydict, mykey):
    if not mykey:
      return  # if key is False, ignore
    if mykey in mydict:
      mydict[mykey] += 1
    else:
      mydict[mykey] = 1

  #def _parse(corpus)
    #calculate probabilities




In [21]:
# Load and preview the data
input_train = pandas.read_csv('sentence-dataset/train_data.csv')
input_test = pandas.read_csv('sentence-dataset/test_data.csv')

# throw out the integer indexes before merging
input_train = input_train.drop(input_train.columns[[0]], axis=1)
input_test = input_test.drop(input_test.columns[[0]], axis=1)

all_data = pandas.concat([input_train, input_test])
# how many rows?
print(len(all_data), 'total rows loaded')


# throw out null sentences (see https://stackoverflow.com/a/56708633)
# todo: sigh why this not working..
#all_data = all_data[ all_data['New_Sentence'].str.strip().astype(bool) & all_data['New_Sentence'] != nan ]
#print(len(all_data), 'rows after removing empty')

all_data['New_Sentence'].replace('', np.nan, inplace=True)
all_data.dropna(subset=['New_Sentence'], inplace=True)
print(len(all_data), 'rows after removing empty 2')

#debug: what are values for type?
#print(all_data['Type'].unique())
categories = ['Responsibility', 'Requirement', 'Skill', 'SoftSkill', 'Education', 'Experience']

testdata = all_data[ all_data['Type'].isnull() ]
traindevdata = all_data[ all_data['Type'].notnull() ]

split = int(.5*len(traindevdata))
traindata = traindevdata[:split]
devdata = traindevdata[(split+1):]

print(len(traindata), 'training,', len(devdata), 'dev,', len(testdata), 'test rows loaded')

# preview the result
all_data.head(5)

75144 total rows loaded
73750 rows after removing empty 2
29501 training, 29500 dev, 14748 test rows loaded


Unnamed: 0,Sentence_id,New_Sentence,Type
0,GERRES15609,Author and/or Review architecture/design and o...,Responsibility
1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,GERREQ10457,Experience in working crosslly with a larger ...,Requirement
3,GERSKL27235,"Previous business experience, including but no...",Skill
4,HONSSK18415,Delivering fast and right the first time.,SoftSkill


In [31]:
vocab = set()
tokenizer = MyTokenizer()

for index, row in all_data.iterrows():
  tokens = tokenizer.tokenize(row['New_Sentence'])
  for tok in tokens:
    vocab.add(tok)

# preview
print('Found', len(vocab), 'words in vocabulary')
for id,val in enumerate(vocab):
  if id < 12:
    print(val + ", ")


Found 24646 words in vocabulary
20yr, 
critico, 
30-40, 
mold, 
bm, 
salesforce/salesforc, 


In [25]:
model = NaiveBayesClassifier(vocab, categories, tokenizer)

model.train(traindata)



In [26]:
# Calculate probability of occurrance of each word
vocab_probs = {}
for word in vocab:
  if (model.vocab[word]['total_frequency'] > 0):    # if freq == 0 that means it did not appear in the training data
    vocab_probs[word] = model.vocab[word]['total_frequency'] / len(traindata)

# preview: words with highest probability
print('Top 5')
for w in sorted(vocab_probs, key=vocab_probs.get, reverse=True)[:5]:
  print(w, vocab_probs[w])

# preview: words with lowest probability
print('Bottom 5')
for w in sorted(vocab_probs, key=vocab_probs.get)[:5]:
  print(w, vocab_probs[w])

Top 5
experi 0.2547032303989695
year 0.13914782549744076
manag 0.10874207653977831
work 0.09823395817090946
( 0.09047150944035795
Bottom 5
20yr 3.38971560286092e-05
hsten 3.38971560286092e-05
multius 3.38971560286092e-05
dtupc 3.38971560286092e-05
cobal 3.38971560286092e-05


In [29]:
# preview Conditional Probabilities (computed during training)

print('Top 5 Conditional probability')
for w in sorted(model.word_probs, key=model.word_probs.get, reverse=True)[:5]:
  print(w, model.word_probs[w])


Top 5 Conditional probability (Responsibility)
year_Experience 0.8757982823166703
experi_Experience 0.8465095793878
degre_Education 0.5115967885816235
engin_Education 0.34834968777876896
experi_Skill 0.32862606649014414


In [None]:
# try predicting labels for dev data
Y_hat = model.predict(devdata)

num_correct = 0
for index, row in devdata.iterrows():
  y_hat = Y_hat[index]
  y = row['Type']
  num_correct += 1 if y_hat == y else 0

accuracy = num_correct / len(devdata)
print('Accuracy for Dev data:', accuracy)



In [5]:
# get vocabulary statistics
vocab = {}
category_counts = {}
ps = PorterStemmer()

# helper function
def dict_increment(mydict, mykey):
  if not mykey:
    return  # if key is false, ignore
  if mykey in mydict:
    mydict[mykey] += 1
  else:
    mydict[mykey] = 1

for index, row in all_data.iterrows():
  if 'New_Sentence' in row:
    tokens = word_tokenize(row['New_Sentence'])

    stemwords = [ps.stem(tok) for tok in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_stemwords = [w for w in stemwords if not w.lower() in stop_words]

    # remove punctuation
    # ref: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    stemwords = list(filter(lambda token: token not in '.,', filtered_stemwords))

    # convert to a set so we only get unique words per row (no duplicates)
    stemwords = set(stemwords)

    # get category (if train/dev data)
    myclass = row['Type'] if row['Type'] else False

    # increment overall category counts
    dict_increment(category_counts, myclass)

    # count overall word frequency, and frequency per class (label)
    for word in stemwords:
      if word in vocab:
        dict_increment(vocab[word], 'total_frequency')
        dict_increment(vocab[word], myclass)
      else:
        vocab[word] = {'total_frequency': 1}
        for cat in categories:
          vocab[word][cat] = 0
        dict_increment(vocab[word], myclass)
  else:
    print('ERROR: no sentence in row', row)

# preview
print('Found', len(vocab.keys()), 'words in vocabulary')

Found 37781 words in vocabulary


In [7]:
# Calculate probability of occurrance of each word
vocab_probs = {}
for word in vocab:
  vocab_probs[word] = vocab[word]['total_frequency'] / len(all_data)

# preview: words with highest probability
print('Top 5')
for w in sorted(vocab_probs, key=vocab_probs.get, reverse=True)[:5]:
  print(w, vocab_probs[w])

# preview: words with lowest probability
print('Bottom 5')
for w in sorted(vocab_probs, key=vocab_probs.get)[:5]:
  print(w, vocab_probs[w])

# Calculate	Conditional probability of all words based on category
#	P[“the” | Skill]  = # of skill documents containing “the” / num of all skill documents
# (we can only do this for the dev and train datasets, since we do not have categories given for the test data)
conditional_probs = {}
for cat in categories:
  for word in vocab:
    #x = sum(vocab[word][cat] for word in vocab)
    conditional_probs[word + '_' + cat] = vocab[word][cat] / category_counts[cat]

# preview
print('Top 5 Conditional probability (Responsibility)')
for w in sorted(conditional_probs, key=conditional_probs.get, reverse=True)[:5]:
  print(w, conditional_probs[w])


Top 5
and 0.4694915254237288
of 0.2896
in 0.2870508474576271
to 0.2600406779661017
experience 0.19086101694915253
Bottom 5
crosslly 1.3559322033898305e-05
Nebraska 1.3559322033898305e-05
aeroespacial 1.3559322033898305e-05
automotriz 1.3559322033898305e-05
-Typhoon 1.3559322033898305e-05
Top 5 Conditional probability (Responsibility)
years_Experience 0.8038114122053316
experience_Experience 0.7933465521039876
and_Responsibility 0.677983876253523
of_Experience 0.6488213262833223
and_SoftSkill 0.54428632115548


In [8]:
# Calculate accuracy using dev dataset
def predict(document):
  # tokenize
  tokens = word_tokenize(document)

  # todo: remove stopwords
  stemwords = [ps.stem(tok) for tok in tokens]

  # remove punctuation
  # ref: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
  stemwords = list(filter(lambda token: token not in '.,', tokens))

  # convert to a set so we only get unique words per row (no duplicates)
  stemwords = set(stemwords)

  # foreach class, calculate p(class|tokens) = p(tokens|class)*p(class) / p(tokens)
  # since p(tokens) is effectively a constant scaling factor when comparing these, we can ignore it
  probs = {}
  for cat in categories:
    p = category_counts[cat] / len(traindevdata)
    for w in stemwords:
      p *= conditional_probs[w + '_' + cat]
    probs[cat] = p

  # return class with max probability
  return max(probs)


num_correct = 0
for index, row in devdata.iterrows():
  y_hat = predict(row['New_Sentence'])
  y = row['Type']
  num_correct += 1 if y_hat == y else 0

accuracy = num_correct / len(devdata)
print('Accuracy:', accuracy)


Accuracy: 0.15922033898305085


## References

CSV reader example code:\
https://realpython.com/python-csv/

https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

https://www.geeksforgeeks.org/python-stemming-words-with-nltk/?ref=lbp

https://stackoverflow.com/questions/29314033/drop-rows-containing-empty-cells-from-a-pandas-dataframe

https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value