<a href="https://colab.research.google.com/github/evilsizord/mscs-data-mining-hw2/blob/main/Data_Mining_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2: Naive Bayes Classifier
Daniel Evilsizor \
November 13, 2022

In [1]:
# Get Data from Kaggle.com

# NOTE: Requires you to have a Kaggle.com account. From your account you can generate an API key.
# It will be provided in kaggle.json. Upload the JSON file to this project BEFORE RUNNING.

# src: https://www.kaggle.com/general/74235

! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download gaveshjain/ford-sentence-classifiaction-dataset   # yes there is a typo in the actual URL
! mkdir sentence-dataset
! unzip ford-sentence-classifiaction-dataset.zip -d sentence-dataset

Downloading ford-sentence-classifiaction-dataset.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 171MB/s]
Archive:  ford-sentence-classifiaction-dataset.zip
  inflating: sentence-dataset/sample_submission.csv  
  inflating: sentence-dataset/test_data.csv  
  inflating: sentence-dataset/train_data.csv  


In [2]:
# load all necessary libraries
import os
import pandas
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np

nltk.download('punkt')  # i guess this is needed for the stemmer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [37]:
# class it up

class MyTokenizer:
  def __init__(self):
    self.stemmer = PorterStemmer()
    self.stop_words = set(stopwords.words('english'))

  def tokenize(self, doc):
    tokens = word_tokenize(doc)
    # remove punctuation and stop words
    # ref: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
    # Also, convert to a set so we only get unique words per row (no duplicates)
    words = set()
    for tok in tokens:
      word = self.stemmer.stem(tok)
      if not word.lower() in self.stop_words:
        if not word in '.,()':
          words.add(word)
    return words


class NaiveBayesClassifier:

  def __init__(self, vocab, categories, tokenizer):
    self.class_probs = {}     # p(class)
    self.categories = categories    # [class1, class2, ..]
    self.word_probs = {}      # p(w|class)
    self.category_counts = {}
    self.vocab = {}
    self.tokenize = tokenizer.tokenize
    self.use_smoothing = False

    for cat in categories:
      self.class_probs[cat] = 1
      self.category_counts[cat] = 0

    for word in vocab:
      self.vocab[word] = {'total_frequency': 0}
      for cat in categories:
        self.vocab[word][cat] = 0

  # return most likely class for each document
  def predict(self, dataset):
    # foreach document:
    # foreach class, calculate p(class|tokens) = p(tokens|class)*p(class) / p(tokens)
    # since p(tokens) is effectively a constant scaling factor when comparing these, we can ignore it
    y_hat = []
    index = 0
    dataset_dict = dataset.copy().to_dict('records')
    for row in dataset_dict:
      words = self.tokenize(row['New_Sentence'])
      category = row['Type']
      probs = {}
      for cat in categories:
        p = self.class_probs[cat]
        for w in words:
          p2key = w + '_' + cat
          p2 = self.word_probs[p2key] if p2key in self.word_probs else 0
          p *= p2
        probs[cat] = p
      
      # predicted class is the one with max probability
      y_hat.append(max(probs, key=probs.get))
      #debug
      if index < 10:
        print('predict(): probs:', probs)

      index += 1
      
    return y_hat

  def count_frequencies(self, dataset):
    # use laplace smoothing to ensure there are no zero counts
    if self.use_smoothing:
      for word in self.vocab:
        for cat in self.categories:
          dataset.append({'Sentence_id': '__ADDITIVE__', 'New_Sentence': word, 'Type': cat})
          # this increases our dataset by ~(24k * 6) ..?

    for index, row in dataset.iterrows():
      words = self.tokenize(row['New_Sentence'])
      category = row['Type']

      # increment overall category counts
      self.dict_increment(self.category_counts, category)

      # count overall word frequency, and frequency per class (label)
      for word in words:
        self.dict_increment(self.vocab[word], 'total_frequency')
        if category not in self.vocab[word]:
          self.vocab[word][category] = 0
        self.dict_increment(self.vocab[word], category)

  # calculate probs from a training dataset
  def train(self, dataset):
    self.count_frequencies(dataset)

    # Calculate	Conditional probability of all words based on category
    for cat in self.categories:
      self.class_probs[cat] = self.category_counts[cat] / len(dataset)

      for word in self.vocab:
        #	P(word|cat)  = # of documents in category containing word / num of all documents in that category
        word_cat_freq = self.vocab[word][cat] if cat in self.vocab[word] else 0
        self.word_probs[word + '_' + cat] = word_cat_freq / self.category_counts[cat]

  # helper function
  def dict_increment(self, mydict, mykey):
    if not mykey:
      return  # if key is False, ignore
    if mykey in mydict:
      mydict[mykey] += 1
    else:
      mydict[mykey] = 1

  #def _parse(corpus)
    #calculate probabilities




In [17]:
# Load and preview the data
input_train = pandas.read_csv('sentence-dataset/train_data.csv')
input_test = pandas.read_csv('sentence-dataset/test_data.csv')

# throw out the integer indexes before merging
input_train = input_train.drop(input_train.columns[[0]], axis=1)
input_test = input_test.drop(input_test.columns[[0]], axis=1)

all_data = pandas.concat([input_train, input_test])
# how many rows?
print(len(all_data), 'total rows loaded')

# throw out null sentences (see https://stackoverflow.com/a/56708633)
all_data['New_Sentence'].replace('', np.nan, inplace=True)
all_data.dropna(subset=['New_Sentence'], inplace=True)
print(len(all_data), 'rows after removing empty sentences')

categories = ['Responsibility', 'Requirement', 'Skill', 'SoftSkill', 'Education', 'Experience']

testdata = all_data[ all_data['Type'].isnull() ]
traindevdata = all_data[ all_data['Type'].notnull() ]

split = int(.5*len(traindevdata))
traindata = traindevdata[:split]
devdata = traindevdata[(split+1):]

print(len(traindata), 'training,', len(devdata), 'dev,', len(testdata), 'test rows loaded')

# preview the result
all_data.head(5)

75144 total rows loaded
73750 rows after removing empty sentences
29501 training, 29500 dev, 14748 test rows loaded


Unnamed: 0,Sentence_id,New_Sentence,Type
0,GERRES15609,Author and/or Review architecture/design and o...,Responsibility
1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,GERREQ10457,Experience in working crosslly with a larger ...,Requirement
3,GERSKL27235,"Previous business experience, including but no...",Skill
4,HONSSK18415,Delivering fast and right the first time.,SoftSkill


In [18]:
vocab = set()
tokenizer = MyTokenizer()

all_data_dict = all_data.to_dict('records')
for row in all_data_dict:
  tokens = tokenizer.tokenize(row['New_Sentence'])
  for tok in tokens:
    vocab.add(tok)

# preview
print('Found', len(vocab), 'words in vocabulary')
for id,val in enumerate(vocab):
  if id < 25:
    print(val + ", ")


Found 24644 words in vocabulary
oracle/, 
ecsa, 
issue-track, 
rltat, 
reconfigur, 
strike, 
order-book, 
rig, 
contempl, 
resn, 
product/tool, 
subtl, 
gasp, 
projektmanag, 
hardware/circuit, 
multidimension, 
prolin, 
livrabl, 
diagnos, 
telnet, 
foster, 
site-level, 
d6, 
nft, 
vca, 


In [38]:
# Train the model
model = NaiveBayesClassifier(vocab, categories, tokenizer)
model.train(traindata)

In [None]:
#print ('Tokenizer test::')
#print(tokenizer.tokenize('Experienced person with funny hat'))

In [39]:
# Calculate probability of occurrance of each word
vocab_probs = {}
for word in vocab:
  if (model.vocab[word]['total_frequency'] > 0):    # if freq == 0 that means it did not appear in the training data
    vocab_probs[word] = model.vocab[word]['total_frequency'] / len(traindata)

# preview: words with highest probability
print('Top 8')
for w in sorted(vocab_probs, key=vocab_probs.get, reverse=True)[:8]:
  print(w, vocab_probs[w])

# preview: words with lowest probability
print('Bottom 8')
for w in sorted(vocab_probs, key=vocab_probs.get)[:8]:
  print(w, vocab_probs[w])

Top 8
experi 0.2547032303989695
year 0.13914782549744076
manag 0.10874207653977831
work 0.09823395817090946
develop 0.08779363411409782
skill 0.08606487915663875
abil 0.08416663841903664
team 0.07318395986576726
Bottom 8
oracle/ 3.38971560286092e-05
resn 3.38971560286092e-05
nft 3.38971560286092e-05
prize 3.38971560286092e-05
qad 3.38971560286092e-05
navigu 3.38971560286092e-05
maladi 3.38971560286092e-05
scope/cost 3.38971560286092e-05


In [33]:
# preview Conditional Probabilities (computed during training)

print('Top 5 Conditional probability')
for w in sorted(model.word_probs, key=model.word_probs.get, reverse=True)[:5]:
  print(w, model.word_probs[w])

print('Bottom 5 Conditional probability')
for w in sorted(model.word_probs, key=model.word_probs.get)[:5]:
  print(w, model.word_probs[w])

# Preview class probabilities
print('Class probabilities')
for cat in categories:
  print(cat, model.class_probs[cat])


Top 5 Conditional probability
year_Experience 0.8757982823166703
experi_Experience 0.8465095793878
degre_Education 0.5115967885816235
engin_Education 0.34834968777876896
experi_Skill 0.32862606649014414
Bottom 5 Conditional probability
oracle/_Responsibility 0.0
ecsa_Responsibility 0.0
issue-track_Responsibility 0.0
reconfigur_Responsibility 0.0
strike_Responsibility 0.0
Class probabilities
Responsibility 0.258567506186231
Requirement 0.23633097183146334
Skill 0.11521643334124267
SoftSkill 0.1599606792990068
Education 0.07599742381614183
Experience 0.15392698552591438


In [40]:
# try predicting labels for dev data
Y_hat = model.predict(devdata)

#debug
for i in range(10):
  # something is wrong with the predictions, they are all the same..
  print(Y_hat[i])
  print(Y_hat[-1*i])



predict(): probs: {'Responsibility': 1.1816508956975298e-07, 'Requirement': 1.4227763468921548e-08, 'Skill': 1.1177913805473844e-09, 'SoftSkill': 3.7255921909823126e-10, 'Education': 3.706876586859519e-09, 'Experience': 4.5185091051510246e-09}
predict(): probs: {'Responsibility': 0.0, 'Requirement': 4.616674566021973e-21, 'Skill': 0.0, 'SoftSkill': 0.0, 'Education': 5.394091927120312e-19, 'Experience': 8.347573644211712e-19}
predict(): probs: {'Responsibility': 5.058700202494621e-35, 'Requirement': 0.0, 'Skill': 0.0, 'SoftSkill': 0.0, 'Education': 0.0, 'Experience': 0.0}
predict(): probs: {'Responsibility': 1.363322710028491e-24, 'Requirement': 0.0, 'Skill': 0.0, 'SoftSkill': 0.0, 'Education': 0.0, 'Experience': 0.0}
predict(): probs: {'Responsibility': 0.0, 'Requirement': 0.0, 'Skill': 0.0, 'SoftSkill': 0.0, 'Education': 0.0, 'Experience': 0.0}
predict(): probs: {'Responsibility': 2.4885169606739843e-07, 'Requirement': 1.7065263577225802e-06, 'Skill': 7.230196563913406e-06, 'SoftSkill

In [41]:
print('Responsibility predictions:', len([y for y in Y_hat if y == 'Responsibility']))
print('SoftSkill predictions:', len([y for y in Y_hat if y == 'SoftSkill']))

Responsibility predictions: 14045
SoftSkill predictions: 3754


In [43]:
# Calculate accuracy
def calculate_accuracy(Y, Y_hat):
  num_correct = 0
  index=0
  if len(Y) != len(Y_hat):
    print('data invalid length', len(Y), ':', len(Y_hat))
    return False
  for row in Y:
    y_hat = Y_hat[index]
    y = row['Type']
    num_correct += 1 if y_hat == y else 0
    index += 1
  return num_correct / len(Y)

Y = devdata.to_dict('records')
accuracy = calculate_accuracy(Y, Y_hat)

print('Accuracy for Dev data:', accuracy)


Accuracy for Dev data: 0.580406779661017
Accuracy for Dev data: 0.580406779661017


In [46]:
# Compare the result with smoothing
model2 = NaiveBayesClassifier(vocab, categories, tokenizer)
model2.use_smoothing = True
model2.train(traindata)

Y_hat = model2.predict(devdata)
accuracy = calculate_accuracy(Y, Y_hat)

print('Accuracy with smoothing:', accuracy)


TypeError: ignored

## References

CSV reader example code:\
https://realpython.com/python-csv/

https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

https://www.geeksforgeeks.org/python-stemming-words-with-nltk/?ref=lbp

https://stackoverflow.com/questions/29314033/drop-rows-containing-empty-cells-from-a-pandas-dataframe

https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value

https://towardsdatascience.com/heres-the-most-efficient-way-to-iterate-through-your-pandas-dataframe-4dad88ac92ee