<a href="https://colab.research.google.com/github/christopherdiamana/nlp/blob/main/catch_up1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Natural Language Processing Catch-up 1

## The dataset

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 4.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 61.3 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 65.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 59.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.4 MB/s 
Collecting responses<0.19
  Downloading respo

In [2]:
from datasets import load_dataset_builder

In [3]:
ds_builder = load_dataset_builder("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [4]:
# Inspect dataset description
ds_builder.info.description

'Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.'

In [5]:
# Inspect dataset features
ds_builder.info.features

{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
 'text': Value(dtype='string', id=None)}

In [6]:
from datasets import get_dataset_split_names

In [7]:
get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

In [8]:
from datasets import load_dataset

In [9]:
dataset = load_dataset("imdb")

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
imdb_train = load_dataset('imdb', split='train')
imdb_test = load_dataset('imdb', split='test')

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


In [12]:
from collections import Counter

In [13]:
Counter(imdb_train['label'])

Counter({0: 12500, 1: 12500})

In [14]:
Counter(imdb_test['label'])

Counter({0: 12500, 1: 12500})



---



## Naive Bayes classifier

In [15]:
import numpy as np

In [16]:
def compute_occurences(vocabulary, documents):
  
  # create a vocabulary dictionary where the elements are initialise to 0 
  occurences = dict.fromkeys(vocabulary, 0)

  for text in documents:
    for current_word in text.split():
      occurences[current_word] +=1

  return occurences

In [18]:
def compute_class_log_likelihood(log_likelihood, vocabulary, class_label, occurences):
  '''
  Computes the log likelihoods of words in the documents by class.

  Parameters
  ----------
  log_likelihood: the current log likelihood vocabulary dictionary 
  vocabulary: the vocabulary of the full corpus
  class_label: the label of the class for whom the likelihood will be calculate
  occurences: list of words occurence of a specific class

  Returns
  -------
  dictionary with words as keys and log likelihood as values
  '''

  denominator = sum([occurences[word] + 1 for word in vocabulary])

  for word in occurences:
    log_likelihood[word][class_label] = math.log((occurences[word] + 1) / denominator)

  return log_likelihood

In [32]:
import math
from operator import countOf


def train_naive_bayes(documents, classes):
  log_prior = {}
  vocabulary = set()
  log_likelihood = {}
  
  [vocabulary.update(document['text'].split()) for index, document in documents.iterrows()]
  num_documents = len(documents['text'])
  
  log_likelihood = { word : {} for word in vocabulary }

  for class_label in classes:
    num_documents_of_class = np.count_nonzero(np.array(documents['label']) == class_label)
    log_prior[class_label] = math.log(num_documents_of_class / num_documents)

    big_document = [document['text'] for index, document in documents.iterrows() if document['label'] == class_label]
    occurences = compute_occurences(vocabulary, big_document)

    log_likelihood = compute_class_log_likelihood(log_likelihood, vocabulary, class_label, occurences)

  return log_prior, log_likelihood, vocabulary

In [40]:
def test_naive_bayes(test_document, log_prior, log_likelihood, classes, vocabulary):
  summation = []

  for class_label in classes:
    summation[class_label] = log_prior[class_label]
    
    for word in test_document.split():
      if word in vocabulary:
        summation[class_label] += log_likelihood[word][class_label] 
  
  return np.argmax(summation)

### Pretreatment

In [21]:
from string import punctuation
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [22]:
# For the Lemmatizer 
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [23]:
def pre_treatment(text):
  new_text = text.lower()
  new_text = new_text.translate(str.maketrans('', '', punctuation))
  
  #Other pretreatments
  new_text = re.sub(r'[^a-z]+', ' ', new_text)
  new_text = re.sub(r'\b\w\b', ' ', new_text)
  new_text = re.sub(r'\b\w\w\b', ' ', new_text)
  word_tokens = word_tokenize(new_text)
  lemmatizer = WordNetLemmatizer()

  return ' '.join([lemmatizer.lemmatize(w) for w in word_tokens])

In [24]:
imdb_train['text'][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [25]:
pre_treated_text = [pre_treatment(text) for text in imdb_train['text']]

In [26]:
pre_treated_text[0]

'rented curiousyellow from video store because all the controversy that surrounded when wa first released also heard that first wa seized custom ever tried enter this country therefore being fan film considered controversial really had see this for myselfbr the plot centered around young swedish drama student named lena who want learn everything she can about life particular she want focus her attention making some sort documentary what the average swede thought about certain political issue such the vietnam war and race issue the united state between asking politician and ordinary denizen stockholm about their opinion politics she ha sex with her drama teacher classmate and married menbr what kill about curiousyellow that year ago this wa considered pornographic really the sex and nudity scene are few and far between even then it not shot like some cheaply made porno while countryman mind find shocking reality sex and nudity are major staple swedish cinema even ingmar bergman arguably

### Naive Bayes classifier on the training set

#### Converte to a dataframe

In [27]:
import pandas as pd


data_imdb_train = {'text': pre_treated_text, 'label': imdb_train['label']}
df_imdb_train = pd.DataFrame(data = data_imdb_train)

In [28]:
df_imdb_train.head()

Unnamed: 0,text,label
0,rented curiousyellow from video store because ...,0
1,curious yellow risible and pretentious steamin...,0
2,only avoid making this type film the future th...,0
3,this film wa probably inspired godard masculin...,0
4,brotherafter hearing about this ridiculous fil...,0


In [29]:
type(df_imdb_train['label'][0])

numpy.int64

In [30]:
classes = [0, 1]

In [None]:
log_prior, log_likelihood, vocabulary = train_naive_bayes(df_imdb_train, classes)

In [34]:
log_prior, log_likelihood

({0: -0.6931471805599453, 1: -0.6931471805599453},
 {'hardcore': {0: -10.413491571271488, 1: -10.918721937791918},
  'shantytown': {0: -13.983024267752858, 1: -12.30501629891181},
  'polarised': {0: -14.676171448312804, 1: -14.009764391150235},
  'transcended': {0: -13.066733535878702, 1: -13.604299283042069},
  'brotherswe': {0: -14.676171448312804, 1: -14.009764391150235},
  'positionbr': {0: -13.983024267752858, 1: -13.31661721059029},
  'frawleymadeleine': {0: -14.676171448312804, 1: -14.009764391150235},
  'cedrick': {0: -14.676171448312804, 1: -14.009764391150235},
  'guildernstern': {0: -14.676171448312804, 1: -14.009764391150235},
  'anywayand': {0: -13.983024267752858, 1: -14.70291157171018},
  'erase': {0: -12.278276175514433, 1: -12.400326478716133},
  'viewersthe': {0: -14.676171448312804, 1: -13.604299283042069},
  'utterly': {0: -8.975727874922118, 1: -9.64666576636187},
  'limbo': {0: -12.111222090851268, 1: -13.093473659276079},
  'turnedup': {0: -13.983024267752858, 1:

### Accuracy on both training and test set

#### Converte test imdb to a dataframe

#### Prediction

In [None]:
test_naive_bayes(test_document, log_prior, log_likelihood, classes, vocabulary)

In [None]:
def bayes_predict_log(document, priors, likelihoods):
    '''
    Predicts the label for a document given the trained
    priors and likelihoods.
    Parameters
    ----------
    document: the document to analyse
    priors: the trained priors
    likelihoods: the trained likelihoods
    Return
    ------
    A tuple (best_class, probabilities), where the first element
    is the name of the best class, and the second element is the dictionary of
    the computed probabilities.
    '''
    classes_probabilities = {}
    # unpack the dictionary and iterate 
    # through the priors
    for label, prior in priors.items():   
      # prob_class = np.log(prior)
      # initialise the probability of a class to the log of its its prior
      for word in document:
        if word in likelihoods[label]:
          # sum the prior with the log-likelihood of each word
          prob_class = log_
          prob_class + np.log(likelihoods[label][word])

      classes_probabilities[label] = prob_class
    # get the names of the classes
    class_names = list(priors.keys())
    
    # -------------------------#
    #      E X E R C I S E     #
    # -------------------------#
    # complete the next line by selecting the name of the best class.
    # hint: the probabilities are saved in the dictionaries, and
    # you can access them by using classes_probabilities.values(), which
    # can be converted to a list using list(x)

    best_class = class_names[
                             np.argmax(
                                 list(classes_probabilities.values())
                                 )
                             ]

    # ~      end exercise    ~ #
    return best_class, classes_probabilities    

In [None]:
correct_answers = 0
# iterate over the documents in the training set;
# unpack each (document, label) tuple
for doc, true_label in test_docs_thumbsup:
  # predict the label for the document
  predicted_label, _ = bayes_predict_log(doc, log_prior, log_likelihood)
  
  # if the predicted label is the same as the true label, 
  # update the counter
  if true_label == predicted_label:
    correct_answers += 1

# calculate the accuracy score
accuracy = correct_answers/len(test_docs_thumbsup)

# print the accuracy
print(accuracy)

In [None]:
log_prior, sorted(log_likelihood[0], key=log_likelihood.get)log_likelihood, vocabulary

SyntaxError: ignored

In [None]:
from IPython.core.display import HTML

# Visualizing data 
HTML(pre_treated_text[0].to_html())

TypeError: ignored

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

#Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

stop_words = STOPWORDS.union(set(new_words))

In [None]:
{idx:label for idx, label in enumerate(labels)}

In [None]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}