<a href="https://colab.research.google.com/github/christopherdiamana/nlp/blob/main/Copy_of_catch_up1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Natural Language Processing Catch-up 1

## The dataset

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 8.2 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 61.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 73.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |█████████████████████

In [2]:
from datasets import load_dataset_builder

In [3]:
ds_builder = load_dataset_builder("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [4]:
# Inspect dataset description
ds_builder.info.description

'Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.'

In [5]:
# Inspect dataset features
ds_builder.info.features

{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
 'text': Value(dtype='string', id=None)}

In [6]:
from datasets import get_dataset_split_names

In [7]:
get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

In [8]:
from datasets import load_dataset

In [9]:
dataset = load_dataset("imdb")

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
imdb_train = load_dataset('imdb', split='train')
imdb_test = load_dataset('imdb', split='test')

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


In [12]:
from collections import Counter

In [13]:
Counter(imdb_train['label'])

Counter({0: 12500, 1: 12500})

In [14]:
Counter(imdb_test['label'])

Counter({0: 12500, 1: 12500})



---



## Naive Bayes classifier

In [15]:
import numpy as np

In [16]:
def compute_occurences(vocabulary, documents):
  
  # create a vocabulary dictionary where the elements are initialise to 0 
  occurences = dict.fromkeys(vocabulary, 0)

  for text in documents:
    for current_word in text.split():
      occurences[current_word] +=1

  return occurences

In [17]:
def compute_class_log_likelihood(log_likelihood, vocabulary, class_label, occurences):
  '''
  Computes the log likelihoods of words in the documents by class.

  Parameters
  ----------
  log_likelihood: the current log likelihood vocabulary dictionary 
  vocabulary: the vocabulary of the full corpus
  class_label: the label of the class for whom the likelihood will be calculate
  occurences: list of words occurence of a specific class

  Returns
  -------
  dictionary with words as keys and log likelihood as values
  '''

  denominator = sum([occurences[word] + 1 for word in vocabulary])

  for word in occurences:
    log_likelihood[word][class_label] = math.log((occurences[word] + 1) / denominator)

  return log_likelihood

In [18]:
import math
from operator import countOf


def train_naive_bayes(documents, classes):
  log_prior = {}
  vocabulary = set()
  log_likelihood = {}
  
  [vocabulary.update(document['text'].split()) for index, document in documents.iterrows()]
  num_documents = len(documents['text'])
  
  log_likelihood = { word : {} for word in vocabulary }

  for class_label in classes:
    num_documents_of_class = np.count_nonzero(np.array(documents['label']) == class_label)
    log_prior[class_label] = math.log(num_documents_of_class / num_documents)

    big_document = [document['text'] for index, document in documents.iterrows() if document['label'] == class_label]
    occurences = compute_occurences(vocabulary, big_document)

    log_likelihood = compute_class_log_likelihood(log_likelihood, vocabulary, class_label, occurences)

  return log_prior, log_likelihood, vocabulary

In [19]:
def test_naive_bayes(test_document, logprior, loglikelihood, classes, vocabulary):

  summation = []

  for class_label in classes:
    summation.append(logprior[class_label])
    
    for word in test_document.split():
      if word in vocabulary:
        summation[class_label] += loglikelihood[word][class_label] 
  
  return np.argmax(summation)

### Pretreatment

In [20]:
from string import punctuation
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [21]:
# For the Lemmatizer 
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [22]:
def pre_treatment(text):
  new_text = text.lower()
  new_text = new_text.translate(str.maketrans('', '', punctuation))
  
  #Other pretreatments
  new_text = re.sub(r'[^a-z]+', ' ', new_text)
  new_text = re.sub(r'\b\w\b', ' ', new_text)
  new_text = re.sub(r'\b\w\w\b', ' ', new_text)
  word_tokens = word_tokenize(new_text)
  lemmatizer = WordNetLemmatizer()

  return ' '.join([lemmatizer.lemmatize(w) for w in word_tokens])

In [23]:
imdb_train['text'][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [24]:
pre_treated_text = [pre_treatment(text) for text in imdb_train['text']]

In [25]:
pre_treated_text[0]

'rented curiousyellow from video store because all the controversy that surrounded when wa first released also heard that first wa seized custom ever tried enter this country therefore being fan film considered controversial really had see this for myselfbr the plot centered around young swedish drama student named lena who want learn everything she can about life particular she want focus her attention making some sort documentary what the average swede thought about certain political issue such the vietnam war and race issue the united state between asking politician and ordinary denizen stockholm about their opinion politics she ha sex with her drama teacher classmate and married menbr what kill about curiousyellow that year ago this wa considered pornographic really the sex and nudity scene are few and far between even then it not shot like some cheaply made porno while countryman mind find shocking reality sex and nudity are major staple swedish cinema even ingmar bergman arguably

### Naive Bayes classifier on the training set

#### Converte to a dataframe

In [26]:
import pandas as pd

In [27]:
data_imdb_train = {'text': pre_treated_text, 'label': imdb_train['label']}
df_imdb_train = pd.DataFrame(data = data_imdb_train)

In [28]:
df_imdb_train.head()

Unnamed: 0,text,label
0,rented curiousyellow from video store because ...,0
1,curious yellow risible and pretentious steamin...,0
2,only avoid making this type film the future th...,0
3,this film wa probably inspired godard masculin...,0
4,brotherafter hearing about this ridiculous fil...,0


#### Train the imdb

In [29]:
classes = [0, 1]

In [30]:
log_prior, log_likelihood, vocabulary = train_naive_bayes(df_imdb_train, classes)

In [31]:
log_prior, log_likelihood

({0: -0.6931471805599453, 1: -0.6931471805599453},
 {'couplebr': {0: -13.289877087192913, 1: -12.623470030030344},
  'wahlberg': {0: -11.903582726073022, 1: -14.009764391150235},
  'seidl': {0: -13.577559159644695, 1: -12.063854242094921},
  'definatey': {0: -13.983024267752858, 1: -14.70291157171018},
  'internationale': {0: -14.676171448312804, 1: -14.009764391150235},
  'geare': {0: -14.676171448312804, 1: -14.009764391150235},
  'ellissen': {0: -13.983024267752858, 1: -14.70291157171018},
  'gonzales': {0: -13.577559159644695, 1: -13.093473659276079},
  'sportsfan': {0: -14.676171448312804, 1: -14.009764391150235},
  'daughtersbr': {0: -14.676171448312804, 1: -13.093473659276079},
  'quota': {0: -12.73026129925749, 1: -13.093473659276079},
  'simonetta': {0: -13.289877087192913, 1: -14.70291157171018},
  'dalebr': {0: -14.676171448312804, 1: -14.009764391150235},
  'envahisseurs': {0: -13.983024267752858, 1: -14.70291157171018},
  'totoandy': {0: -14.676171448312804, 1: -14.0097643

### Accuracy on both training and test set

#### Converte test imdb to a dataframe

In [32]:
df_imdb_test = pd.DataFrame(data = imdb_test)

In [33]:
df_imdb_test.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


#### Pretreatment of test imdb

In [34]:
df_imdb_test['text'] = df_imdb_test['text'].apply(pre_treatment)

In [35]:
df_imdb_test.head()

Unnamed: 0,text,label
0,love scifi and willing put with lot scifi movi...,0
1,worth the entertainment value rental especiall...,0
2,it totally average film with few semialright a...,0
3,star rating saturday night friday night friday...,0
4,first off let say you havent enjoyed van damme...,0


#### Prediction

In [36]:
df_imdb_train['prediction'] = df_imdb_train.apply(lambda row: test_naive_bayes(row['text'], log_prior, log_likelihood, classes, vocabulary), axis=1)

In [37]:
df_imdb_test['prediction'] = df_imdb_test.apply(lambda row: test_naive_bayes(row['text'], log_prior, log_likelihood, classes, vocabulary), axis=1)

In [38]:
df_imdb_train.head()

Unnamed: 0,text,label,prediction
0,rented curiousyellow from video store because ...,0,0
1,curious yellow risible and pretentious steamin...,0,0
2,only avoid making this type film the future th...,0,0
3,this film wa probably inspired godard masculin...,0,0
4,brotherafter hearing about this ridiculous fil...,0,0


In [39]:
df_imdb_test.head()

Unnamed: 0,text,label,prediction
0,love scifi and willing put with lot scifi movi...,0,0
1,worth the entertainment value rental especiall...,0,0
2,it totally average film with few semialright a...,0,0
3,star rating saturday night friday night friday...,0,0
4,first off let say you havent enjoyed van damme...,0,0


#### Accuracy

In [40]:
def compute_accuracy(dataframe):
  correct_answers = 0

  for index, document in dataframe.iterrows():
    if document['label'] == document['prediction']:
      correct_answers += 1

  accuracy = correct_answers/len(dataframe)

  return accuracy

Test set accuracy

In [41]:
test_set_accuracy = compute_accuracy(df_imdb_test)
print(f"Test set accuracy: {test_set_accuracy:.2%}")

Test set accuracy: 81.67%


Train set accuracy

In [42]:
train_set_accuracy = compute_accuracy(df_imdb_train)
print(f"Train set accuracy: {train_set_accuracy:.2%}")

Train set accuracy: 91.14%


### Question 2.4: Why is accuracy a sufficient measure of evaluation here?

...


### What are the top 10 most important words (features) for each class?


In [46]:
type(log_likelihood["and"])

dict

In [76]:
def sort_log_likelihood(loglikelihood, class_label):
  
  loglikelihood_items = {}
  
  for word in loglikelihood.keys():
    loglikelihood_items[word] = loglikelihood[word][class_label]
  
  # tuples = zip(loglikelihood.col, coo_matrix.data)
  
  sorted_tuples = sorted(loglikelihood_items.items(), key=lambda item: item[1], reverse=True)
  
  return sorted_tuples
  # return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

# #generate tf-idf for the given document
# tf_idf_vector=tfidf_transformer.transform(cv.transform(["change number node recognition rate defined relative frequency"]))

# #sort the tf-idf vectors by descending order of scores
# sorted_items=sort_coo(tf_idf_vector.tocoo())

# sorted_items

#### Look at the words with the highest likelihood in each class.


In [82]:
Top_10_negative = sort_log_likelihood(log_likelihood, 0)[:10]

In [83]:
Top_10_negative

[('the', -2.677978196365483),
 ('and', -3.471675128939145),
 ('this', -4.068201205552985),
 ('that', -4.204278280380475),
 ('movie', -4.443128381768861),
 ('wa', -4.50357320891033),
 ('for', -4.691380444712641),
 ('but', -4.708020382450953),
 ('film', -4.708301636847407),
 ('with', -4.7366895355533)]

In [84]:
Top_10_positive = sort_log_likelihood(log_likelihood, 1)[:10]

In [85]:
Top_10_positive

[('the', -2.645785670000755),
 ('and', -3.308623257048428),
 ('this', -4.247120615961752),
 ('that', -4.266915184576918),
 ('film', -4.622701440780578),
 ('with', -4.653507144273896),
 ('for', -4.69164642700062),
 ('wa', -4.711642105790518),
 ('movie', -4.713108717722131),
 ('but', -4.778053993437859)]

#### Remove stopwords (see NLTK stopwords corpus) and check again.

In [86]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
stop_words = set(stopwords.words('english'))

In [88]:
log_likelihood_without_stop = log_likelihood.copy()

In [89]:
for word in stop_words:
    log_likelihood_without_stop.pop(word, None)

In [90]:
print("Length with stop word: ", len(log_likelihood))
print("Length without stop word: ", len(log_likelihood_without_stop))

Length with stop word:  108227
Length without stop word:  108106


In [91]:
Top_10_negative_without_stop = sort_log_likelihood(log_likelihood_without_stop, 0)[:10]

In [92]:
Top_10_negative_without_stop

[('movie', -4.443128381768861),
 ('wa', -4.50357320891033),
 ('film', -4.708301636847407),
 ('one', -5.206162750340039),
 ('like', -5.357514932853162),
 ('even', -5.739084412136281),
 ('ha', -5.744355211003636),
 ('good', -5.797813407684981),
 ('time', -5.804245197195175),
 ('bad', -5.8094487811187046)]

In [93]:
Top_10_positive_without_stop = sort_log_likelihood(log_likelihood_without_stop, 1)[:10]

In [94]:
Top_10_positive_without_stop

[('film', -4.622701440780578),
 ('wa', -4.711642105790518),
 ('movie', -4.713108717722131),
 ('one', -5.177103741274279),
 ('ha', -5.581839667492178),
 ('like', -5.601716646994794),
 ('time', -5.746560631219308),
 ('good', -5.788688751006879),
 ('story', -5.820797527501359),
 ('character', -5.867555600588574)]

### Take at least 2 wrongly classified example from the test set and try explaining why the model failed.