***
This notebook contains the exercise solutions for the Text Representation Section of the Natural Language Processing Course. 
<br>
<br>
If you have any questions, refer to the lecture **'Tutorial - How to complete the exercises'** in section 2 of the course.
<br>
<br>
**NOTE: Depending on your Python version and library versions, your code may be correct but still fail the asserts in the Validation cells - if your code matches the one in the solutions, don't worry and consider your exercise correct.**
***


# Exercise 1

In [10]:
# Exercise 1 solution: load the positive review, split it into sentences and
# build two bag-of-words tables (one-hot and raw counts), one row per sentence.
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Read positive_movie_review.txt from the exercise data folder
# into an object called positive_review.
with open('./exercise_data/positive_movie_review.txt', encoding='utf-8') as file:
    positive_review = file.read()

# Tokenize positive_review into sentences with sent_tokenize
# and keep the resulting list in review_token.
review_token = sent_tokenize(positive_review)

# One-hot version of the review, stored in vector_positive_review.
# binary=True caps every non-zero count at 1, so each cell only says whether
# the term is present in that sentence. Columns are named after the vocabulary.
# The vectorizer must be fitted before its feature names are requested, which
# is why `data` is listed (and therefore evaluated) before `columns`.
cv = CountVectorizer(binary=True)
vector_positive_review = pd.DataFrame(
    data=cv.fit_transform(review_token).todense(),
    columns=cv.get_feature_names_out(),
)

# Count version of review_token, stored in vector_positive_review_count,
# again with the vocabulary as column names.
# min_df=0.01 drops terms that appear in fewer than 1% of the sentences.
cv_count = CountVectorizer(binary=False, min_df=0.01)
vector_positive_review_count = pd.DataFrame(
    data=cv_count.fit_transform(review_token).todense(),
    columns=cv_count.get_feature_names_out(),
)

# Validation - Exercise 1

In [11]:
# Validation for Exercise 1: rebuilds the expected objects straight from the
# data file and compares them with the objects created in the solution cell.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from nltk.tokenize import sent_tokenize

# The reference vectorizers get their own names so that running this cell does
# not overwrite the cv / cv_count objects fitted in the solution cell.
cv_assert = CountVectorizer(binary=True)
cv_count_assert = CountVectorizer(binary=False, min_df=0.01)

with open('./exercise_data/positive_movie_review.txt', encoding='utf-8') as f:
    assert_1 = f.read()

# Fail early, with a targeted hint, if any required object was never created.
# Checking the notebook namespace directly avoids the chained
# "During handling of the above exception..." traceback of a re-raise.
for _required_name in (
    'positive_review',
    'review_token',
    'vector_positive_review',
    'vector_positive_review_count',
):
    if _required_name not in globals():
        raise NameError(f'Did you create the object {_required_name}?')

# Expected values, computed the same way the exercise asks for.
assert_2 = sent_tokenize(assert_1)
assert_3 = pd.DataFrame(
    cv_assert.fit_transform(assert_2).todense(),
    columns=cv_assert.get_feature_names_out()
)
assert_4 = pd.DataFrame(
    cv_count_assert.fit_transform(assert_2).todense(),
    columns=cv_count_assert.get_feature_names_out()
)

# Each assertion carries a message so a failure points at the offending object.
assert assert_1 == positive_review, \
    'positive_review does not match the contents of positive_movie_review.txt'
assert assert_2 == review_token, \
    'review_token does not match sent_tokenize(positive_review)'
assert assert_3.equals(vector_positive_review), \
    'vector_positive_review does not match the expected one-hot DataFrame'
assert assert_4.equals(vector_positive_review_count), \
    'vector_positive_review_count does not match the expected count DataFrame'

print('Your code is correct!')

NameError: Did you create the object positive_review?

# Exercise 2

In [1]:
# Exercise 2 solution: hand-compute the TF-IDF score of "killer" for the first
# sentence of the negative review, then vectorize every sentence with TF-IDF.
import math
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Read negative_movie_review.txt from the exercise data folder
# into an object called negative_review.
with open('./exercise_data/negative_movie_review.txt', encoding='utf-8') as file:
    negative_review = file.read()

# Tokenize negative_review into sentences with sent_tokenize
# and keep the resulting list in review_token_neg.
review_token_neg = sent_tokenize(negative_review)

# Score the TF-IDF for the word killer in the first sentence of the corpus.
# Steps:
# - term frequency: occurrences of the word in the first sentence
# - document frequency: number of sentences in which the word is present
# - score = tf * ln(N / df), with N the total number of sentences
#
# NOTE: both lookups below are case-sensitive substring matches, so a longer
# word containing "killer" (e.g. "killers") is counted as well.
count_killer = review_token_neg[0].count('killer')
count_killers = sum('killer' in sentence for sentence in review_token_neg)

# The result is stored in tf_idf_killer_score.
tf_idf_killer_score = count_killer * math.log(len(review_token_neg) / count_killers)

# Apply the TF-IDF vectorizer to review_token_neg and keep the result,
# in sparse matrix format, in sparse_tf.
tf_idf = TfidfVectorizer()
sparse_tf = tf_idf.fit_transform(review_token_neg)

In [9]:
# Validation for Exercise 2: rebuilds the expected objects straight from the
# data file and compares them with the objects created in the solution cell.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Imported here so this cell does not rely on the solution cell's import
# (the Exercise 1 validation cell imports it explicitly as well).
from nltk.tokenize import sent_tokenize

tftest = TfidfVectorizer()

with open('./exercise_data/negative_movie_review.txt', encoding='utf-8') as f:
    assert_1 = f.read()

# Fail early, with a targeted hint, if any required object was never created.
# Checking the notebook namespace directly avoids the chained
# "During handling of the above exception..." traceback of a re-raise.
for _required_name in (
    'negative_review',
    'review_token_neg',
    'tf_idf_killer_score',
    'sparse_tf',
):
    if _required_name not in globals():
        raise NameError(f'Did you create the object {_required_name}?')

# Expected values.
assert_2 = sent_tokenize(assert_1)
assert_3 = 1.792  # expected hand-computed score, rounded to 3 decimals
assert_4 = tftest.fit_transform(assert_2)

# Each assertion carries a message so a failure points at the offending object.
assert assert_1 == negative_review, \
    'negative_review does not match the contents of negative_movie_review.txt'
assert assert_2 == review_token_neg, \
    'review_token_neg does not match sent_tokenize(negative_review)'
assert np.round(tf_idf_killer_score, 3) == assert_3, \
    'tf_idf_killer_score does not round to the expected value 1.792'
assert np.allclose(assert_4.todense(), sparse_tf.todense()), \
    'sparse_tf does not match the expected TF-IDF matrix'

print('Your code is correct!')

Your code is correct!


# Exercise 3

In [12]:
# Import spacy and load the spaCy model en_core_web_md into
# your Python environment.

# Store the loaded model in an object named nlp.

import spacy
nlp = spacy.load("en_core_web_md")

# Define a function that takes two vectors and returns the cosine similarity
# between them. Use whichever method you prefer.

# Name the function compute_similarity.

from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(vector_1, vector_2):
    """Return the cosine similarity between two vectors.

    The two vectors are passed together as the rows of a single input, so
    ``cosine_similarity`` yields a 2x2 pairwise matrix; the off-diagonal
    entry (row 0, column 1) is the similarity of ``vector_1`` to ``vector_2``.
    """
    pairwise_scores = cosine_similarity([vector_1, vector_2])
    return pairwise_scores[0, 1]


# Exercise 3 (continued): compare the similarity of two sentences using spaCy
# document vectors versus a one-hot bag-of-words encoding.

# CountVectorizer is imported in this cell so that it does not depend on the
# Exercise 1 cell having been executed first.
from sklearn.feature_extraction.text import CountVectorizer

# The two sentences to compare, defined once and reused below.
sent1 = "The cat sat on the mat."
sent2 = "A feline is on the rug."

# Extract the embeddings (using spacy) for both sentences
# and save the vectors in embed_1 and embed_2.
embed_1 = nlp(sent1).vector
embed_2 = nlp(sent2).vector

# Use compute_similarity to calculate the similarity between
# embed_1 and embed_2 and save it in an object named sim_1.
sim_1 = compute_similarity(embed_1, embed_2)

# Calculate the similarity of both sentences again, this time
# using the one-hot vectorizer approach, and save it in sim_2.
cv_1 = CountVectorizer(binary=True)

# NOTE: despite its name, sparse_matrix is dense after .todense().
sparse_matrix = cv_1.fit_transform([sent1, sent2]).todense()

# Each row of the dense matrix is still two-dimensional, so .tolist()[0]
# is used to obtain a flat list for every sentence.
sim_2 = compute_similarity(sparse_matrix[0].tolist()[0], sparse_matrix[1].tolist()[0])


# Answer the following question: based on the similarities obtained with the
# one-hot vectorizer and with the document vectors (word2vec based), which
# method extracts better "meaning" from the sentences?

# The most appropriate method to capture the meaning of the sentences is the
# "word2vec" based one, as its similarity is significantly higher than the
# similarity calculated on top of a one-hot vectorizer.

# Validation - Exercise 3

In [61]:
# Validation for Exercise 3: checks that the required objects exist and that
# the similarities agree with reference values computed independently of the
# compute_similarity function under test.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Fail early, with a targeted hint, if a required function or object is missing.
# Checking the notebook namespace directly avoids the chained
# "During handling of the above exception..." traceback of a re-raise.
for _required_name, _hint in (
    ('compute_similarity', 'Did you define the function compute_similarity?'),
    ('embed_1', 'Did you create the object embed_1?'),
    ('embed_2', 'Did you create the object embed_2?'),
    ('sim_1', 'Did you create the object sim_1?'),
    ('sim_2', 'Did you create the object sim_2?'),
):
    if _required_name not in globals():
        raise NameError(_hint)


def _reference_cosine(vector_a, vector_b):
    """Cosine similarity computed with NumPy only.

    Used as the expected value so that the checks below do not depend on the
    compute_similarity implementation they are meant to validate.
    """
    a = np.asarray(vector_a, dtype=float).ravel()
    b = np.asarray(vector_b, dtype=float).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# Actual values for assertions
assert_sentences = ['The cat sat on the mat.', 'A feline is on the rug.']
cv_assert = CountVectorizer(binary=True)
sparse_matrix_assert = cv_assert.fit_transform(assert_sentences).todense()

assert_embed_1 = nlp('The cat sat on the mat.').vector
assert_embed_2 = nlp('A feline is on the rug.').vector
assert_sim_1 = _reference_cosine(assert_embed_1, assert_embed_2)
assert_sim_2 = _reference_cosine(sparse_matrix_assert[0], sparse_matrix_assert[1])

# Making sure the values match the expected ones.
# rtol is slightly looser than the NumPy default to tolerate implementations
# that compute in single precision.
assert np.array_equal(embed_1, assert_embed_1), \
    'embed_1 does not match the embedding of the first sentence'
assert np.array_equal(embed_2, assert_embed_2), \
    'embed_2 does not match the embedding of the second sentence'
assert np.allclose(
    compute_similarity(assert_embed_1, assert_embed_2), assert_sim_1, rtol=1e-4
), 'compute_similarity does not return the cosine similarity of its inputs'
assert np.allclose(sim_1, assert_sim_1, rtol=1e-4), \
    'sim_1 does not match the cosine similarity of embed_1 and embed_2'
assert np.allclose(sim_2, assert_sim_2, rtol=1e-4), \
    'sim_2 does not match the cosine similarity of the one-hot vectors'

print('Your code is correct!')


Your code is correct!
