***
This notebook contains the exercise solutions for the Word Vectors Intuition Section of the Natural Language Processing Course. 
<br>
<br>
If you have any questions, refer to the lecture **'Tutorial - How to complete the exercises'** in section 2 of the course.
<br>
<br>
**NOTE: Depending on your Python version and library versions, your code may be correct but still fail the asserts in the Validation cells - if your code matches the one in the solutions, don't worry and consider your exercise correct.**
***


# Exercise 1

In [1]:
# This is the sentence we will use in this exercise
sentence = 'Some cities in France are Paris Toulouse and Marseille'

# Build the vocab of the sentence above
# in list format - sort the elements of the list.
# Call the returning object vocab
# Before building the vocab apply lowercase to every word
# in the sentence.

# Tokenize once: the same lowercase tokens feed both the vocab and the
# co-occurrence counts further down.
split_sentence = sentence.lower().split(' ')
vocab = sorted(set(split_sentence))

# Build the co-occurrence matrix for the sentence above for a
# neighbor size = 2 (two neighbors on each side)

# Store the co-occurrence matrix as a numpy
# object named co_ocurr

# This is a hard exercise - take your time to
# develop it!

# I've done a nested loop implementation but you
# can use any implementation you want as long
# as you reach the final correct co-occurrence matrix!

# Hint: Your main diagonal should have 0's
import numpy as np
co_ocurr = np.zeros([len(vocab), len(vocab)])

neighbors = 2

# Row/column position of every vocab word, so each (word, neighbor) pair
# is located with a lookup instead of a scan over the whole vocab.
word_to_index = {vocab_word: position for position, vocab_word in enumerate(vocab)}

for index, word in enumerate(split_sentence):
    # Clamp the window to the token list. The bound must be the number
    # of tokens, not the character length of the raw sentence.
    first_el = max(0, index - neighbors)
    last_el = min(len(split_sentence) - 1, index + neighbors)

    # Tokens on both sides of the current word, excluding the word itself.
    context = (
        split_sentence[first_el:index]
        +
        split_sentence[index + 1:last_el + 1]
    )

    for neighbor in context:
        co_ocurr[word_to_index[word], word_to_index[neighbor]] += 1
                        
# Compute the pairwise cosine similarity between the rows of the
# co_ocurr matrix (one row per vocab word).

# Store the result in a pandas DataFrame named cosine_df whose index
# and columns are both labelled with the words in the vocab.

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

cosine_df = pd.DataFrame(
    data=cosine_similarity(co_ocurr), index=vocab, columns=vocab
)

# Create a list with the sentences:
# - Some cities in Portugal are Evora Lisboa and Porto
# - Some cities in Spain are Madrid Barcelona and Valencia
# - Some cities in UK are London Liverpool and Manchester
# (written without punctuation so that splitting on spaces yields
# plain words)

# Call the object list_sentences

# Additionally, build a new vocab based on all the words (lower case)
# in the sentence list. Call the new object with the updated vocab
# new_vocab - don't forget to sort it!

list_sentences = [
    'Some cities in Portugal are Evora Lisboa and Porto',
    'Some cities in Spain are Madrid Barcelona and Valencia',
    'Some cities in UK are London Liverpool and Manchester'
]

# Tokenize every sentence once; the token lists feed both the vocab and
# the co-occurrence counts. A dedicated loop name is used so the
# Exercise 1 `sentence` variable is not overwritten.
tokenized_sentences = [text.lower().split(' ') for text in list_sentences]

new_vocab = sorted({token for tokens in tokenized_sentences for token in tokens})

# Create a new co-occurrence matrix (neighbor=2) based on the three sentences
# above

# call the object co_ocurr_multiple

co_ocurr_multiple = np.zeros([len(new_vocab), len(new_vocab)])

neighbors = 2

# Row/column position of every word in new_vocab, so each
# (word, neighbor) pair is located with a lookup instead of a vocab scan.
new_word_to_index = {vocab_word: position for position, vocab_word in enumerate(new_vocab)}

for tokens in tokenized_sentences:
    for index, word in enumerate(tokens):
        # Clamp the window to the current sentence's token list.
        first_el = max(0, index - neighbors)
        last_el = min(len(tokens) - 1, index + neighbors)

        # Tokens on both sides of the current word, excluding the word
        # itself; windows never cross sentence boundaries.
        context = tokens[first_el:index] + tokens[index + 1:last_el + 1]

        for neighbor in context:
            co_ocurr_multiple[new_word_to_index[word], new_word_to_index[neighbor]] += 1

                            
# Create the similarity matrix (in pandas format) based on the
# co_ocurr_multiple object
# Use the cosine similarity metric
# Call the object similarity_multiple

similarity_multiple = pd.DataFrame(
    cosine_similarity(co_ocurr_multiple),
    index=new_vocab,
    columns=new_vocab,
)

# Based on the similarity_multiple, what are the most
# similar words to "liverpool"? Write the answer in list format, in
# alphabetical order

# Remove "liverpool" itself by label rather than by a hard-coded list
# position, so the result stays right if the vocab or threshold changes.
liverpool_similarity = similarity_multiple['liverpool'].drop('liverpool')

# NOTE(review): the 0.49 cut-off appears chosen to sit just below 0.5,
# presumably to absorb floating-point rounding - confirm if the
# sentences change.
# The index follows new_vocab, which is sorted, so the selected labels
# come out in alphabetical order.
most_similar_words = list(
    liverpool_similarity[liverpool_similarity > 0.49].index
)

# Validation - Exercise 1

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Fail fast, with a targeted hint, if any object that the asserts below
# rely on has not been created in the notebook namespace.
required_objects = (
    'vocab',
    'co_ocurr',
    'cosine_df',
    'list_sentences',
    'new_vocab',
    'co_ocurr_multiple',
    'most_similar_words',
)

for required_object in required_objects:
    # A membership test on the namespace raises a single, clean
    # NameError instead of one chained onto the original lookup failure.
    if required_object not in globals():
        raise NameError('Did you create the object {}?'.format(required_object))

# Expected value of `vocab` (compared with == in the asserts below).
assert_1 = ['and',
         'are',
         'cities',
         'france',
         'in',
         'marseille',
         'paris',
         'some',
         'toulouse']

# Expected value of `co_ocurr`: a 9x9 matrix of counts. The asserts below
# label its rows and columns with assert_1 when rebuilding the expected
# cosine_df, so row/column k corresponds to assert_1[k].
assert_2 = [[0., 0., 0., 0., 0., 1., 1., 0., 1.],
       [0., 0., 0., 1., 1., 0., 1., 0., 1.],
       [0., 0., 0., 1., 1., 0., 0., 1., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 1., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 1., 1., 0., 0.]]

# Expected value of `list_sentences`.
assert_3 = ['Some cities in Portugal are Evora Lisboa and Porto',
 'Some cities in Spain are Madrid Barcelona and Valencia',
 'Some cities in UK are London Liverpool and Manchester']

# Expected value of `new_vocab`.
assert_4 = ['and',
 'are',
 'barcelona',
 'cities',
 'evora',
 'in',
 'lisboa',
 'liverpool',
 'london',
 'madrid',
 'manchester',
 'porto',
 'portugal',
 'some',
 'spain',
 'uk',
 'valencia']

# Expected value of `co_ocurr_multiple`: a 17x17 matrix of counts. Each
# matrix row is wrapped over two source lines (16 values, then 1).
# NOTE(review): rows/columns presumably follow the order of assert_4 -
# confirm against how co_ocurr_multiple is built.
assert_5 = [[0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
        1.],
       [0., 0., 1., 0., 1., 3., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
        0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 3., 0., 0., 0., 0., 0., 0., 1., 3., 1., 1.,
        0.],
       [1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0.],
       [0., 3., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 1., 1.,
        0.],
       [1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
        0.],
       [1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 3., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]]

# Expected value of `most_similar_words`.
# NOTE(review): the numbering skips assert_6 in this cell.
assert_7 = ['barcelona', 'evora', 'lisboa', 'london', 'madrid', 'uk']

# Exact comparisons: these objects hold strings or whole-number counts,
# so equality is stable across library versions.
assert vocab == assert_1, 'vocab does not match the expected value'
assert np.array_equal(assert_2, co_ocurr), 'co_ocurr does not match the expected value'

# cosine_df holds floats: compare the labels exactly and the values
# within a tolerance, since bit-for-bit float equality may differ
# between library versions even when the code is correct.
assert isinstance(cosine_df, pd.DataFrame), 'cosine_df is not a pandas DataFrame'
assert list(cosine_df.index) == assert_1, 'cosine_df index does not match the vocab'
assert list(cosine_df.columns) == assert_1, 'cosine_df columns do not match the vocab'
assert np.allclose(cosine_similarity(assert_2), cosine_df.values), (
    'cosine_df values do not match the expected similarities'
)

assert list_sentences == assert_3, 'list_sentences does not match the expected value'
assert new_vocab == assert_4, 'new_vocab does not match the expected value'
assert np.array_equal(assert_5, co_ocurr_multiple), (
    'co_ocurr_multiple does not match the expected value'
)
assert most_similar_words == assert_7, 'most_similar_words does not match the expected value'

print('Your code is correct!')

Your code is correct!
