<a href="https://colab.research.google.com/github/chriswu99aaa/AI_Tutorial/blob/master/NLP_Distributional_Semantics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pre-processing needs to be done before constructing the model:
1. Tokenization
2. Lower casing
3. Stop words removal
4. Stemming
5. Lemmatization

The overall procedure for Task 1(1) is:
1. Load the training data
2. Pre-processing
  * Handle multi-word expressions with `gensim.models.phrases`
3. Build the TF-IDF representation, either by ourselves or with a library API (gensim's `TfidfModel` is used below)

Evaluate the solution by constructing a function which returns the cosine similarity.
Build the similarity function.

Further action: check similarity for multiple words.


In [None]:
# Install the third-party NLP libraries used by this notebook.
# NOTE(review): versions are not pinned; consider `%pip install -q <pkg>==<version>` for reproducible runs.
!pip install nltk
!pip install gensim



In [None]:
import nltk
from nltk import Text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import gensim
from gensim import corpora, models, similarities
import numpy as np

In [None]:
from google.colab import drive

# Mount Google Drive, because the training data is stored there.
drive.mount('/content/drive/')

# Location of the training CSV inside the mounted drive.
file_path = '/content/drive/MyDrive/NLP/Training-dataset.csv'


def file_loader(file_path):
    '''
    Read the CSV file at ``file_path`` and return it as a pandas DataFrame.
    '''
    frame = pd.read_csv(file_path)
    return frame

Mounted at /content/drive/


Better search results: Lemmatization helps in retrieving better search results since it reduces different forms of a word to a common base form, making it easier to match different forms of a word in the text.


Whether to use stemming or lemmatization or both needs to be tested.

In [None]:
def pre_processing(df):
    '''
    Pre-process the ``plot_synopsis`` column of ``df`` in place.

    Every synopsis is lower-cased, tokenized with NLTK's ``word_tokenize``,
    filtered down to alphabetic tokens that are not English stop words, and
    finally lemmatized with ``WordNetLemmatizer``. After the call the column
    holds a list of tokens per row instead of a string.

    Missing synopses (NaN) are treated as empty text, so they end up as an
    empty token list instead of crashing the tokenizer.

    Returns None; ``df`` is modified in place.
    '''
    # Resources needed by the tokenizer, the stop-word list and the lemmatizer.
    # Newer NLTK releases load the tokenizer data from 'punkt_tab';
    # 'punkt' is kept for older releases.
    for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
        nltk.download(resource, quiet=True)

    # Lower-case the text; missing values become '' so every row is a string.
    synopsis = df['plot_synopsis'].fillna('').astype(str).str.lower()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(text):
        # Tokenize, drop stop words / non-alphabetic tokens, then lemmatize.
        return [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token.isalpha() and token not in stop_words
        ]

    df['plot_synopsis'] = synopsis.apply(_clean)

In [None]:
def identify_phrase(df):
    '''
    Merge multi-word expressions in ``df['plot_synopsis']`` into single tokens.

    Uses the gensim ``Phrases`` class twice: a bigram model is fitted on the
    tokenized synopses, then a second model is fitted on the bigram-merged
    synopses so that it can join an already merged pair with a further token.

    The column is overwritten in place; nothing is returned.
    '''
    from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

    # NOTE(review): min_count=1 / threshold=1 are very permissive settings — confirm intended.
    bigram_model = Phrases(df['plot_synopsis'], min_count=1, threshold=1,connector_words=ENGLISH_CONNECTOR_WORDS)
    trigram_model = Phrases(bigram_model[df['plot_synopsis']], min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)

    # The second model was fitted on bigram-merged text, so each document has to
    # pass through the bigram model first; feeding it raw tokens would apply it
    # to input that does not match what it was trained on.
    df['plot_synopsis'] = df['plot_synopsis'].apply(lambda x: trigram_model[bigram_model[x]])

def build_dict(df, mode = 0):
    '''
    Create a gensim ``Dictionary`` from ``df`` and return it.

    mode == 1: the dictionary is built from ``df['plot_synopsis']`` and a new
    ``'bow'`` column with each row's ``doc2bow`` result is added to ``df``.
    Any other mode: the dictionary is built from the first column of ``df``
    and ``df`` is left unchanged.
    '''
    if mode != 1:
        # Default path: only the first column is used, no 'bow' column is written.
        return corpora.Dictionary(df.iloc[:,0])

    token_dictionary = corpora.Dictionary(df['plot_synopsis'])

    # Convert every entry of the plot_synopsis column into its BOW representation.
    df['bow'] = df['plot_synopsis'].apply(token_dictionary.doc2bow)
    return token_dictionary

def build_corpus(dictionary, df):
    '''
    Return a list with the ``dictionary.doc2bow`` result of every entry
    in ``df['plot_synopsis']``, in row order.
    '''
    return list(map(dictionary.doc2bow, df['plot_synopsis']))



In [None]:
def tf_idf(corpus):
    '''
    Fit a gensim TF-IDF model on ``corpus`` and return the fitted model.

    The model is not applied here; index it with a corpus
    (``tfidf_corpus = model[corpus]``) to get the TF-IDF representation.
    '''
    tfidf_model = models.TfidfModel(corpus)
    return tfidf_model

In [None]:
# Load the training data from the CSV at file_path into a DataFrame.
df = file_loader(file_path)

# Pre-process the 'plot_synopsis' column (the function modifies df in place).
pre_processing(df)

# Use Phrases to merge multi-word expressions (also modifies df in place).
identify_phrase(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# One entry per row of the processed plot_synopsis column.
document = df['plot_synopsis'].tolist()


In [None]:
# Build the gensim token <-> id dictionary from the processed documents.
dictionary = corpora.Dictionary(document)

In [None]:
# Show only the first few token -> id pairs; displaying the whole mapping
# floods the cell output with the entire vocabulary.
dict(list(dictionary.token2id.items())[:10])

In [None]:
# Bag-of-words corpus, built with the helper defined earlier in the notebook
# (df['plot_synopsis'] holds the same documents as `document`).
corpus = build_corpus(dictionary, df)


In [None]:
# Fit the TF-IDF model on the corpus via the helper defined earlier in the notebook.
model = tf_idf(corpus)

In [None]:
# Load the Task 1 validation dataset from Google Drive.
# NOTE(review): read_csv treats the first row as a header by default — confirm the file has one.
validation_df = pd.read_csv('/content/drive/MyDrive/NLP/Task-1-validation-dataset.csv')

In [None]:
# Select the columns at positions 1 and 2; indexing with a list keeps each
# selection a one-column DataFrame.
# NOTE(review): presumably the two words of each validation pair — confirm against the CSV.
words1 = validation_df.iloc[:, [1]]
words2 = validation_df.iloc[:, [2]]

In [None]:
# Convert to nested lists: every row becomes a one-element list,
# which is the token-list form that doc2bow is given below.
words1 = words1.values.tolist()
words2 = words2.values.tolist()

In [None]:
# Containers for the TF-IDF vector of every validation row (filled in the next cell).
words1_tfidf_list = []
words2_tfidf_list = []

In [None]:
# Convert every validation row into bag-of-words form and then into its TF-IDF vector.
# Both lists are rebuilt from scratch, so re-running this cell does not append
# a second copy of every vector to lists left over from a previous run.
words1_tfidf_list = [model[dictionary.doc2bow(tokens)] for tokens in words1]
words2_tfidf_list = [model[dictionary.doc2bow(tokens)] for tokens in words2]

In [None]:
# Give every empty TF-IDF vector (no known token in the row) a (0, 0)
# placeholder entry and report the row index it occurred at.
for i in range(len(words1)):
    for tfidf_vec in (words1_tfidf_list[i], words2_tfidf_list[i]):
        if not tfidf_vec:
            tfidf_vec.append((0,0))
            print('OOV index: ',i)

In [None]:
# Start from an empty list: `prediction` is not initialised anywhere else in the
# notebook, and re-running the cell must not append duplicate scores.
prediction = []

# NOTE(review): `cosine_similarity` is neither imported nor defined in the cells
# shown here; it has to be brought into scope before this cell can run on a
# fresh kernel. The `prediction[i][0][0]` indexing in the next cell suggests it
# returns a nested (2-D) result — confirm which implementation was intended.
for w1, w2 in zip(words1_tfidf_list, words2_tfidf_list):
    prediction.append(cosine_similarity(w1, w2))

In [None]:
# Keep the [0][0] element of every prediction.
result = [pred[0][0] for pred in prediction]

# Save the result to result.csv
results = pd.DataFrame(result)
path = "/content/drive/MyDrive/NLP/result.csv"
results.to_csv(path, index=False)

In [None]:
# Write the same `result` values a second time, to result1.csv.
results = pd.DataFrame(result)

path = "/content/drive/MyDrive/NLP/result1.csv"
results.to_csv(path,index=False)

In [None]:
# import gensim.downloader as api
# from gensim.models import TfidfModel
# from gensim.corpora import Dictionary

# dataset = api.load("text8")
# dct = Dictionary(dataset)  # fit dictionary
# corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format

# model = TfidfModel(corpus)  # fit model
# vector = model[corpus[0]]  # apply model to the first corpus document

## Task 1(2) Word2vec

Building on the pre-processing result, we are going to use Word2Vec from gensim.

In [None]:
# Reload the training data so this section starts from the raw CSV again.
df = file_loader(file_path)

# Pre-process the 'plot_synopsis' column (the function modifies df in place).
pre_processing(df)

# Use Phrases to merge multi-word expressions (also modifies df in place).
identify_phrase(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# One entry per row of the processed plot_synopsis column.
document = df['plot_synopsis'].tolist()

In [None]:
# Create and train the Word2Vec model in a single step.
# Passing `sentences` to the constructor already trains the model, so the
# former explicit train() call was a second training run on top of the first.
# epochs=15 keeps the total number of passes over the data (the constructor's
# default of 5 plus the 10 of the removed train() call); the resulting vectors
# will not be numerically identical to the two-stage run.
word_vec_model = models.Word2Vec(
    sentences=document,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=15,
)





(26998530, 26998530)

In [None]:
# Sanity check: look up one word vector and show its shape.
word_vec_model.wv['apple'].shape

(100,)

In [None]:
# Reload the validation dataset for the Word2Vec evaluation.
validation_df = pd.read_csv('/content/drive/MyDrive/NLP/Task-1-validation-dataset.csv')

# Columns at positions 1 and 2, each converted to a list of one-element rows.
words1 = validation_df.iloc[:, [1]].values.tolist()
words2 = validation_df.iloc[:, [2]].values.tolist()

In [None]:
# Inspect a single validation row (a one-element list).
words2[63]

['tableware']

In [None]:
# Scratch check: shape of a 100-element zero vector.
np.zeros((100,)).shape

(100,)

In [None]:
# Scratch check: shape of a 100-element random vector.
np.random.rand(100,).shape

(100,)

In [None]:
# The original cell ended in an `if` without a body, which is a syntax error,
# and tested the one-element row list instead of the word it contains.
# Completed here as a vocabulary check: collect the first-column words that
# the Word2Vec model does not know, then show their count and a short sample.
oov_words1 = [row[0] for row in words1 if row[0] not in word_vec_model.wv]
len(oov_words1), oov_words1[:10]


In [None]:
# Cosine similarity for every validation pair, using the trained word vectors.

# Seeded generator: the fallback below is random, so without a seed the saved
# results would change on every run.
rng = np.random.default_rng(42)
# Use the model's own dimensionality instead of repeating the literal 100.
vector_size = word_vec_model.wv.vector_size

sim = []
for row1, row2 in zip(words1, words2):
    # Every row is a one-element list; the word itself is the first item.
    first, second = row1[0], row2[0]
    if first in word_vec_model.wv and second in word_vec_model.wv:
        sim.append(word_vec_model.wv.similarity(first, second))
    else:
        # At least one word is out of vocabulary: fall back to the cosine of
        # two random vectors, as the original cell did.
        # NOTE(review): uniform [0, 1) vectors have only non-negative components,
        # so this fallback score is never negative — consider a fixed neutral
        # score for unknown words instead.
        w1 = rng.random(vector_size)
        w2 = rng.random(vector_size)
        sim.append(np.dot(w1, w2) / (np.linalg.norm(w1) * np.linalg.norm(w2)))

In [None]:
# Save the Word2Vec similarity scores to result2.csv (one row per entry of `sim`, no index column).
results2 = pd.DataFrame(sim)
path = "/content/drive/MyDrive/NLP/result2.csv"
results2.to_csv(path,index=False)

In [None]:
# Spot check: similarity of one hand-picked word pair.
word_vec_model.wv.similarity('absorb','study')

0.80618453