Welcome to Deep Search Relevance Ranking Session1- Part1! 😀 
### PART 2: VECTOR SPACE MODEL :

Same as Part I: If you're opening this Notebook on colab, you will probably need to install 🤗 Transformers and 🤗 Datasets. Uncomment the following cell and run it.

In [None]:
! pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
print(transformers.__version__)

4.19.2


In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Loading the dataset

We will use the 🤗 Datasets library to download the data and run our algorithms.
 - This can be done with the functions load_dataset and load_metric.
 - We will use XGlue during  all of our sessions: https://huggingface.co/datasets/xglue/viewer/qadsm/

In [None]:
from datasets import load_dataset, load_metric

In [None]:
xglue = load_dataset('xglue', 'qadsm')

Reusing dataset x_glue (/root/.cache/huggingface/datasets/x_glue/qadsm/1.0.0/8566eedecd9ab28e01c051c023dadf97bf408e5195f76b06aba70ebd4697ae08)


  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
print(xglue)

DatasetDict({
    train: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 100000
    })
    validation.en: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
    validation.de: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
    validation.fr: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
    test.en: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
    test.de: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
    test.fr: Dataset({
        features: ['query', 'ad_title', 'ad_description', 'relevance_label'],
        num_rows: 10000
    })
})


In [None]:
xglue['train'].shape

(100000, 4)

In [None]:
xglue['train'][0:2]  #xglue['train']['ad_title][0:2] 

{'ad_description': ['Your New England Cruise Awaits! Holland America Line Official Site.',
  'Explore Your World with Four Extraordinary Offers.'],
 'ad_title': ['New England Cruises', 'Holland America Line®'],
 'query': ['cruise portland maine', 'transportation to cruise port miami'],
 'relevance_label': [1, 0]}

In [None]:
xglue['train']['ad_description'][0:2] 

['Your New England Cruise Awaits! Holland America Line Official Site.',
 'Explore Your World with Four Extraordinary Offers.']

In [None]:
len(xglue['train']['ad_description'])

100000

# Vector Space Model Implementation for Search Ranking

### Loading the DataSet , Create Corpus and fit_transform it into TIDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = xglue['train']['ad_description']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.shape)

(100000, 24169)




In [None]:
#loading basic packages
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
print(stopwords.words('english'))
stop_words = set(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Preprocessing Functions
### Tokenizing, removeing extra space, puncatuations, removing stop words and stemming

In [None]:
# this function returns a list of tokenized and stemmed words of any text
def rem_pun(doc_text):
    return re.sub(r'[^\w\s]', '', doc_text)
  
def rem_digits(doc_text):
    return re.sub(r'[0-9]+', '', doc_text)

def del_extra_spaces(doc_text):
    return  " ".join(doc_text.split())

def get_tokenized_list(doc_text):
    tokens = nltk.word_tokenize(doc_text)
    return tokens


def word_stemmer(token_list):
  ps = nltk.stem.PorterStemmer()
  stemmed = []
  for words in token_list:
    stemmed.append(ps.stem(words))
  return stemmed

In [None]:

def remove_stopwords(doc_text):
  cleaned_text = []
  for words in doc_text:
    if words not in stop_words:
      cleaned_text.append(words)
  return cleaned_text

In [None]:
#Check for single ad_description
print("ad_description example =",corpus[10])
repun_corpus_1 = rem_pun(corpus[10])
print("after removing puntucation ad_description=",repun_corpus_1)
repun_corpus_1 = rem_digits(repun_corpus_1)
pre_corpuse_1 =del_extra_spaces(repun_corpus_1)
tokens = get_tokenized_list(pre_corpuse_1)
print("WORD TOKENS:")
print(tokens)
doc_text = remove_stopwords(tokens)
print("\nAFTER REMOVING STOPWORDS:")
print(doc_text)
print("\nAFTER PERFORMING THE WORD STEMMING::")
doc_text = word_stemmer(doc_text)
doc_text

corpus[2]= Indulge in your Ideal Cruise. View Special Offers at the Offical Site!
repun_corpus_1= Indulge in your Ideal Cruise View Special Offers at the Offical Site
WORD TOKENS:
['Indulge', 'in', 'your', 'Ideal', 'Cruise', 'View', 'Special', 'Offers', 'at', 'the', 'Offical', 'Site']

AFTER REMOVING STOPWORDS:
['Indulge', 'Ideal', 'Cruise', 'View', 'Special', 'Offers', 'Offical', 'Site']

AFTER PERFORMING THE WORD STEMMING::


['indulg', 'ideal', 'cruis', 'view', 'special', 'offer', 'offic', 'site']

In [None]:
doc_ = ' '.join(doc_text)
doc_

'indulg ideal cruis view special offer offic site'

#Performing Preprocessing on our Corpus

In [None]:
cleaned_corpus = []
for doc in corpus:
  repun_corpus = rem_pun(doc)
  repun_corpus = rem_digits(repun_corpus)
  pre_corpuse =del_extra_spaces(repun_corpus)
  tokens = get_tokenized_list(pre_corpuse)
  doc_text = remove_stopwords(tokens)
  doc_text  = word_stemmer(doc_text)
  doc_text = ' '.join(doc_text)
  cleaned_corpus.append(doc_text)
cleaned_corpus

['your new england cruis await holland america line offici site',
 'explor your world four extraordinari offer',
 'cruis your own privat island In caribbean learn more now',
 'sign Up offer explor caribbean holland america line',
 'offici site sign Up special new england cruis offer today',
 'your canada cruis await holland america line offici site',
 'beauti jewelri At low price order ship free from zale',
 'learn holland america ship holland america line offici site',
 'offici websit sign Up today for special offer cruis',
 'sign Up for special offer take virtual tour watch video more',
 'indulg ideal cruis view special offer offic site',
 'offici site find your ideal cruis view special offer more',
 'fine jewelri At unbeat price plu free ship On order',
 'fine jewelri At unbeat price plu free ship On order',
 'fine ring At unbeat price plu free ship On order',
 'free ship qualifi order low price babi R Us',
 'shop latest velvet jacket',
 'ador shirt bib romper person free shop now s

In [None]:
xglue['train'][0:5]

{'ad_description': ['Your New England Cruise Awaits! Holland America Line Official Site.',
  'Explore Your World with Four Extraordinary Offers.',
  'Cruise to Your Own Private Island In the Caribbean. Learn More Now.',
  'Sign Up for Offers and Explore the Caribbean with Holland America Line',
  'Official Site - Sign Up for Special New England Cruise Offers Today.'],
 'ad_title': ['New England Cruises',
  'Holland America Line®',
  'Holland America Line®',
  'Caribbean Cruises',
  'Holland America Line®'],
 'query': ['cruise portland maine',
  'transportation to cruise port miami',
  'transportation to cruise port miami',
  'galveston cruise parking',
  'cruise portland maine'],
 'relevance_label': [1, 0, 1, 0, 1]}

In [None]:
vectorizerX = TfidfVectorizer(max_df=10, min_df=3)
vectorizerX.fit(cleaned_corpus)
doc_vector = vectorizerX.transform(cleaned_corpus)
print(vectorizerX.get_feature_names())

print(doc_vector.shape)

['aa', 'aacom', 'aacsb', 'aadvantag', 'aaea', 'aaf', 'aagsk', 'aamco', 'aand', 'aap', 'aarporg', 'aask', 'aba', 'abacoa', 'abap', 'abat', 'abbb', 'abbey', 'abbi', 'abcunderwear', 'abd', 'abdomen', 'abdul', 'abe', 'abebookscom', 'abercrombiecom', 'aberdeen', 'abfdc', 'abil', 'abilen', 'abington', 'abiword', 'abl', 'abloy', 'abnorm', 'aboard', 'abort', 'aboveground', 'abraham', 'abras', 'abroad', 'absenc', 'absente', 'absolutelli', 'absorb', 'abstract', 'abu', 'abuelita', 'abuja', 'abvi', 'acadami', 'acai', 'acapulco', 'acarexx', 'acb', 'accentu', 'acceptedget', 'accesori', 'accessor', 'accessoriescal', 'accesssori', 'accid', 'accidenttitlemilestheft', 'acclaim', 'acco', 'accom', 'accomod', 'accompani', 'accomplish', 'accord', 'accou', 'accountemp', 'accountingcrm', 'accountpro', 'accra', 'accreditedonlineexcel', 'accucheck', 'acculab', 'accumulair', 'accuraci', 'accusquar', 'accuweath', 'acdc', 'acer', 'acertifi', 'acess', 'acet', 'acetaminophen', 'aceticketcom', 'acetonid', 'acg', 'ach



In [None]:
import pandas as pd
df1 = pd.DataFrame(doc_vector.toarray(), columns=vectorizerX.get_feature_names())
df1.head(3)

#Preprocess the Query and transform it to vector

In [None]:
query = 'Explore Your World with Four Extraordinary Offers' #Explore Your World with Four Extraordinary Offers'
query = rem_pun(query)
query = rem_digits(query)
query =del_extra_spaces(query)
query = get_tokenized_list(query)
query = remove_stopwords(query)
q = []
for w in word_stemmer(query):
  print("w=",w)
  q.append(w)
q = ' '.join(q)
print("q=",q)
query_vector = vectorizerX.transform([q])
print("query_vector=",query_vector.shape)


w= explor
w= your
w= world
w= four
w= extraordinari
w= offer
q= explor your world four extraordinari offer
query_vector= (1, 4555)


In [None]:
query_vector

<1x2533 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

#Calculate Cosine Similarty
![alt text](https://i.stack.imgur.com/36r1U.png)

In [None]:

cosineSimilarities = cosine_similarity(doc_vector,query_vector).flatten()

In [None]:

cosineSimilarities
np.unique(cosineSimilarities)

array([0.        , 0.60540816, 0.73332467, 1.        ])

In [None]:
related_docs_indices = cosineSimilarities.argsort()[:-10:-1]
print(related_docs_indices)

for i in related_docs_indices:
    data = [cleaned_corpus[i]]
    print(data)

[    1 49320 14432 99999 33337 33328 33329 33330 33331]
['explor your world four extraordinari offer']
['extraordinari carpet clean svc for yr memphi metro area']
['from ordinari extraordinari watch full episod pawn star']
['sign get free No contract pinless']
['softwar surfac xbox pc more offici site plu free ship']
['get blind shade Up off free ship free sampl']
['get blind shade Up off free ship free sampl']
['get blind shade Up off free ship free sampl']
['highest rate most review onlin guarante fit free ship']
