### Test Google JSON API
Try to do some simple queries

In [26]:
import pprint
from googleapiclient.discovery import build

In [27]:
'''Use the JSON API to get query results'''

# see documentation of using build method to initiate a Google API:
# https://edstem.org/us/courses/18455/discussion/1085971
# First argument is the API name (we are using custom search API)
# 2nd argument is version number
# developerKey is our API key
service = build("customsearch", "v1",
          developerKey="****")

# initiate the query
# q is query and cx is our search engine ID (see our notes in shared Google Drive)
res = service.cse().list(
    q='Jaguar car',
    cx='91b445df8028cd711',
  ).execute()


In [130]:
'''Try printing out the query result (I think it's a JSON object)'''
pprint.pprint(res)

{'context': {'title': 'cs6111'},
 'items': [{'cacheId': 'veUNEJDLAzkJ',
            'displayLink': 'www.jaguarusa.com',
            'formattedUrl': 'https://www.jaguarusa.com/index.html',
            'htmlFormattedUrl': 'https://www.<b>jaguar</b>usa.com/index.html',
            'htmlSnippet': 'The official home of <b>Jaguar</b> USA. Explore '
                           'our luxury sedans, SUVs and sports cars. Build '
                           'Your Own, Book a Test Drive or Find a Retailer '
                           'near you.',
            'htmlTitle': '<b>Jaguar</b> USA: <b>Jaguar</b> Sedans, SUVs and '
                         'Sports Cars - Official Site',
            'kind': 'customsearch#result',
            'link': 'https://www.jaguarusa.com/index.html',
            'pagemap': {'cse_image': [{'src': 'https://www.jaguarusa.com/sdlmedia/637463192449631052GF.jpg?v=1'}],
                        'cse_thumbnail': [{'height': '217',
                                           'src':

In [131]:
'''
Returned query result is a dict
The info that we need lies in 'items'
So res['items'] is a list containing 10 search results, 
each result is a dictionary containing its own info
'''
print('Keys of raw returned query result: ', res.keys())

# Number of returned search results.
res_cnt = res['queries']['request'][0]['count']
print('Number of serch results: ', res_cnt)

Keys of raw returned query result:  dict_keys(['kind', 'url', 'queries', 'context', 'searchInformation', 'items'])
Number of serch results:  10


In [132]:
'''Getting useful info from the query result'''

# the first search result
res_0 = res['items'][0]
print(type(res_0))
print(res_0.keys())

# key info that we need
url = res_0['formattedUrl']
title = res_0['title']
snippet = res_0['snippet']

print()
print('url: ', url)
print('Title: ', title)
print('Snippet: ', snippet)

<class 'dict'>
dict_keys(['kind', 'title', 'htmlTitle', 'link', 'displayLink', 'snippet', 'htmlSnippet', 'cacheId', 'formattedUrl', 'htmlFormattedUrl', 'pagemap'])

url:  https://www.jaguarusa.com/index.html
Title:  Jaguar USA: Jaguar Sedans, SUVs and Sports Cars - Official Site
Snippet:  The official home of Jaguar USA. Explore our luxury sedans, SUVs and sports cars. Build Your Own, Book a Test Drive or Find a Retailer near you.


### Test tf-idf vectorizor

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()

print('Words: \n', words, '\n')
print('Shape of the corpus matrix: ', X.shape, '\n')
pprint.pprint(X)
print(X)

Words: 
 ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this'] 

Shape of the corpus matrix:  (4, 9) 

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>
  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


### A simple implementation of the Rocchio algorithm

In [135]:
query = ['Where to buy a jaguar suv?']

corpus = [
    'Jaguar the suv',   # relevant
    'It is a luxurious car manufacturer and brand', # relevant
    'Puma is jaguar\'s relative; both are carnivore animals',   # irrelevant
    'Find out more information about the luxurious sedan here', # relevant
    'luxurious car; very nice and comfortable', # relevant
    'A trust worthy car brand', # relevant
    'An aggresive animal' # irrelevant
]

relevant = [0, 1, 3, 4, 5]
irrelevant = [2,  6]

# TF-IDF
max_df = 0.95        # Ignore words with high df. (Similar effect to stopword filtering)
# min_df = 5           # Ignore words with low df.
smooth_idf = True    # Smooth idf weights by adding 1 to df.
sublinear_tf = True  # Replace tf with 1 + log(tf).

vectorizer = TfidfVectorizer(max_df=max_df,
                             smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
doc_tfidf = vectorizer.fit_transform(corpus).toarray()
query_vec = vectorizer.transform(query).toarray()

print(doc_tfidf.shape)
print(vectorizer.get_feature_names_out())
print('\n the tfidf score matrix: \n', doc_tfidf)


(7, 31)
['about' 'aggresive' 'an' 'and' 'animal' 'animals' 'are' 'both' 'brand'
 'car' 'carnivore' 'comfortable' 'find' 'here' 'information' 'is' 'it'
 'jaguar' 'luxurious' 'manufacturer' 'more' 'nice' 'out' 'puma' 'relative'
 'sedan' 'suv' 'the' 'trust' 'very' 'worthy']

 the tfidf score matrix: 
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.53828134
  0.         0.         0.         0.         0.         0.
  0.         0.         0.64846464 0.53828134 0.         0.
  0.        ]
 [0.         0.         0.         0.36850899 0.         0.
  0.         0.         0.36850899 0.31498927 0.         0.
  0.         0.         0.         0.36850899 0.44394081 0.
  0.31498927 0.44394081 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.36815

In [136]:
'''Update the query vector according to the Rocchio algorithm'''
import numpy as np

# Rocchio (Below is a param set called Ide Dec-Hi)
alpha = 1
beta = 0.75
gamma = 0.15

rel_vecs = np.zeros(len(doc_tfidf[0]))
for i in relevant:
    rel_vecs += doc_tfidf[i]
rel_vecs /= len(doc_tfidf[0]) 

irel_vecs = np.zeros(len(doc_tfidf[0]))
for i in irrelevant:
    irel_vecs += doc_tfidf[i]
irel_vecs /= len(doc_tfidf[0]) 

new_query_vec = alpha * query_vec + beta * rel_vecs - gamma * irel_vecs
    

In [137]:
print(query_vec)
print(new_query_vec)
sorted_idx = np.argsort(new_query_vec)
print(sorted_idx)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.63870855
  0.         0.         0.         0.         0.         0.
  0.         0.         0.76944876 0.         0.         0.
  0.        ]]
[[ 0.00845263 -0.00279363 -0.00279363  0.01818304 -0.00279363 -0.00178138
  -0.00178138 -0.00178138  0.02015535  0.02514968 -0.00178138  0.01116451
   0.00845263  0.00845263  0.00845263  0.00743684  0.0107405   0.65025278
   0.02153966  0.0107405   0.00845263  0.01116451  0.00845263 -0.00178138
  -0.00178138  0.00845263  0.78513742  0.02003935  0.01354053  0.01116451
   0.01354053]]
[[ 1  2  4  5  6  7 24 23 10 15 25 22 20 14  0 12 13 16 19 29 21 11 28 30
   3 27  8 18  9 17 26]]


In [138]:
idx = sorted_idx[0][-1::-1]
print(idx)

[26 17  9 18  8 27  3 30 28 11 21 29 19 16 13 12  0 14 20 22 25 15 10 23
 24  7  6  5  4  2  1]


In [139]:
words = vectorizer.get_feature_names_out()
for i in idx:
    print(words[i])

suv
jaguar
car
luxurious
brand
the
and
worthy
trust
comfortable
nice
very
manufacturer
it
here
find
about
information
more
out
sedan
is
carnivore
puma
relative
both
are
animals
animal
an
aggresive
