## Importing libraries

In [1]:
import numpy as np
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import natsort as nt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

# PART 1

### a) Read 10 files

In [2]:
path = 'C:/Users/Eyad/Desktop/AI course,Data Science/projects/IR CS/documents/'
os.chdir(path)

In [3]:
files = nt.natsorted(os.listdir())
files

['1.txt',
 '2.txt',
 '3.txt',
 '4.txt',
 '5.txt',
 '6.txt',
 '7.txt',
 '8.txt',
 '9.txt',
 '10.txt']

In [4]:
dicts = {}
keys = range(1,len(files)+1)
values = []
for file in files:
    with open(file , 'r', encoding='utf-8') as f:
        values.append(f.read())
for i in keys:
        dicts[i] = values[i-1]
print(dicts)

{1: 'antony brutus caeser cleopatra mercy worser', 2: 'antony brutus caeser calpurnia ', 3: 'mercy worser', 4: 'brutus caeser mercy worser', 5: 'caeser mercy worser', 6: 'antony caeser mercy ', 7: 'angels fools fear in rush to tread where', 8: 'angels fools fear in rush to tread where', 9: 'angels fools in rush to tread where', 10: 'fools fear in rush to tread where'}


### b ) Tokenization

In [5]:
def tokenize_query(query):
    query = word_tokenize(query)
    return query

In [6]:
for i in dicts:
    dicts[i] = word_tokenize(dicts[i])
    print(dicts[i])

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']
['antony', 'brutus', 'caeser', 'calpurnia']
['mercy', 'worser']
['brutus', 'caeser', 'mercy', 'worser']
['caeser', 'mercy', 'worser']
['antony', 'caeser', 'mercy']
['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where']
['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']


In [7]:
#Test
q = 'Hello im eyad'
q = tokenize_query(q)
q

['Hello', 'im', 'eyad']

## c) Apply stop words (except: in , to,where)

In [8]:
stopwords_edited = stopwords.words('english')
stopwords_edited.remove('in')
stopwords_edited.remove('to')
stopwords_edited.remove('where')

In [9]:
def remove_stop_words(query):
    query = tokenize_query(query)
    query = [word for word in query if not word in stopwords_edited]
    return ' '.join(query)

In [10]:
for i in dicts:
    dicts[i] = [w for w in dicts[i] if not w.lower() in stopwords_edited]
    print(dicts[i])

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']
['antony', 'brutus', 'caeser', 'calpurnia']
['mercy', 'worser']
['brutus', 'caeser', 'mercy', 'worser']
['caeser', 'mercy', 'worser']
['antony', 'caeser', 'mercy']
['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']
['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where']
['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']


In [11]:
dicts

{1: ['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser'],
 2: ['antony', 'brutus', 'caeser', 'calpurnia'],
 3: ['mercy', 'worser'],
 4: ['brutus', 'caeser', 'mercy', 'worser'],
 5: ['caeser', 'mercy', 'worser'],
 6: ['antony', 'caeser', 'mercy'],
 7: ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 8: ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 9: ['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where'],
 10: ['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']}

In [12]:
all_words = []
for doc in dicts:
    for term in dicts[doc]:
        all_words.append(term)
all_words = sorted(all_words)

In [13]:
#test
q = 'Where are you at the to in from from in egypt'
q = remove_stop_words(q)
q

'Where to in in egypt'

# PART 2

### a) Building positional index

In [14]:
doc_no = 1
pos_idx = {}

In [15]:
for doc in dicts:
    for pos,term in enumerate(dicts[doc]):
        if term in pos_idx:
            pos_idx[term][0] += 1
            if doc_no in pos_idx[term][1]:
                pos_idx[term][1][doc_no].append(pos)
            else:
                pos_idx[term][1][doc_no] = [pos]
        else:
    
            pos_idx[term]=[]

            pos_idx[term].append(1)

            pos_idx[term].append({})

            pos_idx[term][1][doc_no] = [pos]
        
    doc_no +=1

print(pos_idx)

{'antony': [3, {1: [0], 2: [0], 6: [0]}], 'brutus': [3, {1: [1], 2: [1], 4: [0]}], 'caeser': [5, {1: [2], 2: [2], 4: [1], 5: [0], 6: [1]}], 'cleopatra': [1, {1: [3]}], 'mercy': [5, {1: [4], 3: [0], 4: [2], 5: [1], 6: [2]}], 'worser': [4, {1: [5], 3: [1], 4: [3], 5: [2]}], 'calpurnia': [1, {2: [3]}], 'angels': [3, {7: [0], 8: [0], 9: [0]}], 'fools': [4, {7: [1], 8: [1], 9: [1], 10: [0]}], 'fear': [3, {7: [2], 8: [2], 10: [1]}], 'in': [4, {7: [3], 8: [3], 9: [2], 10: [2]}], 'rush': [4, {7: [4], 8: [4], 9: [3], 10: [3]}], 'to': [4, {7: [5], 8: [5], 9: [4], 10: [4]}], 'tread': [4, {7: [6], 8: [6], 9: [5], 10: [5]}], 'where': [4, {7: [7], 8: [7], 9: [6], 10: [6]}]}


In [16]:
sum_freq = sum([pos_idx[term][0] for term in set(all_words)])
sum_freq

52

### b) Allow users to write queries

In [17]:
def return_matched_docs_ix(q):
    pos_idx_list = [[] for i in range(len(pos_idx))]
    for w in q.split():
        try:
            for k in pos_idx[w][1].keys():

                if pos_idx_list[k-1] != []:

                    if pos_idx_list[k-1][-1] == pos_idx[w][1][k][0]-1:
                        pos_idx_list[k-1].append(pos_idx[w][1][k][0])

                else:
                        pos_idx_list[k-1].append(pos_idx[w][1][k][0])

            
            for ix , lists in enumerate(pos_idx_list):
                if len(q.split()) == len(lists):
                    print("Matched in doc number:" , ix+1)
        except KeyError:
            print("No matched document -> invalid input")

In [18]:
# test
q = "brutus and caeser"
q = remove_stop_words(q)
return_matched_docs_ix(q)

Matched in doc number: 1
Matched in doc number: 2
Matched in doc number: 4


# Part 3

### a) Term frequency

In [19]:
def get_term_freq(doc):
    words_found = dict.fromkeys(all_words , 0)
    for word in doc:
        words_found[word] += 1
    return words_found   

In [20]:
get_term_freq(dicts[1]).values()

dict_values([0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1])

In [21]:
tf = pd.DataFrame()
for i in range(1 , len(dicts)+1):
    tf[i] =  pd.DataFrame(
                            get_term_freq(dicts[i]).values() ,
                            index = get_term_freq(dicts[i]).keys()
                         )
    
tf.columns = [f"DOC_{i}" for i in range(1,11)]
tf.style.background_gradient(cmap = "Blues")

Unnamed: 0,DOC_1,DOC_2,DOC_3,DOC_4,DOC_5,DOC_6,DOC_7,DOC_8,DOC_9,DOC_10
angels,0,0,0,0,0,0,1,1,1,0
antony,1,1,0,0,0,1,0,0,0,0
brutus,1,1,0,1,0,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
calpurnia,0,1,0,0,0,0,0,0,0,0
cleopatra,1,0,0,0,0,0,0,0,0,0
fear,0,0,0,0,0,0,1,1,0,1
fools,0,0,0,0,0,0,1,1,1,1
in,0,0,0,0,0,0,1,1,1,1
mercy,1,0,1,1,1,1,0,0,0,0


##### Weighted term frequency

In [22]:
def get_weighted_term_freq(x):
    try:
        return math.log(x)+1
    except ValueError:
        return 0

In [23]:
wtf = tf.copy()

In [24]:
for i in range(1,len(dicts)+1):
    wtf[f"DOC_{i}"] = wtf[f"DOC_{i}"].apply(get_weighted_term_freq)
wtf.astype(int).style.background_gradient(cmap = "Blues")

Unnamed: 0,DOC_1,DOC_2,DOC_3,DOC_4,DOC_5,DOC_6,DOC_7,DOC_8,DOC_9,DOC_10
angels,0,0,0,0,0,0,1,1,1,0
antony,1,1,0,0,0,1,0,0,0,0
brutus,1,1,0,1,0,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
calpurnia,0,1,0,0,0,0,0,0,0,0
cleopatra,1,0,0,0,0,0,0,0,0,0
fear,0,0,0,0,0,0,1,1,0,1
fools,0,0,0,0,0,0,1,1,1,1
in,0,0,0,0,0,0,1,1,1,1
mercy,1,0,1,1,1,1,0,0,0,0


### b) Inverse document frequency

In [25]:
get_term_freq(dicts[1]).keys() 

dict_keys(['angels', 'antony', 'brutus', 'caeser', 'calpurnia', 'cleopatra', 'fear', 'fools', 'in', 'mercy', 'rush', 'to', 'tread', 'where', 'worser'])

In [26]:
idf_df = pd.DataFrame(index = get_term_freq(dicts[1]).keys() , columns=["df" , "idf"])

In [27]:
for i in idf_df.index:
    idf_df['df'][i] = pos_idx[i][0]
    idf_df['idf'][i] = np.log10( 10 / float(pos_idx[i][0]) )

In [28]:
idf_df["df"] = idf_df["df"].astype(int)
idf_df["idf"] = idf_df["idf"].astype(float)

In [57]:
idf_df.style.background_gradient(cmap = "Blues" , axis= 0)

Unnamed: 0,df,idf
angels,3,0.522879
antony,3,0.522879
brutus,3,0.522879
caeser,5,0.30103
calpurnia,1,1.0
cleopatra,1,1.0
fear,3,0.522879
fools,4,0.39794
in,4,0.39794
mercy,5,0.30103


### c) Term frequency - Inverse document frequency

In [30]:
tf_idf = tf.multiply(idf_df["idf"] , axis = 0)

In [55]:
tf_idf.style.background_gradient(cmap = "Blues" , axis= 0)

Unnamed: 0,DOC_1,DOC_2,DOC_3,DOC_4,DOC_5,DOC_6,DOC_7,DOC_8,DOC_9,DOC_10
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
antony,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0
brutus,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
in,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
mercy,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0


### Document length & Normalized tf_idf

##### Document length

In [32]:
doc_length = pd.DataFrame()

In [33]:
def get_doc_length(col):
    return np.sqrt(tf_idf[col].apply(lambda x: x**2).sum())

for col in tf_idf.columns:
    doc_length.loc[ 0 , col + '_length'] = get_doc_length(col)

In [34]:
doc_length = doc_length.T

In [35]:
doc_length.columns = [''] * len(doc_length.columns)

In [36]:
doc_length.style.background_gradient(cmap = 'Blues')

Unnamed: 0,Unnamed: 1
DOC_1_length,1.373462
DOC_2_length,1.279618
DOC_3_length,0.498974
DOC_4_length,0.782941
DOC_5_length,0.582747
DOC_6_length,0.67427
DOC_7_length,1.223496
DOC_8_length,1.223496
DOC_9_length,1.106137
DOC_10_length,1.106137


##### Normalized term freq inverse doc freq

In [37]:
normalized_tfidf = pd.DataFrame()

In [38]:
def get_normalized_tf_idf(col, x):
    try:
        return x / doc_length.loc[col + '_length'].values[0]
    except ZeroDivisionError:
        return 0

In [39]:
for col in tf_idf.columns:
    normalized_tfidf[col] = tf_idf[col].apply(lambda x: get_normalized_tf_idf(col , x))

In [54]:
normalized_tfidf.style.background_gradient(cmap = 'Blues' , axis= 0)

Unnamed: 0,DOC_1,DOC_2,DOC_3,DOC_4,DOC_5,DOC_6,DOC_7,DOC_8,DOC_9,DOC_10
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
antony,0.380701,0.408621,0.0,0.0,0.0,0.775474,0.0,0.0,0.0,0.0
brutus,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.0,0.472707
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
in,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
mercy,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0


### d) Cosine similarity & Ranking documents

In [41]:
docs_cos = list(dicts.values()) 

In [42]:
for i in range(len(docs_cos)):
    docs_cos[i] = ' '.join(docs_cos[i])

In [43]:
vectorizer = TfidfVectorizer()

In [44]:
df = vectorizer.fit_transform(docs_cos).T.toarray()

In [45]:
df = pd.DataFrame(df , index= vectorizer.get_feature_names())

In [53]:
df.style.background_gradient(cmap = 'Blues' , axis = 0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.385109,0.385109,0.417294,0.0
antony,0.412627,0.474292,0.0,0.0,0.0,0.662993,0.0,0.0,0.0,0.0
brutus,0.412627,0.474292,0.0,0.571154,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.329457,0.378692,0.0,0.45603,0.555563,0.529358,0.0,0.0,0.0,0.0
calpurnia,0.0,0.637721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,0.554808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.385109,0.385109,0.0,0.417294
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.342389,0.342389,0.371004,0.371004
in,0.0,0.0,0.0,0.0,0.0,0.0,0.342389,0.342389,0.371004,0.371004
mercy,0.329457,0.0,0.668165,0.45603,0.555563,0.529358,0.0,0.0,0.0,0.0


In [47]:
def get_relevant_docs(q, df):
    
    print("query is :", q)
    q = remove_stop_words(q)
    print("query read as :", q)
    print("\n------------------\n"*3)
    
    
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0])
    sim = {}
    for i in range(10):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    
    
    print("Most relevant documents:\n")
    returned_docs = []
    print("Cosine similarity: (Query , Document number) ->(Score) : \n")
    for k, v in sim_sorted:
        if v != 0.0:
            returned_docs.append(k+1)
            print(f"({' '.join(q)} , {k+1}) -> {v}")
            print(f"Document: {' '.join(dicts[k+1])}" , end='\n\n')
    print("Returned documents: " , returned_docs)
    
    
    arr_q = []
    for i in ' '.join(q).split():
        try:
            tf_q = sum(np.array(tf.loc[i]))
            wtf_q = get_weighted_term_freq(tf_q)
            idf_q = np.log10( 10 / float(pos_idx[i][0]) )
            tf_idf_q = tf_q * idf_q
            norm_q = tf_q * idf_q / sum(list(doc_length.iloc[[i for i in returned_docs] , 0]))
        except:  
            tf_q , wtf_q , idf_q , tf_idf_q , norm_q = 0,0,0,0,0
        arr_q.append([tf_q , wtf_q , idf_q , tf_idf_q , norm_q])
        
            
    return arr_q

## Search engine

In [48]:
query = input()

caeser and brutus


In [49]:
search = get_relevant_docs(query, df)

query is : caeser and brutus
query read as : caeser brutus

------------------

------------------

------------------

Most relevant documents:

Cosine similarity: (Query , Document number) ->(Score) : 

(caeser brutus , 4) -> 0.730875895547762
Document: brutus caeser mercy worser

(caeser brutus , 2) -> 0.606926656805661
Document: antony brutus caeser calpurnia

(caeser brutus , 1) -> 0.528017734733262
Document: antony brutus caeser cleopatra mercy worser

(caeser brutus , 5) -> 0.34664321269145776
Document: caeser mercy worser

(caeser brutus , 6) -> 0.330292868445042
Document: antony caeser mercy

Returned documents:  [4, 2, 1, 5, 6]


# ----------------

In [50]:
search = pd.DataFrame(search , index= remove_stop_words(query).split() , columns=['tf-raw' , 'wtf(1+ log tf)', 'idf', 'tf*idf' , 'normalized'])

In [52]:
search.style.background_gradient(cmap = "Reds" , axis= 1)

Unnamed: 0,tf-raw,wtf(1+ log tf),idf,tf*idf,normalized
caeser,5,2.609438,0.30103,1.50515,0.353396
brutus,3,2.098612,0.522879,1.568636,0.368302
