# Setup and Load dataset



In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.4


In [None]:
!gdown --id 1819QbWTepXZzOXREC1nPaBhAP8Ww3rW2

Downloading...
From: https://drive.google.com/uc?id=1819QbWTepXZzOXREC1nPaBhAP8Ww3rW2
To: /content/Cranfield Dataset NLP.zip
100% 499k/499k [00:00<00:00, 111MB/s]


In [None]:
!unzip -o "Cranfield Dataset NLP.zip"  -d  "/content"

Archive:  Cranfield Dataset NLP.zip
   creating: /content/cranfield/
  inflating: /content/cranfield/.DS_Store  
   creating: /content/__MACOSX/
   creating: /content/__MACOSX/cranfield/
  inflating: /content/__MACOSX/cranfield/._.DS_Store  
  inflating: /content/cranfield/cran_queries.json  
  inflating: /content/__MACOSX/cranfield/._cran_queries.json  
  inflating: /content/cranfield/cran_qrels.json  
  inflating: /content/__MACOSX/cranfield/._cran_qrels.json  
  inflating: /content/cranfield/cran_docs.json  
  inflating: /content/__MACOSX/cranfield/._cran_docs.json  
  inflating: /content/cranfield/README.txt  
  inflating: /content/__MACOSX/cranfield/._README.txt  


## Data Loading

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

#### Name of the file which contain all the item properties

In [None]:

import json
with open('cranfield/cran_docs.json', 'r') as datafile:
        data = json.load(datafile)
content_df = pd.DataFrame(data)

In [None]:
content_df.head(1)

Unnamed: 0,id,author,bibliography,body,title
0,1,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...


In [None]:
import json
with open('cranfield/cran_qrels.json', 'r') as datafile:
        data = json.load(datafile)
qrels_df = pd.DataFrame(data)

In [None]:
qrels_df.head(15)

Unnamed: 0,query_num,position,id
0,1,2,184
1,1,2,29
2,1,2,31
3,1,3,12
4,1,3,51
5,1,3,102
6,1,4,13
7,1,4,14
8,1,4,15
9,1,2,57


In [None]:
import json
with open('cranfield/cran_queries.json', 'r') as datafile:
        data = json.load(datafile)
queries_df = pd.DataFrame(data)

In [None]:
queries_df.head(1)

Unnamed: 0,query number,query
0,1,what similarity laws must be obeyed when const...


In [None]:
all_queries=list(queries_df['query'].values)

## Details about dataset

In [None]:
itemid="id"

In [None]:
features=['title','body']

In [None]:
allcols=[itemid]
for i in features:
  allcols.append(i)

# Setup

In [None]:
content_df['NewTag']=""
for i in features:
  content_df[i] = content_df[i].fillna(' ')
for i in features:
  content_df['NewTag']+=(" "+content_df[i])
content_df['NewTag']=content_df['NewTag'].astype(str)

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [None]:
def clean_text(text):
    
    text = text.lower()  # lowercase text
    # replace the matched string with ' '
    text = re.sub( re.compile("\'s"), ' ', text)
    text = re.sub(re.compile("\\r\\n"), ' ', text)
    text = re.sub(re.compile(r"[^\w\s]"), ' ', text)
    return text

In [None]:
stopwords=set(stopwords.words('english'))

In [None]:
content_df['clean'] = content_df['NewTag'].apply(clean_text)
# content_df['token_lem_sentence'] = content_df['clean'].apply(
#         lambda x: tokenizer(x))

In [None]:
def reduce(text):
        """
		Stemming/Lemmatization
		Parameters
		----------
		arg1 : list
			A list of lists where each sub-list a sequence of tokens
			representing a sentence
		Returns
		-------
		list
			A list of lists where each sub-list is a sequence of
			stemmed/lemmatized tokens representing a sentence
		"""
        sno = nltk.stem.SnowballStemmer('english')
        reducedText = ''
        for word in text.split():
          
            reducedText +=(' '+(sno.stem(word)))
        return reducedText

In [None]:
content_df['clean'] = content_df['clean'].apply(reduce)

In [None]:
import gensim
content_df['clean']  =content_df['clean'] .apply(lambda x: gensim.parsing.preprocessing.remove_stopwords(x))


In [None]:
content_df.head(1)

Unnamed: 0,id,author,bibliography,body,title,NewTag,clean
0,1,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamic...,experiment investig aerodynam wing slipstream ...


In [None]:
tokenized_corpus = [doc.split(" ") for doc in content_df['clean'].values]

# Model train

In [None]:
!pip install rank_bm25

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from rank_bm25 import *

In [None]:
bm25 = BM25Okapi(tokenized_corpus)

# Sample search 

In [None]:
import gensim

In [None]:
query=all_queries[0]

query =clean_text(query)
query = reduce(query)
query = gensim.parsing.preprocessing.remove_stopwords(query)
print(query)
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
docs = bm25.get_top_n(query=tokenized_query, documents=content_df['clean'].values, n=10)
df_search = content_df[content_df["clean"].isin(docs)][allcols]
df_search.head()

similar law obey construct aeroelast model heat high speed aircraft


Unnamed: 0,id,title,body
11,12,some structural and aerelastic considerations ...,some structural and aerelastic considerations ...
50,51,theory of aircraft structural models subjected...,theory of aircraft structural models subjected...
77,78,an analytical treatment of aircraft propeller ...,an analytical treatment of aircraft propeller ...
140,141,free-flight techniques for high speed aerodyna...,free-flight techniques for high speed aerodyna...
183,184,scale models for thermo-aeroelastic research .,scale models for thermo-aeroelastic research ....


# Evaluation

In [None]:
queries_df.columns

Index(['query number', 'query'], dtype='object')

In [None]:
str(all_queries[0])

'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .'

In [None]:
qrels_df['id']=qrels_df['id'].astype(str)
content_df[itemid]=content_df[itemid].astype(str)
qrels_df['query_num']=qrels_df['query_num'].astype(str)

## TOP SEARCH RESULT

## How many results search result at first rank gave complete answer to search query

### Simple semantic search

In [None]:
topk=1
totalhit=0
for qry in all_queries:
  query_id=list(queries_df[queries_df['query']==qry]['query number'])[0]

  query =clean_text(qry)
  query = reduce(qry)
  query = gensim.parsing.preprocessing.remove_stopwords(query)
  tokenized_query = query.split(" ")
  doc_scores = bm25.get_scores(tokenized_query)
  docs = bm25.get_top_n(query=tokenized_query, documents=content_df['clean'].values, n=10)
  ids = list(content_df[content_df["clean"].isin(docs)][itemid])[:1]
  
  results = ids

  actuall=list(qrels_df[(qrels_df['query_num']==str(query_id) )& (qrels_df['id'].isin(results)) & (qrels_df['position']<=1) ][itemid])

  totalhit+=len(actuall)




In [None]:
pre=totalhit/len(all_queries)
print('precision@1 :'+str(pre))

precision@1 :0.08888888888888889


## How many results search result at first rank gave high degree relivent answer to search query

### Simple  search

In [None]:
topk=1
totalhit=0
for qry in all_queries:
  query_id=list(queries_df[queries_df['query']==qry]['query number'])[0]

  query =clean_text(qry)
  query = reduce(qry)
  query = gensim.parsing.preprocessing.remove_stopwords(query)
  tokenized_query = query.split(" ")
  doc_scores = bm25.get_scores(tokenized_query)
  docs = bm25.get_top_n(query=tokenized_query, documents=content_df['clean'].values, n=10)
  ids = list(content_df[content_df["clean"].isin(docs)][itemid])[:1]
  
  results = ids

  actuall=list(qrels_df[(qrels_df['query_num']==str(query_id) )& (qrels_df['id'].isin(results)) & (qrels_df['position']<=2) ][itemid])

  totalhit+=len(actuall)




In [None]:
pre=totalhit/len(all_queries)
print('precision@1 :'+str(pre))

precision@1 :0.13333333333333333


## How many results search result at first rank gave useful answer to search query

### Simple  search

In [None]:
topk=1
totalhit=0
for qry in all_queries:
  query_id=list(queries_df[queries_df['query']==qry]['query number'])[0]

  query =clean_text(qry)
  query = reduce(qry)
  query = gensim.parsing.preprocessing.remove_stopwords(query)
  tokenized_query = query.split(" ")
  doc_scores = bm25.get_scores(tokenized_query)
  docs = bm25.get_top_n(query=tokenized_query, documents=content_df['clean'].values, n=10)
  ids = list(content_df[content_df["clean"].isin(docs)][itemid])[:1]
  
  results = ids

  actuall=list(qrels_df[(qrels_df['query_num']==str(query_id) )& (qrels_df['id'].isin(results)) & (qrels_df['position']<=3) ][itemid])

  totalhit+=len(actuall)




In [None]:
pre=totalhit/len(all_queries)
print('precision@1 :'+str(pre))

precision@1 :0.2222222222222222


## TOP 10  SEARCH RESULT

## How many results search result at first rank gave usefull answer to search query

### Simple semantic search

In [None]:
topk=10
totalhit=0
for qry in all_queries:
  query_id=list(queries_df[queries_df['query']==qry]['query number'])[0]

  query =clean_text(qry)
  query = reduce(qry)
  query = gensim.parsing.preprocessing.remove_stopwords(query)
  tokenized_query = query.split(" ")
  doc_scores = bm25.get_scores(tokenized_query)
  docs = bm25.get_top_n(query=tokenized_query, documents=content_df['clean'].values, n=10)
  ids = list(content_df[content_df["clean"].isin(docs)][itemid])[:10]
  
  results = ids

  actuall=list(qrels_df[(qrels_df['query_num']==str(query_id) )& (qrels_df['id'].isin(results)) & (qrels_df['position']<=1) ][itemid])

  totalhit+=len(actuall)




In [None]:
pre=totalhit/(len(all_queries)*10)
print('precision@10 :'+str(pre))

precision@10 :0.10088888888888889
