# Week 1

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import numpy as np
import os
from nltk.stem import PorterStemmer
from natsort import natsorted 
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
import copy
from functools import reduce

In [2]:
df = pd.read_csv("stockerbot-export1.csv")
df=df[['text']]
df['text'] = df['text'].astype(str)
df.head()

Unnamed: 0,text
0,VIDEO: “I was in my office. I was minding my o...
1,The price of lumber $LB_F is down 22% since hi...
2,Who says the American Dream is dead? https://t...
3,Barry Silbert is extremely optimistic on bitco...
4,How satellites avoid attacks and space junk wh...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 1 columns):
text    28440 non-null object
dtypes: object(1)
memory usage: 222.3+ KB


# Tokenization using RegexpTokenizer

In [4]:
df['text'] = df['text'].replace(r'http\S+','',regex=True).replace(r'www\S+','',regex=True)
df.head()

Unnamed: 0,text
0,VIDEO: “I was in my office. I was minding my o...
1,The price of lumber $LB_F is down 22% since hi...
2,Who says the American Dream is dead?
3,Barry Silbert is extremely optimistic on bitco...
4,How satellites avoid attacks and space junk wh...


In [5]:
tokens = RegexpTokenizer(r'\w+')
df['text']=df['text'].apply(lambda x:tokens.tokenize(x.lower()))
df.head()

Unnamed: 0,text
0,"[video, i, was, in, my, office, i, was, mindin..."
1,"[the, price, of, lumber, lb_f, is, down, 22, s..."
2,"[who, says, the, american, dream, is, dead]"
3,"[barry, silbert, is, extremely, optimistic, on..."
4,"[how, satellites, avoid, attacks, and, space, ..."


## Removing stop words

In [6]:
stop_words = set(stopwords.words('english')) 
stop_words = stop_words.union(",","(",")","[","]","{","}","#","@","!",":",";",".","?")
df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stop_words])
df.head()

Unnamed: 0,text
0,"[video, office, minding, business, david, solo..."
1,"[price, lumber, lb_f, 22, since, hitting, ytd,..."
2,"[says, american, dream, dead]"
3,"[barry, silbert, extremely, optimistic, bitcoi..."
4,"[satellites, avoid, attacks, space, junk, circ..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 1 columns):
text    28440 non-null object
dtypes: object(1)
memory usage: 222.3+ KB


---

# Week 2

---

# Lemmetization using WordNetLemmatizer

In [8]:
lem = WordNetLemmatizer()
def lemm(text):
    sent=[]
    for word in text:
        sent.append(lem.lemmatize(word))
    return sent

# Stemming using PorterStemmer

In [9]:
ps=PorterStemmer()

def stemm(text):        
    sent = []
    for word in text:
        sent.append(ps.stem(word))
    return sent

In [10]:
df["text_final"] =  df.apply(lambda x: stemm(lemm(x['text'])), axis=1)

In [11]:
df.head()

Unnamed: 0,text,text_final
0,"[video, office, minding, business, david, solo...","[video, offic, mind, busi, david, solomon, tel..."
1,"[price, lumber, lb_f, 22, since, hitting, ytd,...","[price, lumber, lb_f, 22, sinc, hit, ytd, high..."
2,"[says, american, dream, dead]","[say, american, dream, dead]"
3,"[barry, silbert, extremely, optimistic, bitcoi...","[barri, silbert, extrem, optimist, bitcoin, pr..."
4,"[satellites, avoid, attacks, space, junk, circ...","[satellit, avoid, attack, space, junk, circl, ..."


---
# Creating postings list for inverted index

In [12]:
inv_ind={}
cnt=1
for i in df['text_final']:
    for item in i:
        if item not in inv_ind:
            inv_ind[item]=[]
        if cnt not in inv_ind[item]:
            inv_ind[item].append(cnt)
    cnt+=1
print(inv_ind)

{'video': [1, 801, 1049, 1207, 1328, 1448, 1589, 1598, 1777, 1871, 1925, 2119, 2356, 2440, 2553, 2704, 2705, 3419, 3420, 3598, 3605, 3625, 3627, 3684, 4542, 4551, 5187, 5491, 5509, 5619, 5671, 5779, 5865, 6149, 6674, 6749, 6806, 8143, 8210, 8375, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8396, 8397, 8398, 8399, 8400, 8401, 8412, 8487, 8490, 8852, 8891, 8894, 8895, 8916, 10501, 10655, 10665, 10680, 11806, 11826, 12020, 12051, 12222, 12304, 13655, 14541, 14937, 15079, 15328, 15692, 16198, 16239, 16529, 16599, 17439, 17995, 18145, 18686, 18909, 19091, 19350, 19534, 20101, 20209, 20324, 20945, 21008, 22185, 22605, 22613, 22624, 22700, 22702, 22757, 22770, 23409, 23426, 23470, 23560, 24075, 24150, 24349, 24695, 24723, 24888, 24936, 25452, 25763, 25855, 25860, 26091, 26127, 26259, 26336, 26349, 26379, 26485, 26595, 26676, 26685, 27113, 27137, 27704, 27866, 28114, 28187, 28387], 'offic': [1, 340, 343, 653, 838, 839, 1261, 1266, 1923, 1924, 1979, 2035, 2038, 2990, 3229, 3360, 3526, 3838,

In [13]:
def find(word):
    print(inv_ind[word])
find("crypto")

[4, 3377, 4222, 5302, 8544, 11868, 12908, 12964, 13028, 13170, 13240, 13300, 13509, 13559, 13618, 13680, 13794, 14012, 14047, 14173, 14325, 14422, 14556, 14648, 14725, 14882, 14932, 14983, 14986, 15086, 15178, 15350, 15352, 15354, 15360, 15387, 15415, 15417, 15509, 15543, 15555, 15597, 15726, 15820, 15951, 15958, 15960, 16142, 16159, 16199, 16218, 16265, 16278, 16285, 16334, 16361, 16382, 16497, 16544, 16556, 16586, 16587, 16661, 16719, 16722, 16742, 16772, 16776, 16899, 16905, 16951, 16973, 17061, 17109, 17115, 17116, 17133, 17160, 17172, 17175, 17205, 17207, 17235, 17297, 17307, 17345, 17352, 17361, 17365, 17366, 17427, 17428, 17432, 17503, 17511, 17531, 17539, 17547, 17573, 17605, 17612, 17619, 17636, 17673, 17684, 17694, 17696, 17701, 17721, 17728, 17766, 17780, 17791, 17813, 17817, 17835, 17837, 17871, 17919, 17935, 17965, 17972, 17986, 17993, 17999, 18007, 18025, 18027, 18059, 18070, 18079, 18081, 18115, 18151, 18163, 18164, 18198, 18207, 18210, 18253, 18256, 18288, 18303, 18307,

---
# Week 3

---

# Boolean query for inverted index

In [14]:
no_docs=df.shape[0]
def AND(posting1, posting2):
    p1 = 0
    p2 = 0
    result = []
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
    return result

def OR(posting1, posting2):
    p1 = 0
    p2 = 0
    result = []
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < len(posting1):
        result.append(posting1[p1])
        p1 += 1
    while p2 < len(posting2):
        result.append(posting2[p2])
        p2 += 1
    return result

def NOT(posting):
    result = []
    i = 0
    for item in posting:
        while i < item:
            result.append(i)
            i += 1
        else:
            i += 1
    else:
        while i < no_docs:
            result.append(i)
            i += 1
    return result 

In [15]:
def bool_query(query):
    op=[]
    ans=[i for i in range(no_docs)]
    i=0
    n=len(query)
    while i<len(query):
        if query[i]=='not':
            q = stemm(lemm([query[i+1]]))[0]
            temp=NOT(inv_ind[q])
            ans=set(ans)
            ans=list(ans.intersection(temp))
            query.remove(query[i])
            query.remove(query[i])
            n-=2
            i-=1
        i+=1
    if len(query)!=0:
        op=[]
        for i in query:
            if i in ["or", "and"]:
               op.append(i)
               query.remove(i)
        query = stemm(lemm(query))
        temp = inv_ind[query[0]]
        temp=set(temp)
        ans=list(temp.intersection(ans))
        for i in range(len(op)):
            if op[i]=='and':
                ans=AND(ans,inv_ind[query[i+1]])
            elif op[i]=='or':
                ans=OR(ans,inv_ind[query[i+1]])
    return ans

## Run the cell below and enter the boolean query to see the output
### Here are few sample cases

In [16]:
query = input("Enter your query: ").strip().split()
print(bool_query(query))

Enter your query: video
[1, 27137, 6149, 23560, 24075, 3598, 26127, 12304, 21008, 6674, 8210, 3605, 1049, 28187, 11806, 17439, 5671, 3625, 3627, 11826, 26676, 1589, 27704, 26685, 1598, 5187, 2119, 17995, 22605, 19534, 22613, 24150, 14937, 6749, 22624, 3684, 24695, 20101, 2704, 2705, 16529, 5779, 8852, 19091, 6806, 24723, 26259, 25763, 22185, 22700, 22702, 1207, 8375, 8891, 8894, 8895, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8396, 8397, 8398, 8399, 8400, 8401, 14541, 8916, 16599, 27866, 8412, 26336, 18145, 28387, 22757, 15079, 5865, 26349, 1777, 20209, 22770, 12020, 18686, 25855, 25860, 10501, 26379, 12051, 24349, 801, 8487, 8490, 1328, 2356, 24888, 16198, 15692, 1871, 13655, 3419, 3420, 20324, 24936, 25452, 16239, 23409, 5491, 26485, 23426, 1925, 5509, 2440, 19350, 10655, 1448, 10665, 23470, 10680, 4542, 12222, 4551, 8143, 20945, 28114, 18909, 15328, 26595, 27113, 26091, 5619, 2553]


In [17]:
query = input("Enter your query: ").strip().split()
print(bool_query(query))

Enter your query: video and office
[1]


In [18]:
query = input("Enter your query: ").strip().split()
print(bool_query(query))

Enter your query: video and office and mind
[1]


In [19]:
query = input("Enter your query: ").strip().split()
print(bool_query(query))

Enter your query: video and office not mind
[]


---
# Creating the positional postings list

In [20]:
pos_ind={}
lines_doc={}
line_no=0
for i in df['text_final']:
    for pos,item in enumerate(i):
        if item not in pos_ind:
            pos_ind[item]=[]
            pos_ind[item].append(1)
            pos_ind[item].append({})
            pos_ind[item][1][line_no]=[pos]
        else:
            pos_ind[item][0]=pos_ind[item][0]+1
            if line_no in pos_ind[item][1]:
                pos_ind[item][1][line_no].append(pos)
            else:
                pos_ind[item][1][line_no]=[pos]
    lines_doc[line_no]="In tweet "+str(line_no+1)
    line_no+=1
pos_ind

{'video': [135,
  {0: [0],
   800: [7],
   1048: [5],
   1206: [6],
   1327: [14],
   1447: [14],
   1588: [6],
   1597: [4],
   1776: [14],
   1870: [0],
   1924: [7],
   2118: [4],
   2355: [9],
   2439: [14],
   2552: [5],
   2703: [8],
   2704: [6],
   3418: [11],
   3419: [11],
   3597: [8],
   3604: [8],
   3624: [8],
   3626: [10],
   3683: [9],
   4541: [0],
   4550: [2],
   5186: [1],
   5490: [6],
   5508: [8],
   5618: [4],
   5670: [4],
   5778: [8],
   5864: [6],
   6148: [6],
   6673: [4],
   6748: [5],
   6805: [1],
   8142: [7],
   8209: [6],
   8374: [4],
   8386: [6],
   8387: [6],
   8388: [6],
   8389: [6],
   8390: [6],
   8391: [6],
   8392: [6],
   8393: [6],
   8395: [6],
   8396: [6],
   8397: [6],
   8398: [6],
   8399: [6],
   8400: [6],
   8411: [6],
   8486: [10],
   8489: [4],
   8851: [4],
   8890: [8],
   8893: [8],
   8894: [8],
   8915: [8],
   10500: [5],
   10654: [2],
   10664: [4],
   10679: [2],
   11805: [4],
   11825: [7],
   12019: [6],
   1205

In [21]:
pos_ind["crypto"]

[440,
 {3: [8],
  3376: [6],
  4221: [297],
  5301: [4],
  8543: [3],
  11867: [4],
  12907: [1],
  12963: [4],
  13027: [1],
  13169: [9],
  13239: [3],
  13299: [5],
  13508: [4],
  13558: [4],
  13617: [1],
  13679: [4],
  13793: [1],
  14011: [3],
  14046: [3],
  14172: [3],
  14324: [1],
  14421: [3],
  14555: [4],
  14647: [3],
  14724: [3],
  14881: [4],
  14931: [10],
  14982: [4],
  14985: [3],
  15085: [7],
  15177: [4],
  15349: [4],
  15351: [1],
  15353: [4],
  15359: [4],
  15386: [4],
  15414: [4],
  15416: [4],
  15508: [1],
  15542: [4],
  15554: [1],
  15596: [4],
  15725: [4],
  15819: [3],
  15950: [4],
  15957: [1],
  15959: [3],
  16141: [0],
  16158: [0],
  16198: [3],
  16217: [0],
  16264: [3],
  16277: [3],
  16284: [4],
  16333: [2],
  16360: [3],
  16381: [4],
  16496: [3],
  16543: [4],
  16555: [5],
  16585: [3],
  16586: [4],
  16660: [4],
  16718: [4],
  16721: [4],
  16741: [4],
  16771: [4],
  16775: [6],
  16898: [4],
  16904: [3],
  16950: [4],
  169

In [22]:
def find_in_pos_ind(word):
    for fileno,positions in pos_ind[word][1].items():
        print(f"{lines_doc[fileno]} at {positions}")

## Run the cell below to get the positional postings of any word in the corpus

In [23]:
word=input("Enter the word for its positional indexes: ").strip().lower()
word = ps.stem(word)
word = lem.lemmatize(word)
find_in_pos_ind(word)

Enter the word for its positional indexes: musk
In tweet 26 at [4]
In tweet 3284 at [8]
In tweet 3285 at [10]
In tweet 21264 at [2]
In tweet 24403 at [3]
In tweet 24439 at [3]
In tweet 25308 at [3]
In tweet 25915 at [2]


---
# Creating a function for Phrase Queries 

In [24]:
def phrase_query(s):
    res={}
    for i in s:
        if i not in pos_ind:
            return []
    postings=[[] for i in s]
    
    for i in range(len(s)):
        pos = pos_ind[s[i]][1]
        for j in pos.items():
            temp=[]
            temp.append(j[0])
            temp.append(j[1])
            postings[i].append(temp)
        doc_list=[[] for i in s]
        for i in range(len(postings)):
            for j in postings[i]:
                doc_list[i].append(j[0])
    
    common_docs=doc_list[0]
    for i in range(1,len(doc_list)):
        common_docs=set(common_docs)
        common_docs = list(common_docs.intersection(doc_list[i]))

    for i in range(len(postings)):
            postings[i]=[x for x in postings[i] if x[0] in common_docs]
    
    for i in range(len(postings)):
            for j in range(len(postings[i])):
                postings[i][j][1]=[x-i for x in postings[i][j][1]]
    result=[]
    
    def intersectLists(lists):
        if len(lists)==0:
            return []
        lists.sort(key=len)
        return list(reduce(lambda x,y: set(x)&set(y),lists))
    
    for i in range(len(postings[0])):
        li=intersectLists( [x[i][1] for x in postings] )
        if li==[]:
            continue
        else:
            result.append(postings[0][i][0])
    print(f"The phrase was found in the documents: {result}")

## Run the cell below and type in the phrase to get the list of docs where the phrase is present

In [25]:
s=input("Enter the phrase to be searched: ").strip().lower()
s=s.split()
for i in range(len(s)):
    if('http' or 'www')in s[i]:
        s[i]=''
    if s[i] in stop_words:
        s[i]=''
s=stemm(lemm(s))
phrase_query(s)

Enter the phrase to be searched: Robert Half International
The phrase was found in the documents: [64, 65, 66, 441, 443, 445, 688, 689, 1638, 2220, 2381, 2383, 2705, 2706, 2713, 3422, 5321, 7358, 7629, 7713, 8356, 8357, 8358, 8818, 9382, 13104, 14459, 20846, 21504, 23309]


In [26]:
s=input("Enter the phrase to be searched: ").strip().lower()
s=s.split()
for i in range(len(s)):
    if('http' or 'www')in s[i]:
        s[i]=''
    if s[i] in stop_words:
        s[i]=''
s=stemm(lemm(s))
phrase_query(s)

Enter the phrase to be searched: Iron Mountains begins
The phrase was found in the documents: [189]


In [28]:
s=input("Enter the phrase to be searched: ").strip().lower()
s=s.split()
for i in range(len(s)):
    if('http' or 'www')in s[i]:
        s[i]=''
    if s[i] in stop_words:
        s[i]=''
s=stemm(lemm(s))
phrase_query(s)

Enter the phrase to be searched: Zacks Investment research
The phrase was found in the documents: [31, 110, 111, 133, 134, 142, 143, 167, 207, 404, 856, 864, 885, 888, 889, 933, 1116, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1322, 1328, 1329, 1331, 1730, 1735, 1737, 1739, 1740, 1741, 1745, 1748, 1778, 1814, 1819, 1844, 1845, 1897, 1899, 1916, 1978, 2046, 2056, 2075, 2076, 2077, 2079, 2082, 2101, 2158, 2169, 2178, 2267, 2269, 2561, 2784, 2785, 2787, 2818, 2819, 2822, 2826, 2829, 2830, 2843, 2918, 2923, 2942, 2948, 2979, 2987, 2988, 2989, 2990, 2992, 3009, 3138, 3139, 3194, 3266, 3285, 3321, 3479, 3567, 3573, 3574, 3575, 4515, 4518, 4593, 4618, 4619, 4693, 6440, 6441, 6485, 6492, 6545, 6558, 6659, 6668, 6703, 6705, 6713, 6716, 6719, 6720, 6741, 6760, 6761, 6764, 6799, 6803, 6807, 6808, 6809, 6871, 6873, 6877, 6878, 6879, 6885, 6886, 6892, 6895, 6897, 6899, 6918, 6927, 6944, 7017, 7050, 7096, 7126, 7131, 7135, 7136, 7139, 7140, 7170, 7187, 7192, 7210, 7213, 7227, 7232, 7236, 7248, 

In [29]:
s=input("Enter the phrase to be searched: ").strip().lower()
s=s.split()
for i in range(len(s)):
    if('http' or 'www')in s[i]:
        s[i]=''
    if s[i] in stop_words:
        s[i]=''
s=stemm(lemm(s))
phrase_query(s)

Enter the phrase to be searched: set HCA Healthcare
The phrase was found in the documents: [265, 279]
