# Week 1

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import numpy as np
import os
from nltk.stem import PorterStemmer
from natsort import natsorted 
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
import copy
from functools import reduce

In [2]:
# Load the export and keep only the tweet text column.
df = pd.read_csv("stockerbot-export1.csv")
# .copy() makes the one-column frame independent of the full frame, so the
# assignment below never writes through a view of the original data.
df = df[['text']].copy()
# Fill missing values before casting: astype(str) alone turns NaN into the
# literal string "nan", which would later be tokenized and indexed as a word.
df['text'] = df['text'].fillna('').astype(str)
df.head()

Unnamed: 0,text
0,VIDEO: “I was in my office. I was minding my o...
1,The price of lumber $LB_F is down 22% since hi...
2,Who says the American Dream is dead? https://t...
3,Barry Silbert is extremely optimistic on bitco...
4,How satellites avoid attacks and space junk wh...


In [3]:
# Summarize the frame: index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 1 columns):
text    28440 non-null object
dtypes: object(1)
memory usage: 222.3+ KB


# Tokenization using RegexpTokenizer

In [4]:
# Strip links from the text: first anything starting with "http", then
# anything starting with "www". The passes run one after the other, in that order.
cleaned = df['text']
for pattern in (r'http\S+', r'www\S+'):
    cleaned = cleaned.replace(to_replace=pattern, value='', regex=True)
df['text'] = cleaned
df.head()

Unnamed: 0,text
0,VIDEO: “I was in my office. I was minding my o...
1,The price of lumber $LB_F is down 22% since hi...
2,Who says the American Dream is dead?
3,Barry Silbert is extremely optimistic on bitco...
4,How satellites avoid attacks and space junk wh...


In [5]:
# Split each lowercased text into runs of word characters (\w+), which drops
# punctuation and whitespace.
tokens = RegexpTokenizer(r'\w+')
# Guard on str so that re-running this cell after `text` already holds token
# lists leaves them untouched instead of failing on list.lower().
df['text'] = df['text'].apply(
    lambda x: tokens.tokenize(x.lower()) if isinstance(x, str) else x
)
df.head()

Unnamed: 0,text
0,"[video, i, was, in, my, office, i, was, mindin..."
1,"[the, price, of, lumber, lb_f, is, down, 22, s..."
2,"[who, says, the, american, dream, is, dead]"
3,"[barry, silbert, is, extremely, optimistic, on..."
4,"[how, satellites, avoid, attacks, and, space, ..."


## Removing stop words

In [6]:
# English stop words from NLTK, extended with a handful of punctuation marks.
stop_words = set(stopwords.words('english'))
stop_words = stop_words | {",", "(", ")", "[", "]", "{", "}", "#", "@", "!", ":", ";", ".", "?"}


def _drop_stop_words(word_list):
    """Return the tokens of `word_list` that are not in `stop_words`, in order."""
    return [word for word in word_list if word not in stop_words]


df['text'] = df['text'].apply(_drop_stop_words)
df.head()

Unnamed: 0,text
0,"[video, office, minding, business, david, solo..."
1,"[price, lumber, lb_f, 22, since, hitting, ytd,..."
2,"[says, american, dream, dead]"
3,"[barry, silbert, extremely, optimistic, bitcoi..."
4,"[satellites, avoid, attacks, space, junk, circ..."


In [7]:
# Summarize the frame again after stop-word removal (row count, non-null count, dtype).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 1 columns):
text    28440 non-null object
dtypes: object(1)
memory usage: 222.3+ KB


---

# Week 2

---

# Lemmatization using WordNetLemmatizer

In [8]:
lem = WordNetLemmatizer()
def lemm(text):
    """Lemmatize every word of a token sequence.

    Parameters
    ----------
    text : iterable of str
        The tokens to process.

    Returns
    -------
    list
        A new list holding `lem.lemmatize(word)` for each word, in input order.
    """
    return [lem.lemmatize(token) for token in text]

# Stemming using PorterStemmer

In [9]:
ps = PorterStemmer()


def stemm(text):
    """Stem every word of a token sequence.

    Parameters
    ----------
    text : iterable of str
        The tokens to process.

    Returns
    -------
    list
        A new list holding `ps.stem(word)` for each word, in input order.
    """
    return list(map(ps.stem, text))

In [10]:
# Lemmatize, then stem, each token list. Applying on the `text` Series passes
# the list straight to the function; row-wise DataFrame.apply(axis=1) would
# build a Series object per row and infer the output shape from list results.
df["text_final"] = df['text'].apply(lambda words: stemm(lemm(words)))

In [11]:
# Preview the first rows: the `text` tokens next to the new `text_final` column.
df.head()

Unnamed: 0,text,text_final
0,"[video, office, minding, business, david, solo...","[video, offic, mind, busi, david, solomon, tel..."
1,"[price, lumber, lb_f, 22, since, hitting, ytd,...","[price, lumber, lb_f, 22, sinc, hit, ytd, high..."
2,"[says, american, dream, dead]","[say, american, dream, dead]"
3,"[barry, silbert, extremely, optimistic, bitcoi...","[barri, silbert, extrem, optimist, bitcoin, pr..."
4,"[satellites, avoid, attacks, space, junk, circ...","[satellit, avoid, attack, space, junk, circl, ..."


---
# Creating postings list for inverted index

In [12]:
# Build the inverted index: term -> list of ids of the documents containing it.
# Document ids are 1-based positions in df['text_final'].
inv_ind = {}
cnt = 1
for doc_terms in df['text_final']:
    for term in doc_terms:
        postings = inv_ind.setdefault(term, [])
        # Ids are handed out in increasing order, so if the current document
        # is already listed it can only be the last entry. Checking that one
        # entry replaces a linear scan of the whole postings list per token.
        if not postings or postings[-1] != cnt:
            postings.append(cnt)
    cnt += 1
print(inv_ind)

{'video': [1, 801, 1049, 1207, 1328, 1448, 1589, 1598, 1777, 1871, 1925, 2119, 2356, 2440, 2553, 2704, 2705, 3419, 3420, 3598, 3605, 3625, 3627, 3684, 4542, 4551, 5187, 5491, 5509, 5619, 5671, 5779, 5865, 6149, 6674, 6749, 6806, 8143, 8210, 8375, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8396, 8397, 8398, 8399, 8400, 8401, 8412, 8487, 8490, 8852, 8891, 8894, 8895, 8916, 10501, 10655, 10665, 10680, 11806, 11826, 12020, 12051, 12222, 12304, 13655, 14541, 14937, 15079, 15328, 15692, 16198, 16239, 16529, 16599, 17439, 17995, 18145, 18686, 18909, 19091, 19350, 19534, 20101, 20209, 20324, 20945, 21008, 22185, 22605, 22613, 22624, 22700, 22702, 22757, 22770, 23409, 23426, 23470, 23560, 24075, 24150, 24349, 24695, 24723, 24888, 24936, 25452, 25763, 25855, 25860, 26091, 26127, 26259, 26336, 26349, 26379, 26485, 26595, 26676, 26685, 27113, 27137, 27704, 27866, 28114, 28187, 28387], 'offic': [1, 340, 343, 653, 838, 839, 1261, 1266, 1923, 1924, 1979, 2035, 2038, 2990, 3229, 3360, 3526, 3838,

In [13]:
def find(word):
    """Print the postings list (document ids) for `word`.

    The exact string is looked up first. If it is not an index term, the word
    is lowercased, lemmatized and stemmed before a second lookup, since the
    index keys were built from lowercased, lemmatized and stemmed tokens.
    An empty list is printed when the word does not occur at all, instead of
    raising KeyError.

    Parameters
    ----------
    word : str
        The query term.
    """
    postings = inv_ind.get(word)
    if postings is None:
        normalized = stemm(lemm([word.lower()]))[0]
        postings = inv_ind.get(normalized, [])
    print(postings)
find("crypto")

[4, 3377, 4222, 5302, 8544, 11868, 12908, 12964, 13028, 13170, 13240, 13300, 13509, 13559, 13618, 13680, 13794, 14012, 14047, 14173, 14325, 14422, 14556, 14648, 14725, 14882, 14932, 14983, 14986, 15086, 15178, 15350, 15352, 15354, 15360, 15387, 15415, 15417, 15509, 15543, 15555, 15597, 15726, 15820, 15951, 15958, 15960, 16142, 16159, 16199, 16218, 16265, 16278, 16285, 16334, 16361, 16382, 16497, 16544, 16556, 16586, 16587, 16661, 16719, 16722, 16742, 16772, 16776, 16899, 16905, 16951, 16973, 17061, 17109, 17115, 17116, 17133, 17160, 17172, 17175, 17205, 17207, 17235, 17297, 17307, 17345, 17352, 17361, 17365, 17366, 17427, 17428, 17432, 17503, 17511, 17531, 17539, 17547, 17573, 17605, 17612, 17619, 17636, 17673, 17684, 17694, 17696, 17701, 17721, 17728, 17766, 17780, 17791, 17813, 17817, 17835, 17837, 17871, 17919, 17935, 17965, 17972, 17986, 17993, 17999, 18007, 18025, 18027, 18059, 18070, 18079, 18081, 18115, 18151, 18163, 18164, 18198, 18207, 18210, 18253, 18256, 18288, 18303, 18307,