In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import ast
import math
import operator
import datetime

import pandas as pd
import glob

Indexing and quering text:
Since the articles are not well formated (lots of tags...) we decide to transform it into a simpler structure (CSV ordered by IDs with only the title and text)

In [2]:
list = [datetime.datetime.strptime(filename[2:], '%m%d%y') for filename in glob.glob("la*")]
list.sort()
list = ["la"+filename.strftime("%m%d%y") for filename in list]
id = 0
text = ""
with open("filtered.csv","w+") as fileOut:
    for filename in list:
        with open(filename,"r") as file:
            for line in file:
                if "<DOCID>" in line:
                    if id !=0:
                        fileOut.writelines(str(id) + "#" + str(text.replace("\n","")) + "\n")
                    text = ""
                    id = (line.split(">")[1].split("<")[0].replace(" ",""))
                if "<HEADLINE>" in line:
                    next(file)
                    text = file.readline()
                if "<TEXT>" in line:
                    line = file.readline()
                    while "</TEXT>" not in line:
                        if "<" not in line:
                            text = text+line
                        line = file.readline()

Now all the articles are in one CSV file ordered by id with only the title and the text

In [3]:
with open("filtered.csv","r") as file:
    for i in range(10):
        print(file.readline())

1#NEW FALLOUT FROM CHERNOBYL; The onset of the new Gorbachev policy of glasnost, commonly mistranslated as openness but closer in connotation to candor or publicizing, has complicated the task of Soviet secret-keepers and has allowed substantial new Western insights into Soviet society. David R. Marples' new book, his second on the Chernobyl accident of April 26, 1986, is a shining example of the best type of non-Soviet analysis into topics that only recently were absolutely taboo in Moscow official circles. The author, a British-educated historian and economist, is a research associate with the Canadian Institute of Ukrainian Studies at the University of Alberta, and the academic style of the book is undisguised. However, its intended audience is the general public, and anyone interested in nuclear power, or Soviet economy and society, or human drama, or just plain sleuthing state secrets, will find hitherto unpublished revelations and explanations of the event and its continuing afte

Now we are going to tokenize the words giving a list of stopwords (punctiuation and common words in english), for the stemming process we are going to use snowball instead of porter : it's more powerful (ex: fairly with porter > fairli, with snowball > fair)

In [4]:
list = set([])
stop_words = stopwords.words('english')
stop_words.extend(["the"])
stop_words = set(stop_words)
stemmer = SnowballStemmer('english')
with open("tokens.csv","w+") as fileout: 
    with open("filtered.csv","r") as file:
          for line in file:
            article = re.sub(r'[^\w]', ' ', line.split("#")[1])
            tokens = word_tokenize(re.sub(r"\d+", "", article.lower()))
            tokens = [w for w in tokens if w not in stop_words]
            tokens = [stemmer.stem(w) for w in tokens]
            list = list.union(set(tokens))
            fileout.writelines(line.split("#")[0]+"#"+'"['+str(tokens)[1:-1]+']"'+"\n")
with open("list.csv","w+") as file:
          for item in list:
              file.writelines(item+"\n")

now we need to create the inverted file, first by creating the VOC, for this we just need to create this small function

In [5]:
def load_voc(filename):
    with open(filename,"r") as file:
        i = 0
        voc = {}
        for line in file:
            voc[line[:-1]] = i
            i+=1
    return voc

In [6]:
voc = load_voc("list.csv")
i = 10
for item in voc.items():
    print(item)
    i-=1
    if i == 0:
        break

('alcoa', 0)
('lamiell', 1)
('warco', 2)
('tousey', 3)
('oakton', 4)
('bondelli', 5)
('boujay', 6)
('keif', 7)
('aaftink', 8)
('schwam', 9)


In [7]:
print(len(voc))

171352


Now we create the PL file by inversing the file created previously, for this we use first panda in order to optimize the requests we are going to do next

In [8]:
PL =  pd.read_csv('tokens.csv', sep='#', names=['docID','tokens'], header=None, converters={'tokens': ast.literal_eval})
PL.head()

Unnamed: 0,docID,tokens
0,1,"[new, fallout, chernobyl, onset, new, gorbache..."
1,2,"[aristocrat, independ, voter, andr, jardin, to..."
2,3,"[peruvian, memori, shine, path, read, tungsten..."
3,4,"[extinct, land, fire, four, tribe, extinct, na..."
4,11,"[cover, shirley, templ, black, play, return, e..."


In [9]:
len(PL)

131895

We are not going to use chunks because pandas don't know the size of the file, so we are obliged to process iteratively the chunks, which is very costly in time processing

In [10]:
PL =  pd.read_csv('tokens.csv', sep='#', names=['docID','tokens'], header=None, converters={'tokens': ast.literal_eval}, index_col="docID")
PL.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131895 entries, 1 to 329694
Data columns (total 1 columns):
tokens    131895 non-null object
dtypes: object(1)
memory usage: 303.9 MB


and then we create the PL file

In [11]:
with open("PL.csv","w+") as fileout:
    for item in voc.keys():
        fileout.writelines(item+'#'+str(PL[PL['tokens'].apply(lambda x: item in x)].docID.unique().tolist())+'\n')

In [12]:
with open("PL.csv","r") as file:
    for i in range(10):
        print(file.readline())

alcoa#[19141, 28482, 29722, 30642, 47114, 82372, 92164, 93603, 100382, 121682, 148822, 163272, 163702, 180404, 205413, 244053, 248171, 257883, 268943, 271102, 283141, 282774, 290252, 291744, 291934, 293513, 304452, 305302, 325103]

lamiell#[267192, 292053]

warco#[9901]

tousey#[190582]

oakton#[893, 79334, 221553]

bondelli#[256473]

boujay#[53293]

keif#[18242, 102254, 102431, 115293, 126993, 261572]

aaftink#[150223, 171501]

schwam#[76971, 170622]



In [13]:
PL =  pd.read_csv('tokens.csv', sep='#', names=['docID','tokens'], header=None, converters={'tokens': ast.literal_eval}, index_col="docID")
nbDocs = len(PL.docID)

with open("if.csv","w+") as fileout:
    with open("PL.csv","r") as file:
        for line in file:
            token = line.split("#")[0]
            list = ast.literal_eval(line.split("#")[1][:-1])
            results = {}
            idf = math.log(nbDocs/(1+len(list)))
            for id in list:
                doc = PL.loc[id].tokens
                tf = len([w for w in doc if w == token]) / len(doc)
                if tf > 0:
                    tf = 1 + math.log(tf)
                results[id] = tf * idf
            results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
            fileout.writelines(str(results)+"\n")

In [14]:
with open("if.csv","r") as file:
    for i in range(10):
        print(file.readline())

[(293513, 21.889437059870144), (93603, 14.203073568204506), (163702, 14.203073568204506), (205413, 14.203073568204506), (282774, 14.203073568204506), (305302, 14.203073568204506), (19141, 8.388564048819056), (28482, 8.388564048819056), (29722, 8.388564048819056), (30642, 8.388564048819056), (47114, 8.388564048819056), (82372, 8.388564048819056), (92164, 8.388564048819056), (100382, 8.388564048819056), (121682, 8.388564048819056), (148822, 8.388564048819056), (163272, 8.388564048819056), (180404, 8.388564048819056), (244053, 8.388564048819056), (248171, 8.388564048819056), (257883, 8.388564048819056), (268943, 8.388564048819056), (271102, 8.388564048819056), (283141, 8.388564048819056), (290252, 8.388564048819056), (291744, 8.388564048819056), (291934, 8.388564048819056), (304452, 8.388564048819056), (325103, 8.388564048819056)]

[(267192, 29.847116853586122), (292053, 18.101689026406735)]

[(9901, 18.788201131015505)]

[(190582, 18.788201131015505)]

[(893, 10.403467069361321), (79334,

Since the articles are not really sorted by id on the original file, we are going to make a structure stored in memory that map each docID with it's location on the original file in order to extract the text

In [15]:
articles = {}
i = 1
with open("filtered.csv","r") as file:
    for line in file:
        articles[line.split("#")[0]] = i
        i+=1
with open("article_list.csv","w+") as file:
    file.writelines(str(articles))