In [1]:
import csv
import json
import math
import os
import re
import string
from collections import defaultdict

import ir_datasets
import nltk
import pandas as pd
import pycountry
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# ANTIQUE: the base dataset supplies the documents, the "test" split supplies
# the queries that are later stored in the database.
dataset1 = ir_datasets.load("antique")
q_1 = ir_datasets.load("antique/test")
# Only the first 200000 documents of the collection are indexed.
EXAMPLE_TEXT1 = dataset1.docs_iter()[:200000]

# Text-normalisation resources shared by the inverted-index builder.
# stopwords.words() needs the NLTK "stopwords" corpus to be installed locally.
stop_words = {*stopwords.words('english')}
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Country names and their ISO alpha-2 codes, collected in a single pass over
# the pycountry database.
countries, A_countries = set(), set()
for entry in pycountry.countries:
    countries.add(entry.name)
    A_countries.add(entry.alpha_2)

In [2]:
# doc_id -> list of texts for the ANTIQUE documents selected above.
doc_array1 = defaultdict(list)
for doc in EXAMPLE_TEXT1:
    doc_array1[doc.doc_id].append(doc.text)

# doc_id -> list of texts for the first 200000 rows of the LoTTE collection.
doc_array2 = defaultdict(list)
# Built with os.path.join instead of a backslash literal: the original string
# relied on invalid escape sequences ("\l", "\d", "\c") and only resolved on
# Windows. On Windows the resulting path is unchanged.
lotte_collection_path = os.path.join(
    ".ir_datasets", "lotte", "lifestyle", "dev", "collection.tsv"
)
with open(lotte_collection_path, encoding="utf-8", newline="") as file:
    # QUOTE_NONE: assumes the file is plain tab-separated (not CSV-quoted), so a
    # stray '"' in a passage must not merge several rows into one — TODO confirm.
    dataset2 = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
    for i, line in enumerate(dataset2):
        if i >= 200000:
            break
        if len(line) < 2:
            # Skip malformed rows that lack an id or a text column.
            continue
        doc_array2[line[0]].append(line[1])

In [3]:
def _extract_terms(text):
    """Turn one raw text into the list of index terms it contributes.

    The result is: date-like strings found in the raw text, then country
    names, then the remaining alphabetic tokens after lower-casing, stop-word
    removal, Porter stemming and WordNet lemmatisation (in that order).
    """
    # Matches digit groups joined by any two non-space characters, e.g.
    # "12/05/2020". NOTE(review): the pattern also matches any run of five or
    # more digits; tighten it if only real dates are wanted.
    dates = re.findall(r'\d+\S\d+\S\d+', text)

    found_countries = []
    words = []
    for token in word_tokenize(text):
        if token in A_countries:
            # ISO alpha-2 code -> full country name. NOTE(review): upper-case
            # words such as "IN" or "IT" are valid codes and are treated as
            # countries here.
            found_countries.append(pycountry.countries.get(alpha_2=token).name)
        elif token in countries:
            found_countries.append(token)
        elif token.isalpha():
            # Lower-case before the stop-word check so "The" is filtered too.
            words.append(token.lower())
        # Anything else (punctuation, numbers, mixed tokens) is dropped.

    terms = [
        lemmatizer.lemmatize(ps.stem(word))
        for word in words
        if word not in stop_words
    ]
    return dates + found_countries + terms


def create_inverted_index(corpus):
    """Build an inverted index mapping each term to the documents containing it.

    Parameters
    ----------
    corpus : mapping of doc_id -> iterable of raw text strings.

    Returns
    -------
    dict mapping term -> list of doc_ids. Each doc_id appears at most once per
    term, so ``len(index[term])`` is the term's document frequency.

    The previous implementation removed items from the token list while
    iterating over it (which silently skipped tokens), discarded the
    lower-cased form of each token, and appended a doc_id once per occurrence.
    """
    inverted_index = defaultdict(list)

    for doc_id, texts in corpus.items():
        seen_terms = set()  # terms already recorded for this document
        for text in texts:
            for term in _extract_terms(text):
                if term not in seen_terms:
                    seen_terms.add(term)
                    inverted_index[term].append(doc_id)

    return dict(inverted_index)

In [7]:
# One term -> doc_ids index per collection (ANTIQUE first, then LoTTE).
inverted_index1 = create_inverted_index(corpus=doc_array1)
inverted_index2 = create_inverted_index(corpus=doc_array2)

In [8]:
def calculate_tf(term , document):
    """Return the relative frequency of ``term`` in ``document``.

    ``document`` may be any sequence with ``len`` and ``count`` (a list of
    tokens or a string). The result is ``document.count(term) / len(document)``.
    For a string this is a substring count divided by the character length,
    not a token frequency.

    An empty document yields 0.0 instead of raising ZeroDivisionError.
    """
    term_count = len(document)
    if term_count == 0:
        return 0.0
    return document.count(term) / term_count

def calculate_idf(term , corpus ,inverted_index):
    """Return the inverse document frequency of ``term``.

    Computed as ``log(len(corpus) / len(inverted_index[term]))`` using the
    natural logarithm. The length of the posting list is taken as the document
    frequency, so a posting list that repeats a doc_id inflates it.

    A term that is missing from ``inverted_index`` (or has an empty posting
    list) yields 0.0 instead of raising KeyError / ZeroDivisionError.
    """
    doc_ids = inverted_index.get(term)
    if not doc_ids:
        return 0.0
    n_docs = len(corpus)
    return math.log(n_docs / len(doc_ids))

def calculate_tfidf(term ,document, corpus,inverted_index):
    """Return the TF-IDF weight of ``term`` for ``document``.

    This is the product of ``calculate_tf(term, document)`` and
    ``calculate_idf(term, corpus, inverted_index)``.
    """
    return calculate_tf(term, document) * calculate_idf(term, corpus, inverted_index)


In [9]:
def build_tfidf_index(inverted_index, corpus):
    """Return ``{term: {doc_id: tfidf_weight}}`` for every posting in the index.

    Parameters
    ----------
    inverted_index : mapping of term -> list of doc_ids.
    corpus : mapping of doc_id -> list of texts; only the first text of each
        document is used for the term-frequency part.

    Repeated doc_ids in a posting list are visited once; the weight would be
    identical on every visit, so the result is unchanged.

    NOTE(review): the document handed to calculate_tfidf is the raw text
    string, while index terms are stemmed/lemmatised. calculate_tf therefore
    counts substring occurrences over the character length rather than token
    frequency. Passing the preprocessed token list instead would fix this.
    """
    weights = {}
    for term, doc_ids in inverted_index.items():
        if not doc_ids:
            continue
        weights[term] = {
            doc_id: calculate_tfidf(term, corpus[doc_id][0], corpus, inverted_index)
            for doc_id in dict.fromkeys(doc_ids)  # de-duplicate, keep order
        }
    return weights


index1 = build_tfidf_index(inverted_index1, doc_array1)
index2 = build_tfidf_index(inverted_index2, doc_array2)


In [7]:
import pymysql
from flask import Flask, Response, flash, jsonify, request
from flaskext.mysql import MySQL

app = Flask(__name__)

mysql = MySQL()
# Connection settings default to the local development values. Each one can be
# overridden through an environment variable so that real credentials never
# have to be written into the notebook.
app.config['MYSQL_DATABASE_USER'] = os.environ.get('IR_DB_USER', 'root')
app.config['MYSQL_DATABASE_PASSWORD'] = os.environ.get('IR_DB_PASSWORD', '')
app.config['MYSQL_DATABASE_DB'] = os.environ.get('IR_DB_NAME', 'ir_database')
app.config['MYSQL_DATABASE_HOST'] = os.environ.get('IR_DB_HOST', 'localhost')
mysql.init_app(app)

In [13]:
def insert_queries(sql, rows):
    """Insert ``rows`` with ``sql`` using one connection and one commit.

    Parameters
    ----------
    sql : parameterised INSERT statement with ``%s`` placeholders.
    rows : iterable of parameter tuples matching the placeholders.

    The previous code opened a new connection for every row and never closed
    any of them. Here the cursor and connection are always closed, and nothing
    is committed if the batch fails part-way.
    """
    rows = list(rows)
    if not rows:
        return
    conn = mysql.connect()
    try:
        cursor = conn.cursor()
        try:
            cursor.executemany(sql, rows)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()


insert_queries(
    "INSERT INTO antique_query(query_id ,text) VALUES(%s,%s)",
    ((query.query_id, query.text) for query in q_1.queries_iter()),
)

# Built with os.path.join instead of a backslash literal: the original string
# relied on invalid escape sequences ("\l", "\d", "\q") and only resolved on
# Windows. On Windows the resulting path is unchanged.
lotte_questions_path = os.path.join(
    ".ir_datasets", "lotte", "lifestyle", "dev", "questions.forum.tsv"
)
with open(lotte_questions_path, encoding="utf-8", newline="") as file:
    # QUOTE_NONE: assumes a plain tab-separated file (not CSV-quoted) — TODO confirm.
    dataset2 = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
    # Rows without both an id and a text column are skipped.
    lotte_rows = [(row[0], row[1]) for row in dataset2 if len(row) >= 2]

insert_queries("INSERT INTO lotte_query(query_id,text) VALUES(%s,%s)", lotte_rows)


In [8]:
def create_index1(batch_size=1000):
    """Store every (term, doc_id, weight) entry of ``index1`` in the ``antique`` table.

    Parameters
    ----------
    batch_size : number of rows sent per ``executemany`` call and commit.

    A single connection is used for the whole load. The previous version
    opened one connection per row and closed only the last, and its ``finally``
    block failed on unbound names when ``index1`` was empty or the connection
    could not be opened. Errors are still reported by printing them; rows
    committed in earlier batches remain in the table.
    """
    sqlQuery = "INSERT INTO antique(term, doc_id, weight) VALUES(%s, %s, %s)"
    conn = None
    cursor = None
    try:
        conn = mysql.connect()
        cursor = conn.cursor()
        batch = []
        for term, doc_ids in index1.items():
            for doc_id, weight in doc_ids.items():
                batch.append((term, doc_id, weight))
                if len(batch) >= batch_size:
                    cursor.executemany(sqlQuery, batch)
                    conn.commit()
                    batch = []
        if batch:
            # Flush the final, partially filled batch.
            cursor.executemany(sqlQuery, batch)
            conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Closing without a commit discards any uncommitted batch.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

In [16]:
def create_index2(batch_size=1000):
    """Store every (term, doc_id, weight) entry of ``index2`` in the ``lotte`` table.

    Parameters
    ----------
    batch_size : number of rows sent per ``executemany`` call and commit.

    A single connection is used for the whole load. The previous version
    opened one connection per row and closed only the last, and its ``finally``
    block failed on unbound names when ``index2`` was empty or the connection
    could not be opened. Errors are still reported by printing them; rows
    committed in earlier batches remain in the table.
    """
    sqlQuery = "INSERT INTO lotte(term, doc_id, weight) VALUES(%s, %s, %s)"
    conn = None
    cursor = None
    try:
        conn = mysql.connect()
        cursor = conn.cursor()
        batch = []
        for term, doc_ids in index2.items():
            for doc_id, weight in doc_ids.items():
                batch.append((term, doc_id, weight))
                if len(batch) >= batch_size:
                    cursor.executemany(sqlQuery, batch)
                    conn.commit()
                    batch = []
        if batch:
            # Flush the final, partially filled batch.
            cursor.executemany(sqlQuery, batch)
            conn.commit()
    except Exception as e:
        print(e)
    finally:
        # Closing without a commit discards any uncommitted batch.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

In [None]:
# Load both TF-IDF tables into MySQL, ANTIQUE first and LoTTE second.
for load_table in (create_index1, create_index2):
    load_table()