In [3]:
import sys
import json
import pandas as pd
import re
import csv
from elasticsearch import Elasticsearch, helpers

In [5]:
# Get started with elasticsearch
HOST = 'http://localhost:9200/'
es = Elasticsearch(hosts=[HOST])
# Check if elasticsearch is running
#!curl "http://localhost:9200/?pretty"

In [70]:
# Preprocessing the questions
qdf=pd.read_csv('questions.csv',error_bad_lines=False, warn_bad_lines=False , header=None, 
                index_col=0, parse_dates=[1],
                    quotechar='"', quoting=2)
# Clean up
# Drop lines in which the question field is empty
qdf=qdf.dropna(subset=[4])  
# Remove any bad indexes (containing letters)
good_indexes = [key for key in qdf.index if re.search('[a-zA-Z]', key) == None]
qdf = qdf.loc[good_indexes]
# Change indexes to integers
qdf.index=qdf.index.astype(int)
# Change column names
qdf.columns = ['date', 'userId', 'categoryId', 'question', 'description']
# Add the questionId column
qdf['questionId'] = qdf.index
# Combine question and description fields
qdf['question'] = qdf['question'].astype(str) + ' - ' + qdf['description'].astype(str)
qdf = qdf.drop(['description'], axis=1)
qdf['date'] = qdf['date'][:10]
qdf.head()

Unnamed: 0_level_0,date,userId,categoryId,question,questionId
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2009-01-28 09:49:57,1,32,Kunnen we volgende week weer schaatsen op natu...,1
2,2009-01-29 09:32:04,1,69,Wat is het lekkerste recept voor boerenkool? -...,2
3,2009-01-29 11:06:20,2,18,Hoeveel kleuren heeft de regenboog? - nan,3
4,2009-01-29 12:37:05,1,60,Wat is de symbolische betekenis van de kip die...,4
5,2009-01-29 16:10:27,3,66,waar kan ik in amsterdam het best een dwerg ha...,5


In [17]:
# Adding the correct category names
cdf=pd.read_csv('categories.csv',error_bad_lines=False, warn_bad_lines=False , header=None, 
                index_col=0, quotechar='"', quoting=2)
cdf.index=cdf.index.astype(int)
cdf[1] = cdf[1].astype(int)
cdf.columns = ['parent', 'name'] 

cdf['parent'] = cdf.apply(lambda x: cdf.iloc[int(x['parent'])-1]['name'], axis=1)
cdf['parent'][1] = ' '  # Apply special case to root category

qdf['category'] = qdf.apply(lambda x: str(cdf.loc[int(x['categoryId'])]['parent']) + ' - ' + str(cdf.loc[int(x['categoryId'])]['name']), axis=1)
qdf = qdf.drop(['categoryId'], axis=1)
qdf.head(10)

Unnamed: 0_level_0,date,userId,question,questionId,category
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2009-01-28 09:49:57,1,Kunnen we volgende week weer schaatsen op natu...,1,"Sport, Spel & Recreatie - Sporten"
2,2009-01-29 09:32:04,1,Wat is het lekkerste recept voor boerenkool? -...,2,Eten & Drinken - Koken & Recepten
3,2009-01-29 11:06:20,2,Hoeveel kleuren heeft de regenboog? - nan,3,Wetenschap - Natuur- en scheikunde
4,2009-01-29 12:37:05,1,Wat is de symbolische betekenis van de kip die...,4,Kunst & Cultuur - Beeldende kunst
5,2009-01-29 16:10:27,3,waar kan ik in amsterdam het best een dwerg ha...,5,Huis & Tuin - Huisdieren
6,2009-01-29 16:14:38,6,Waarom zie je nooit babyduifjes? - nan,6,Huis & Tuin - Overig
7,2009-01-29 16:16:55,7,Hoe krijg je een weggelopen konijn (ontsnapt) ...,7,Alle categorieën - Overig
8,2009-01-29 16:17:33,4,Wat is het synoniem voor synoniem ? - nan,8,Alle categorieën - Overig
9,2009-01-29 16:17:46,5,wat s de reden dat vogels niet vastvriezen aan...,9,Wetenschap - Biologie
10,2009-01-29 16:19:48,4,Als een winkel 24 uur per dag en 365 dagen per...,10,- Alle categorieën


In [45]:
# Indexing questions in elasticsearch
question_bulk = []
for index, row in qdf.iterrows():
    print(len(qdf) - index, end='\r')
    
#     #question_bulk = add_answers(row['questionId'], adf, question_bulk)
#     for index, row in adf[adf['questionId'] == row['questionId']].iterrows():
#         question_bulk.append({'_type':'doc', 
#                      '_index':'goeievragen',
#                      'answers.date':row['date'],
#                      'answers.userId':row['userId'],
#                      'answers.questionId':row['questionId'],
#                      'answers.answer':row['answer'],
#                      #'answers.thumbsDown':row['thumbsDown'],
#                      #'answers.thumbsUp':row['thumbsUp'],
#                      #'answers.bestAnswer':row['bestAnswer'],
#                      'answers.answerId':row['answerId']})
    
    
    question_bulk.append({'_type':'doc', 
                     '_index':'goeievragen',
                     'date':row['date'],
                     'userId':row['userId'],
                     'category':row['category'],
                     'question':row['question'],
                     #'description':row['description'],
                     'questionId':row['questionId']})


-73680

In [47]:
helpers.bulk(es,tuple(question_bulk))

(396045, [])

In [10]:
with open('categories.csv', newline='') as f:
    reader = csv.reader(f)
    for i in range(1):
        print(next(reader))

['1', '0', 'Alle categorieÃ«n']


In [46]:
len(question_bulk)

396045

In [69]:
# Preprocessing answers
chunksize = 10 ** 3
first_item = 1
colnames = range(8)
count = 0
for chunk in pd.read_csv('answers.csv',error_bad_lines=False, warn_bad_lines=False, header=None, 
                index_col=0, parse_dates=[1], quotechar='"', quoting=2, #low_memory=False,
                         chunksize=chunksize, names=colnames):
    chunk = chunk.loc[chunk.index.dropna()]
    chunk = chunk.dropna(subset=[4])
    good_indexes = [key for key in chunk.index if re.search('[A-Za-z\WÀ-ÿ]', str(key)) == None]
    chunk = chunk.loc[good_indexes]
    good_qids = [idx for idx, row in chunk.iterrows() if re.search('[A-Za-z\WÀ-ÿ]', str(row[3])) == None]
    chunk = chunk.loc[good_qids]
    chunk.index=chunk.index.astype(int)
    chunk.columns = ['date', 'userId', 'questionId', 'answer', 'thumbsDown', 'thumbsUp', 'bestAnswer']
    chunk['answerId'] = chunk.index
    chunk['date'] = chunk['date'][:10]
    chunk = chunk.fillna(0)
    if first_item == 1:
        adf = chunk
        first_item = 0
    else:
        adf = pd.concat([adf, chunk])
    count+=1
    print(count, end='\r')

adf.head()

1761

Unnamed: 0,date,userId,questionId,answer,thumbsDown,thumbsUp,bestAnswer,answerId
1,2009-01-29 16:13:55,3,2,met gepureerde aardappels natuurlijk!!! Duhhh ...,57,3,0,1
2,2009-01-29 16:20:49,1,8,"betekenisverwant, gelijkluidend",0,1,0,2
3,2009-01-29 16:28:42,8,3,"Kleuren van de regenboog : ROGGBIV (rood, oran...",0,5,1,3
4,2009-01-29 16:31:29,4,4,De schutters op het schilderij heetten kloveni...,0,6,1,4
5,2009-01-29 16:33:09,6,11,Van de eerste genoemde dieren is er slachtafva...,0,3,0,5


In [68]:
def add_answers(questionId, answer_df, question_bulk):
    for index, row in answer_df[answer_df['questionId'] == questionId].iterrows():
        question_bulk.append({'_type':'doc', 
                     '_index':'goeievragen',
                     'answers.date':row['date'],
                     'answers.userId':row['userId'],
                     'answers.questionId':row['questionId'],
                     'answers.answer':row['answer'],
                     'answers.thumbsDown':row['thumbsDown'],
                     'answers.thumbsUp':row['thumbsUp'],
                     'answers.bestAnswer':row['bestAnswer'],
                     'answers.answerId':row['answerId']})
    return question_bulk

adf.loc[1]['date'][:10]

'2009-01-29'

In [48]:
# Indexing answers in elasticsearch
answer_bulk = []
size = len(adf)
#qdf['json'] = qdf.apply(lambda x: x.to_json(), axis=1)
for index, row in adf.iterrows():
    answer_bulk.append({'_type':'doc', 
                     '_index':'goeievragen',
                     'answers.date':row['date'],
                     'answers.userId':row['userId'],
                     'answers.questionId':row['questionId'],
                     'answers.answer':row['answer'],
                     'answers.thumbsDown':row['thumbsDown'],
                     'answers.thumbsUp':row['thumbsUp'],
                     'answers.bestAnswer':row['bestAnswer'],
                     'answers.answerId':row['answerId']})
    size-=1
    print(size, end='\r')

0000000

In [49]:
helpers.bulk(es,tuple(answer_bulk))

(1401726, [])

In [None]:
# Indexing questions in elasticsearch
question_bulk = []
for index, row in qdf.iterrows():
    question_bulk.append({'_type':'doc', 
                     '_index':'goeievragen',
                     'date':row['date'],
                     'userId':row['userId'],
                     'category':row['category'],
                     'question':row['question'],
                     'description':row['description'],
                     'questionId':row['questionId']})

In [153]:
#res = es.search(index="goeievragen", body={"query": {"match": {'*':'banaan'}}})
res = es.search(index="goeievragen", body={"query": {"multi_match" : { "query":    "banaan", "fields": [ "question", "answer" ] }}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    if 'question' in hit["_source"]:
        print('Vraag:')
        print("%(date)s %(userId)s: %(question)s" % hit["_source"])
    else:
        print('Antwoord:')
        print("%(date)s %(userId)s: %(answer)s" % hit["_source"])

Got 1575 Hits:
Vraag:
2011-08-25 10:50:01 74569: Banaan banaan
Vraag:
2011-08-21 16:49:52 74135: Banaan????
Antwoord:
2011-09-02 19:36:23 63404: Banaan banaan banaan, we gaan er tegenaan. jeeeeeeeejjj! :D
Vraag:
2011-06-05 18:05:20 53982: Waarom wordt de blauwe banaan de BLAUWE banaan genoemd?
Vraag:
2010-08-16 19:13:12 23878: Zal een rechte banaan net zo smaken als een kromme banaan?
Antwoord:
2009-08-25 20:05:10 6399: Banaan Banaan Banaan, we gaan er tegen aan\
Peer Peer Peer, we zijn zo sterk als een beer\
Citroen Citroen Citroen, sudosa-desto kampioen!!
Vraag:
2011-07-30 11:58:02 70660: Banaan invriezen
Vraag:
2011-11-13 17:01:08 84636: Banaan beignets?
Antwoord:
2012-08-21 13:11:58 53954: Banaan, banaan bindt namelijk en zou dus moeten zorgen voor vaste ontlasting.
Vraag:
2011-08-24 02:20:40 74459: Banaan of peer?


In [74]:
def match(term, category=False, jaar=None):
    query={   "query": 
            { "multi_match" : 
            { "query": term, 
             "fields": [ "question", "answer", "category" ] 
    }}}
    if category:
        query={ "query": 
        {"bool": { 
            "should": [
                { "match": { "question":   term        }}, 
                { "match": { "answer": term }}  
      ],
            "filter": [ { "term":  { "category": category }},
                        { "range": { "date": { "gte": "2015-01-01" }}}]   
    }}}
    
    res = es.search(index="goeievragen", body=query)
    print('Top ten results: ')
    for hit in res['hits']['hits']:
        print('-'*10)
        if 'answer' in hit["_source"]:
            print("Antwoord: %(answer)s" % hit["_source"])
        if 'question' in hit["_source"]:
            print("Categorie: %(category)s" % hit["_source"])
            print("Vraag: %(question)s" % hit["_source"])
        print("Datum: %(date)s" % hit["_source"])
        print("Id: %(questionId)s" % hit["_source"])
        
def get_answers(questionId):
    query={"query": {"term" : { "questionId": questionId }}}
    res = es.search(index="goeievragen", body=query)
    for hit in res['hits']['hits']:
        if 'answer' in hit["_source"]:
            print("Antwoord: %(answer)s" % hit["_source"])
    

In [75]:
match('tijd')
#match('tijd', 'wetenschap')
#get_answers(7078)

Top ten results: 
----------
Categorie: Alle categorieën - Overig
Vraag: Wie weet nog wat de oude tijd is is dat de zomer tijd of de winter tijd ? - nan
Datum: 2009-12-30 18:42:35
Id: 37236
----------
Categorie: Wetenschap - Overig
Vraag: Hoe kan er tijd zijn, als die niet gemeten wordt? - Tijd is op zich toch een illusie? Wij hebben tijd, omdat wij hem meten. Als hij niet gemeten wordt, is er geen tijd en net daardoor alle tijd van de wereld. Toch?
Datum: 2011-06-14 11:15:44
Id: 198879
----------
Categorie: Wetenschap - Overig
Vraag: wat is het tegenovergestelde van \TIJD\" ? of is tijd de enige constante en is tijd iets dat altijd verstrijkt.....?" - nan
Datum: 2011-09-26 15:56:48
Id: 244747
----------
Categorie: Wetenschap - Overig
Vraag: Wat is tijd? - Wat is tijd volgens bergson
Datum: 2009-03-26 15:21:40
Id: 1882
----------
Categorie: Alle categorieën - Overig
Vraag: Welke tijd bedoelen ze met lokale tijd ? - Als er iets is gebeurt ergens in een land spreken ze altijd over lokale

In [61]:
res = es.get(index="goeievragen", doc_type='question', id=1)
print(res['_source'])

es.indices.refresh(index="goeievragen")

res = es.search(index="goeievragen", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print("%(date)s %(userId)s: %(question)s" % hit["_source"])

GET http://localhost:9200/goeievragen/question/1 [status:404 request:0.003s]


NotFoundError: NotFoundError(404, '{"_index":"goeievragen","_type":"question","_id":"1","found":false}')