In [1]:
import logging, sys
logging.disable(sys.maxsize)
import json
import lucene
import os
from org.apache.lucene.store import MMapDirectory, SimpleFSDirectory, NIOFSDirectory
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
from org.apache.lucene.search import IndexSearcher, BoostQuery, Query
from org.apache.lucene.search.similarities import BM25Similarity
from pathlib import Path
from datetime import datetime

jsonKeys = ['created_utc', 'id', 'name', 'num_comments', 'over_18', 'permalink', 'score', 'selftext', 'spoiler', 'title', 'upvote_ratio', 'url', 'comments']
pathlist = Path('jsonData/').glob('**/*.json')
finalDocJson = 'group01_reddit_data.json'

### run commented out code below ONCE to create the combined json file ###
### comment out code below after json is created to make future cell runs faster ###
# tempDoc = []
# counter = 0
# try:
#     for path in pathlist:
#         path_in_str = str(path)
#         #print(path_in_str)
#         with open(path_in_str, 'r') as data_file:
#             counter += 1
#             x = json.load(data_file)
#             for i in range(len(x)):
#                 if (x[i]['over_18'] == False and x[i]['spoiler'] == False):
#                     x[i]['over_18'] = 'false'
#                     x[i]['spoiler'] = 'false'
#                 elif (x[i]['over_18'] == False and x[i]['spoiler'] == True):
#                     x[i]['over_18'] = 'false'
#                     x[i]['spoiler'] = 'true'
#                 elif (x[i]['over_18'] == True and x[i]['spoiler'] == False):
#                     x[i]['over_18'] = 'true'
#                     x[i]['spoiler'] = 'false'
#                 elif (x[i]['over_18'] == True and x[i]['spoiler'] == True):
#                     x[i]['over_18'] = 'true'
#                     x[i]['spoiler'] = 'true'
#                 tempDoc.append(x[i])
# except:
#     print('error at json file:')
#     print(counter)

# with open(finalDocJson, 'w') as f:
#     json.dump(tempDoc, f, indent=4)

#######

finalDoc = []
with open(finalDocJson, 'r') as index_file:
    finalDoc = json.load(index_file)

def create_index(dir):
    if not os.path.exists(dir):
        os.mkdir(dir)
    store = SimpleFSDirectory(Paths.get(dir))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    ### discussion 6 slides #9 columns are: INDEXED-TOKENIZED-STORED ###
    
    # No-No-Yes = not on slides so idk
    metaType = FieldType()
    metaType.setStored(True)
    metaType.setTokenized(False)

    # No-No-No = Not relevant for searching
    irrelevantType = FieldType()
    irrelevantType.setStored(False)
    irrelevantType.setTokenized(False)
    
    # Yes-No-Yes = reddit username
    usernameType = FieldType()
    usernameType.setStored(True)
    usernameType.setTokenized(False)
    usernameType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Yes-No-No = Sensitive information
    sensitiveType = FieldType()
    sensitiveType.setStored(False)
    sensitiveType.setTokenized(False)
    sensitiveType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Yes-Yes-Yes = Title, abstract
    contextType = FieldType()
    contextType.setStored(True)
    contextType.setTokenized(True)
    contextType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Yes-Yes-No = Body
    bodyType = FieldType()
    bodyType.setStored(False)
    bodyType.setTokenized(True)
    bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    for sample in finalDoc:
        #dict_keys(['created_utc', 'id', 'name', 'num_comments', 'over_18', 'permalink', 'score', 'selftext', 'spoiler', 'title', 'upvote_ratio', 'url', 'comments'])
        created_utc = sample[jsonKeys[0]]
        id = sample[jsonKeys[1]]
        name = sample[jsonKeys[2]]
        num_comments = sample[jsonKeys[3]]
        over_18 = sample[jsonKeys[4]]
        permalink = sample[jsonKeys[5]]
        score = sample[jsonKeys[6]]
        selftext = sample[jsonKeys[7]]
        spoiler = sample[jsonKeys[8]]
        title = sample[jsonKeys[9]]
        upvote_ratio = sample[jsonKeys[10]]
        url = sample[jsonKeys[11]]
        comments = sample[jsonKeys[12]]

        # metaType, irrelevantType, usernameType, sensitiveType, contextType, bodyType
        # all are temporarily set to contextType for now while testing #
        doc = Document()
        doc.add(Field(jsonKeys[0], str(created_utc), contextType))
        doc.add(Field(jsonKeys[1], str(id), contextType))
        doc.add(Field(jsonKeys[2], str(name), contextType))
        doc.add(Field(jsonKeys[3], str(num_comments), contextType))
        doc.add(Field(jsonKeys[4], str(over_18), contextType))
        doc.add(Field(jsonKeys[5], str(permalink), contextType))
        doc.add(Field(jsonKeys[6], str(score), contextType))
        doc.add(Field(jsonKeys[7], str(selftext), contextType))
        doc.add(Field(jsonKeys[8], str(spoiler), contextType))
        doc.add(Field(jsonKeys[9], str(title), contextType))
        doc.add(Field(jsonKeys[10], str(upvote_ratio), contextType))
        doc.add(Field(jsonKeys[11], str(url), contextType))
        doc.add(Field(jsonKeys[12], str(comments), contextType))
        writer.addDocument(doc)
    writer.close()

def retrieve(storedir, query):
    searchDir = NIOFSDirectory(Paths.get(storedir))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))
    
    parser = QueryParser('title', StandardAnalyzer())
    parsed_query = parser.parse(query)

    print('Parsed Query: ')
    print(parsed_query)

    topDocs = searcher.search(parsed_query, 10).scoreDocs
    #print('top docs: ')
    #print(topDocs)
    topkdocs = []
    for hit in topDocs:
        doc = searcher.doc(hit.doc)
        topkdocs.append({
            "documentScore": hit.score,
            "title": doc.get("title"),
            "body": doc.get("body"), # this is wrong, maybe #print(sample_doc[0]['comments'][0]['body']) need to use this
            "post_time": datetime.fromtimestamp(float(doc.get("created_utc"))).strftime('%Y-%m-%d %H:%M:%S') # referenced https://stackoverflow.com/a/46914259
        })

    print('Top 10 Documents: ')
    #print(topkdocs)
    for i in range(len(topkdocs)):
        position = i + 1
        print(str(position) + ') ' + str(topkdocs[i]))


lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print('Cell done executing')

Cell done executing


In [2]:
create_index('lucene_partB_index/')
retrieve('lucene_partB_index/', 'title:WORLD CUP 2022') # this took like 50 seconds, could be faster if the data wasn't combined into a single json file i guess

Parsed Query: 
title:world title:cup title:2022
Top 10 Documents: 
1) {'documentScore': 8.044605255126953, 'title': 'WORLD CUP 2022'}
2) {'documentScore': 7.412470817565918, 'title': 'Brazil World Cup 2022 list'}
3) {'documentScore': 7.132248401641846, 'title': 'Germanys squad for World Cup 2022'}
4) {'documentScore': 7.132248401641846, 'title': 'World cup 2022 group stage complete'}
5) {'documentScore': 7.132248401641846, 'title': 'Argentina Squad for World Cup 2022'}
6) {'documentScore': 6.942017555236816, 'title': 'World map of all 2022 fifa world cup nations'}
7) {'documentScore': 6.405756950378418, 'title': '[Canada] have qualified for the 2022 FIFA World Cup'}
8) {'documentScore': 6.405756950378418, 'title': 'USA has been eliminated from the 2022 World Cup.'}
9) {'documentScore': 6.401978492736816, 'title': 'Argentina have won their third World Cup title at the 2022 FIFA World Cup in Qatar!'}
10) {'documentScore': 6.195402145385742, 'title': 'The most valuable squads heading to Q

In [4]:
#print(sample_doc[0]) # returns first dictionary entry in list
#print(sample_doc[0]['comments']) # returns body and replies for comments of first dictionary entry
#print(sample_doc[0]['comments'][0]) # returns first body and replies for first comment of first dictionary entry
#print(sample_doc[0]['comments'][0]['body']) # returns body of first comment of first dictionary entry
#print(sample_doc[0]['comments'][0]['replies']) # returns replies of first comment of first dictionary entry
#print(sample_doc[0]['comments'][0]['replies'][0]) # returns first reply of first comment of first dictionary entry
create_index('lucene_partB_index/')
retrieve('lucene_partB_index/', 'body:oldest OR title:WORLD')

Parsed Query: 
body:oldest title:world
Top 10 Documents: 
1) {'documentScore': 2.4090113639831543, 'title': 'Belgium World Cup Squad for World Cup', 'body': None, 'post_time': '2022-11-10 03:16:51'}
2) {'documentScore': 2.3023815155029297, 'title': 'World map of all 2022 fifa world cup nations', 'body': None, 'post_time': '2022-06-08 14:40:37'}
3) {'documentScore': 2.217965602874756, 'title': 'WORLD CUP 2022', 'body': None, 'post_time': '2022-12-14 05:07:35'}
4) {'documentScore': 2.217965602874756, 'title': 'World Series hangover??', 'body': None, 'post_time': '2019-04-18 06:24:18'}
5) {'documentScore': 2.15903377532959, 'title': 'How well would they fare in the World Cup, as WORLD XI??', 'body': None, 'post_time': '2022-04-04 00:16:47'}
6) {'documentScore': 2.1272592544555664, 'title': 'FIBA WORLD CUP 2023', 'body': None, 'post_time': '2023-05-01 15:55:27'}
7) {'documentScore': 2.0436806678771973, 'title': 'World Cup balls getting charged', 'body': None, 'post_time': '2022-11-29 20:58

In [4]:
from datetime import datetime

utc = '1671023255.0'
print(datetime.fromtimestamp(float(utc)).strftime('%Y-%m-%d %H:%M:%S'))

#tzInfo = pytz.timezone('America/Los_Angeles')
#dt = datetime.now(tz=tzInfo)
#print(dt)

2022-12-14 05:07:35
