In [1]:
import logging, sys
logging.disable(sys.maxsize)
import json
import lucene
import os
import ast
from org.apache.lucene.store import MMapDirectory, SimpleFSDirectory, NIOFSDirectory
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
from org.apache.lucene.search import IndexSearcher, BoostQuery, Query
from org.apache.lucene.search.similarities import BM25Similarity
from datetime import datetime
import operator

from org.apache.lucene.search import Sort, SortField
from pathlib import Path

# Keys read from every post record; create_index() indexes one Lucene field per key.
jsonKeys = [
    'created_utc',
    'id',
    'name',
    'num_comments',
    'over_18',
    'permalink',
    'score',
    'selftext',
    'spoiler',
    'title',
    'upvote_ratio',
    'url',
    'comments',
]
# Lazy iterator over every .json file below jsonData/ (only consumed by the one-off merge step below).
pathlist = Path('jsonData/').glob('**/*.json')
# Name of the single combined JSON file that the merge step writes and the notebook reads back.
finalDocJson = 'group01_reddit_data.json'

### Run the commented-out code below ONCE to create the combined JSON file (finalDocJson). ###
### Keep it commented out once that file exists so later runs of this cell are faster. ###
# tempDoc = []
# counter = 0
# try:
#     for path in pathlist:
#         path_in_str = str(path)
#         #print(path_in_str)
#         with open(path_in_str, 'r') as data_file:
#             counter += 1
#             x = json.load(data_file)
#             for i in range(len(x)):
#                 if (x[i]['over_18'] == False and x[i]['spoiler'] == False):
#                     x[i]['over_18'] = 'false'
#                     x[i]['spoiler'] = 'false'
#                 elif (x[i]['over_18'] == False and x[i]['spoiler'] == True):
#                     x[i]['over_18'] = 'false'
#                     x[i]['spoiler'] = 'true'
#                 elif (x[i]['over_18'] == True and x[i]['spoiler'] == False):
#                     x[i]['over_18'] = 'true'
#                     x[i]['spoiler'] = 'false'
#                 elif (x[i]['over_18'] == True and x[i]['spoiler'] == True):
#                     x[i]['over_18'] = 'true'
#                     x[i]['spoiler'] = 'true'
#                 tempDoc.append(x[i])
# except:
#     print('error at json file:')
#     print(counter)

# with open(finalDocJson, 'w') as f:
#     json.dump(tempDoc, f, indent=4)

#######

# Read the combined post list into memory; create_index() iterates over it.
finalDoc = []  # bound first so the name exists even if reading the file fails
with open(finalDocJson, mode='r') as combined_file:
    finalDoc = json.load(combined_file)

### This function was removed from the flask.py file because it is not needed there.
def _make_field_type(stored, tokenized, index_options=None):
    """Return a FieldType with the given flags; index options are set only when provided."""
    field_type = FieldType()
    field_type.setStored(stored)
    field_type.setTokenized(tokenized)
    if index_options is not None:
        field_type.setIndexOptions(index_options)
    return field_type


def create_index(dir):
    """Build a fresh Lucene index in ``dir`` from the posts held in ``finalDoc``.

    The directory is created if it does not exist, and the writer is opened in
    CREATE mode, so any index already stored there is replaced. Every post in
    the module-level ``finalDoc`` list becomes one Lucene document with one
    field per post key; each value is converted with ``str()`` before indexing.

    Parameters
    ----------
    dir : str
        Path of the directory that will hold the index.

    Raises
    ------
    KeyError
        If a post is missing one of the expected keys.
    """
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair and also creates missing parents.
    os.makedirs(dir, exist_ok=True)

    ### discussion 6 slides #9 columns are: INDEXED-TOKENIZED-STORED ###
    positions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
    # No-No-Yes = stored only. It stands in for the fully "irrelevant" No-No-No type, which
    # Lucene rejected with IllegalArgumentException (a field must be indexed or stored).
    metaType = _make_field_type(stored=True, tokenized=False)
    # Yes-No-Yes = reddit username
    usernameType = _make_field_type(stored=True, tokenized=False, index_options=positions)
    # Yes-No-No = Sensitive information
    sensitiveType = _make_field_type(stored=False, tokenized=False, index_options=positions)
    # Yes-Yes-Yes = Title, abstract
    contextType = _make_field_type(stored=True, tokenized=True, index_options=positions)
    # Yes-Yes-No = Body
    bodyType = _make_field_type(stored=False, tokenized=True, index_options=positions)

    # Post key -> field type, listed in the same order as jsonKeys so fields are added in that order.
    field_types = {
        'created_utc': contextType,
        'id': sensitiveType,
        'name': usernameType,
        'num_comments': metaType,
        'over_18': metaType,
        'permalink': contextType,
        'score': contextType,
        'selftext': bodyType,
        'spoiler': metaType,
        'title': contextType,
        'upvote_ratio': metaType,
        'url': contextType,
        'comments': contextType,
    }

    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    store = SimpleFSDirectory(Paths.get(dir))
    try:
        writer = IndexWriter(store, config)
        try:
            for sample in finalDoc:
                doc = Document()
                for key, field_type in field_types.items():
                    doc.add(Field(key, str(sample[key]), field_type))
                writer.addDocument(doc)
        finally:
            # Always release the writer (and its lock on the directory), even if a post fails to index.
            writer.close()
    finally:
        store.close()
###

#SHOULD FIX JSON OBJECTS IN COMMENTS(IP)
def fix_comments_field(comments):
    """Decode JSON-encoded comment data into Python objects (best effort).

    Parameters
    ----------
    comments : None, str, list, or any other object
        * ``None`` yields an empty list.
        * A string is parsed as JSON; if it is not valid JSON it is returned unchanged.
        * For a list (passed directly or obtained by decoding a string), every
          dict element whose ``'replies'`` value is a JSON string has that value
          decoded. A list passed in is modified in place.
        * Anything else is returned untouched.

    Returns
    -------
    The decoded/cleaned comments, or ``None`` if an unexpected error occurred
    (the error message is printed).
    """
    try:
        if comments is None:
            return []
        if isinstance(comments, str):
            try:
                comments = json.loads(comments)
            except json.JSONDecodeError:
                # Not JSON: hand the raw string back unchanged.
                return comments
        # Deliberately `if`, not `elif`: a list that was just decoded from a string
        # still needs its nested 'replies' strings decoded.
        if isinstance(comments, list):
            for comment in comments:
                if not isinstance(comment, dict):
                    continue
                replies = comment.get('replies')
                if isinstance(replies, str):
                    try:
                        comment['replies'] = json.loads(replies)
                    except json.JSONDecodeError:
                        # Keep reply text that is not valid JSON as it is.
                        pass
        return comments
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def _first_comment(raw_comments):
    """Return the first stored comment as a dict, or {} when none can be recovered."""
    if not raw_comments:
        return {}
    try:
        # The index stores str(list_of_comments); literal_eval parses that repr without executing code.
        parsed = ast.literal_eval(raw_comments)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return {}
    if not isinstance(parsed, (list, tuple)) or not parsed:
        return {}
    first = parsed[0]
    return first if isinstance(first, dict) else {}


def retrieve(storedir, query):
    """Search the Lucene index in ``storedir`` and return the top hits as dicts.

    Parameters
    ----------
    storedir : str
        Directory that holds the index.
    query : str
        Query in Lucene classic query-parser syntax; terms without a field
        prefix are matched against the ``title`` field.

    Returns
    -------
    list of dict
        At most 10 results in the order the searcher returned them, with exact
        duplicates removed. Each dict has the keys ``documentScore``, ``title``,
        ``body`` (text of the first comment, '' if unavailable), ``post_date``,
        ``post_score``, ``num_comments``, ``url``, ``created_utc`` and ``first``
        (the first comment as a dict, {} if unavailable).

    The parsed query and the result list are also printed.
    """
    redditURLPrefix = "https://www.reddit.com"
    topkdocs = []

    searchDir = NIOFSDirectory(Paths.get(storedir))
    try:
        reader = DirectoryReader.open(searchDir)
        try:
            searcher = IndexSearcher(reader)
            parsed_query = QueryParser('title', StandardAnalyzer()).parse(query)

            print('Parsed Query: ')
            print(parsed_query)

            for hit in searcher.search(parsed_query, 10).scoreDocs:
                doc = searcher.doc(hit.doc)
                first_comment = _first_comment(doc.get("comments"))
                newDoc = {
                    "documentScore": hit.score,
                    "title": doc.get("title"),
                    "body": first_comment.get('body', ''),
                    "post_date": datetime.fromtimestamp(float(doc.get("created_utc"))).strftime('%Y-%m-%d %H:%M:%S'), # referenced https://stackoverflow.com/a/46914259
                    "post_score": doc.get("score"),
                    "num_comments": doc.get("num_comments"),
                    "url": redditURLPrefix + doc.get("permalink"),
                    "created_utc": doc.get("created_utc"),
                    "first": first_comment
                }
                if newDoc not in topkdocs:
                    topkdocs.append(newDoc)
        finally:
            # Every field has been copied into plain Python values above, so the reader can be released.
            reader.close()
    finally:
        searchDir.close()

    ### this part is different from the flask.py file
    print('Top 10 Documents: ')
    for position, result in enumerate(topkdocs, start=1):
        print(str(position) + ') ' + str(result))
    ###
    return topkdocs

# Start the JVM only if this kernel has not started one yet, so the cell can be
# re-run without commenting this line out by hand.
if lucene.getVMEnv() is None:
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
#create_index('lucene_partB_index/')  # uncomment when running for first time (to create lucene_partB_index folder in file directory), else leave commented out
print('Cell done executing')

Cell done executing


In [2]:
# Example search against the prebuilt index. The author noted this call took about
# 50 seconds and guessed that not combining the data into one JSON file might help.
docstest = retrieve(storedir='lucene_partB_index/', query='title:World Cup 2022')
print(docstest)

Parsed Query: 
title:world title:cup title:2022
Top 10 Documents: 
1) {'documentScore': 8.044605255126953, 'title': 'WORLD CUP 2022', 'body': ' Lmao Qatar to win is 0.0%', 'post_date': '2022-12-14 05:07:35', 'post_score': '14939', 'num_comments': '96', 'url': 'https://www.reddit.com/r/football/comments/yy5yrb/world_cup_2022/', 'created_utc': '1671023255.0', 'first': {'body': ' Lmao Qatar to win is 0.0%'}}
2) {'documentScore': 7.412470817565918, 'title': 'Brazil World Cup 2022 list', 'body': 'Playing the old 5-5-5-5-6 formation', 'post_date': '2022-11-07 14:20:04', 'post_score': '417', 'num_comments': '124', 'url': 'https://www.reddit.com/r/football/comments/yp2kf2/brazil_world_cup_2022_list/', 'created_utc': '1667859604.0', 'first': {'body': 'Playing the old 5-5-5-5-6 formation', 'replies': ['😂😂😂😂', "I know it's a joke, but surely they'll be playing 4-3-3, right?"]}}
3) {'documentScore': 7.132248401641846, 'title': 'Germanys squad for World Cup 2022', 'body': 'Did Germany as a nation c

In [5]:
# Text of the first comment attached to the top-ranked result.
print(docstest[0]['first']['body'])

# def sort_documents(documents, sort_field, sort_order):
#     sorted_docs = sorted(documents, key=lambda x: x[sort_field], reverse=(sort_order == "desc"))
#     return sorted_docs

# # Check if docstest is not None
# if docstest is not None:
#     # Sort the documents using the sort_documents function
#     sorted_docs = sort_documents(docstest, sort_field="post_date", sort_order="desc")

#     # Print the sorted documents
#     print('Top 10 Documents: ')
#     for i, doc in enumerate(sorted_docs[:10]):
#         position = i + 1
#         print(f"{position}) {doc}")
# else:
#     print("No documents found.")

 Lmao Qatar to win is 0.0%


In [6]:
# Text of the first comment attached to the second-ranked result.
print(docstest[1]['first']['body'])

Playing the old 5-5-5-5-6 formation


In [7]:
print(docstest[1]['first']['replies']) # replies to comments are left out of document scoring for simplicity's sake

['😂😂😂😂', "I know it's a joke, but surely they'll be playing 4-3-3, right?"]
