In [1]:
import json
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import datetime
import os
import subprocess
import nltk
import spacy
#nltk.download('stopwords')

from os.path import exists

import random
from resiliparse.parse.html import HTMLTree
from resiliparse.parse.encoding import detect_encoding
from resiliparse.extract.html2text import extract_plain_text
from get_training_query_term_recall import calculate_doc_term_recall
from passage_extraction_util import passage_calculate_doc_term_recall

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
# Load spaCy's small English pipeline once; `nlp` is used further down to split
# question texts into sentences.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Keep only the submissions that carry at least one link on the answer path and
# write them, one JSON object per line, to a gzip-compressed JSONL file.
# Both files are opened as UTF-8 explicitly so the cell does not depend on the
# platform's locale encoding (assumes the input dump is UTF-8 — confirm).
with open('./reddit-tomt-submissions.jsonl', encoding='utf-8') as f, \
        gzip.open('./reddit-tomt-submissions-with-links.jsonl.gz', 'wt', encoding='utf-8') as output_file:
    for l in tqdm(f):
        l = json.loads(l)
        # `.get` covers both a missing key and an empty list of links.
        if l.get('links_on_answer_path'):
            output_file.write(json.dumps(l) + '\n')


1279425it [04:47, 4454.40it/s]


In [4]:
# Read the filtered submissions back into memory, dropping the 'comments'
# field of every record before it is stored in `entries`.
entries = []
with gzip.open('./reddit-tomt-submissions-with-links.jsonl.gz') as f:
    for raw_line in tqdm(f):
        record = json.loads(raw_line)
        record.pop('comments')
        entries.append(record)


326458it [00:57, 5700.86it/s]


In [5]:
def clean_urls(urls):
    """Strip markdown-link residue from every URL in ``urls``.

    For each string, everything from the first ``)`` onwards is dropped and,
    of what remains, only the part after the last ``(`` is kept. A plain URL
    without parentheses is returned unchanged.

    Args:
        urls: iterable of strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    return [url.split(')')[0].split('(')[-1] for url in urls]
        
# Inline checks for clean_urls: a plain URL is unchanged, a trailing ')' is
# removed, and the URL is pulled out of a markdown link with surrounding text.
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'])
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/)'])
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['[text](https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/)nochmehrtext'])

In [6]:
def get_htmls(entry):
    """Map each cleaned answer-path link of ``entry`` to its local HTML file.

    A link ``scheme://rest`` is looked up at ``websites/rest``. If that path is
    a regular file it is used directly; otherwise ``websites/rest/index.html``
    is tried. Links with nothing stored under ``websites/`` are skipped.

    Args:
        entry: dict with a ``'links_on_answer_path'`` list of (possibly
            markdown-wrapped) URLs.

    Returns:
        dict mapping the cleaned link to the path of its HTML file.

    Raises:
        ValueError: if the path for a link exists but neither it nor its
            ``index.html`` is a regular file.
    """
    ret = dict()
    for link in clean_urls(entry['links_on_answer_path']):
        html_file = f"websites/{link.split('://')[-1]}"
        if not exists(html_file):
            continue

        for suffix in ['', '/index.html']:
            if os.path.isfile(html_file + suffix):
                ret[link] = html_file + suffix
                break
        else:
            # Path exists but holds no usable HTML file; say which one so the
            # download directory can be inspected.
            raise ValueError(
                f"'{html_file}' exists for link '{link}' but is neither a file "
                "nor a directory containing index.html"
            )

    return ret

# Smoke tests for get_htmls; they only pass when the referenced pages are
# already stored under ./websites (one as a plain file, one as .../index.html).
assert get_htmls({'links_on_answer_path': ['http://www.louissachar.com/Wayside.htm']}) == {'http://www.louissachar.com/Wayside.htm': 'websites/www.louissachar.com/Wayside.htm'}
assert get_htmls({'links_on_answer_path': ['http://www.youtube.com/watch?v=NU75uz0b8EU']}) == {'http://www.youtube.com/watch?v=NU75uz0b8EU': 'websites/www.youtube.com/watch?v=NU75uz0b8EU/index.html'}
# FIXME: deliberately failing reminder of an unresolved parsing problem; it
# always raises AssertionError, so "Run All" stops at this cell.
assert False, 'ToDo: I remember that I saw that the parsing above does not work for xy.'

AssertionError: 

In [7]:
def download_page(reddit_entry):
    """Download archived copies of an entry's answer links into ``./websites``.

    Nothing is done when ``get_htmls`` already finds at least one local page
    for the entry. Otherwise every cleaned link is passed to the
    ``hartator/wayback-machine-downloader`` Docker image, restricted to the
    window of 365 days before and after the entry's ``solved_utc`` timestamp.

    Args:
        reddit_entry: dict with ``'links_on_answer_path'`` (list of URLs) and
            ``'solved_utc'`` (Unix timestamp, anything ``int()`` accepts).

    Raises:
        subprocess.CalledProcessError: if a docker invocation exits non-zero.
        ValueError: if ``solved_utc`` cannot be converted to ``int``.
    """
    if get_htmls(reddit_entry):
        return

    current_directory = os.getcwd()
    links = clean_urls(reddit_entry['links_on_answer_path'])

    # Timezone-aware replacement for the deprecated utcfromtimestamp(); the
    # formatted strings are identical.
    solved_at = datetime.datetime.fromtimestamp(
        int(reddit_entry['solved_utc']), tz=datetime.timezone.utc
    )
    one_year = datetime.timedelta(days=365)
    time_start = (solved_at - one_year).strftime("%Y%m%d%H%M%S")
    time_end = (solved_at + one_year).strftime("%Y%m%d%H%M%S")

    for link in links:
        command = [
            "docker",
            "run",
            # Remove the container once it exits; otherwise every link leaves
            # a stopped container behind on the host.
            "--rm",
            "-v",
            f"{current_directory}/websites:/websites",
            "hartator/wayback-machine-downloader",
            "--exact-url",
            "--from",
            time_start,
            "--to",
            time_end,
            "--maximum-snapshot",
            "2",
            link,
        ]
        subprocess.run(command, check=True)


In [8]:
def extract_text(html_file, return_type):
    """Extract text from a locally stored HTML file.

    Args:
        html_file: path of the HTML file to parse.
        return_type: ``'title'`` returns the text of the ``<head>`` element;
            ``'main'`` returns the ``<head>`` text, a space, and the plain
            text of the main content; any other value returns the text of
            the ``<body>`` element.

    Returns:
        The extracted text as a string.

    Raises:
        FileNotFoundError: if ``html_file`` does not exist.
    """
    # Read the bytes once and close the handle; the same buffer feeds both the
    # encoding detection and the parser.
    with open(html_file, 'rb') as f:
        raw_html = f.read()

    tree = HTMLTree.parse_from_bytes(raw_html, detect_encoding(raw_html))
    if return_type == 'title':
        return tree.head.text
    elif return_type == 'main':
        return tree.head.text + ' ' + extract_plain_text(tree, main_content=True)
    return tree.body.text

# Smoke tests for extract_text; they need the page to be present under
# ./websites and raise FileNotFoundError otherwise.
assert extract_text('websites/www.louissachar.com/Wayside.htm', 'title') == '\nLouis Sachar —  Wayside School Book Series\n\n'
assert extract_text('websites/www.louissachar.com/Wayside.htm', 'main').startswith("\nLouis Sachar —  Wayside School Book Series\n\n \n\nTo purchase any of Louis's books visit your local bookseller or make your selection online:\n\nSideways Stories from\nWayside School\n\n\xa0\n\nWayside School\nis Falling Down\n\n\xa0\n\nWayside School\n")

FileNotFoundError: [Errno 2] No such file or directory: 'websites/www.louissachar.com/Wayside.htm'

In [9]:
def extract_question(entry):
    """Build the question text of a submission.

    The title is cut down to whatever follows its last ``]`` (the whole title
    when there is none) and joined to the submission's content with ``'. '``.

    Args:
        entry: dict with string values under ``'title'`` and ``'content'``.

    Returns:
        The combined question string.
    """
    title_after_tags = entry['title'].rpartition(']')[2]
    return ''.join([title_after_tags, '. ', entry['content']])

In [10]:
class text_normalization():
    """Settings object handed to ``calculate_doc_term_recall``.

    Both switches report ``True``. The methods are static so that they can be
    called on the class as well as on an instance; without the decorator an
    instance call fails with ``TypeError`` because no ``self`` is declared.
    NOTE(review): how ``calculate_doc_term_recall`` reads ``stem``/``stop`` is
    not visible here — confirm.
    """

    @staticmethod
    def stem():
        """Return True (stemming switch)."""
        return True

    @staticmethod
    def stop():
        """Return True (stop-word switch)."""
        return True

def construct_instance_for_model_training(entry, text_type):
    """Build term-recall training instances for one submission.

    The question text of ``entry`` becomes the document title, and the text
    extracted from every locally available answer page becomes the list of
    queries. The JSON-encoded record is passed to
    ``calculate_doc_term_recall`` together with a ``text_normalization``
    object.

    Args:
        entry: submission dict (see ``extract_question`` and ``get_htmls``).
        text_type: forwarded to ``extract_text`` as its ``return_type``.

    Returns:
        List of whatever ``calculate_doc_term_recall`` yields.
    """
    question = extract_question(entry)
    target_texts = [
        extract_text(html_path, text_type)
        for html_path in get_htmls(entry).values()
    ]
    payload = json.dumps({"queries": target_texts, "doc": {"title": question}})
    return list(calculate_doc_term_recall([payload], text_normalization()))

In [11]:
# Load the precomputed training records: one JSON document per line.
data = []
with open('training-data-main-06-07-2023.jsonl') as f:
    data.extend(json.loads(line) for line in f)
        
        

In [21]:
def sentence_scores(entry):
    """Score every sentence of an entry's question by its term-recall weights.

    The first instance under ``entry['training_samples']['instances']``
    supplies the question text (``['doc']['title']``) and a mapping from term
    to weight (``['term_recall']``). The text is split into sentences with the
    global spaCy pipeline ``nlp``. A sentence's score is the sum of the
    weights of all terms occurring in it as a substring, divided by its number
    of whitespace-separated words and rounded to three decimals.

    Each sentence is scored from scratch, so a sentence that appears more than
    once in the text gets the same score as a single occurrence instead of
    piling a second sum on top of an already normalised value.

    Args:
        entry: training record with the structure described above.

    Returns:
        dict mapping each distinct stripped sentence to its score, in order of
        first appearance.
    """
    instance = entry['training_samples']['instances'][0]
    word_scores = instance['term_recall']
    text = instance['doc']['title']

    sentences = [sent.text.strip() for sent in nlp(text).sents]

    scores = {}
    for sentence in sentences:
        num_words = len(sentence.split())

        # Substring test, not token equality: a term also counts when it is
        # only part of a longer word in the sentence.
        score = sum(word_scores[word] for word in word_scores if word in sentence)

        # Normalizing the score based on sentence length
        if num_words > 0:
            score = round(score / num_words, 3)

        scores[sentence] = score

    return scores

In [13]:
def extract_top_k_sentences(entry, k):
    """Return the ``k`` highest-scoring sentences of ``entry``.

    Sentences are ranked by their ``sentence_scores`` value, best first; ties
    keep their original order.

    Args:
        entry: training record accepted by ``sentence_scores``.
        k: number of sentences to keep.

    Returns:
        List of sentence strings (without their scores).
    """
    scores = sentence_scores(entry)
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:k]
    

In [48]:
def extract_top_p_percent_sentences(entry, p):
    """Return the best-scoring ``p`` percent of an entry's sentences.

    The number of sentences kept is ``int(len(scores) * p / 100)``, i.e. it is
    rounded down.

    Args:
        entry: training record accepted by ``sentence_scores``.
        p: percentage of sentences to keep.

    Returns:
        dict mapping each kept sentence to its score, best first.
    """
    scores = sentence_scores(entry)
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    cut = int(len(scores) * p/100)
    return dict(ranked[:cut])


In [61]:
def extract_top_bottom_x_sentences(entry, x):
    """Return the ``x`` best and the ``x`` worst scoring sentences of ``entry``.

    Args:
        entry: training record accepted by ``sentence_scores``.
        x: number of sentences to take from each end of the ranking.

    Returns:
        dict mapping each selected sentence to its score: the top ``x`` first
        (best first), followed by the bottom ``x``. A sentence selected from
        both ends appears once. Empty when ``x`` is not positive.
    """
    scores = sentence_scores(entry)

    # Guard the slice below: for x == 0, `ranked[-0:]` is `ranked[0:]`, which
    # would return every sentence instead of none.
    if x <= 0:
        return {}

    ranked = sorted(scores, key=lambda i: scores[i], reverse=True)
    selected = ranked[:x] + ranked[-x:]
    return {sentence: scores[sentence] for sentence in selected}

In [22]:
sentence_scores(data[60])

{'Guy from a tribe gets separated from his family and gets chased by a pack of wolves.': 0.294,
 "Im trying to remember everything as detailed as possible and here's the plot\n\nThere was this movie i forgot where there was this guy that got chased by a pack of wolves and he climbs up a tree.": 0.184,
 'One wolf clings on his foot injuring him so hi grabs his spear to stab one of the wolves.': 0.316,
 'The wolves then scamper away except for the injured one.': 0.4,
 'Days pass and he goes down the tree and thinks of killing the injured wolf': 0.467,
 'but he wasnt able to kill it.': 0.143,
 'Instead he brought it to a cave where the guy tends for him and the wolf which eventually ended up taming the wolf.': 0.217,
 'They both got healed and eventually left the cave.': 0.444,
 'The guy tries to drive away the wolf but eventually assists one another.': 0.385,
 "They eventually find this hut with a dead guy and the protagonist gets the dead guy's bow from him and they proceed to wander.":

In [15]:
extract_top_k_sentences(data[7], 3)

['American TV show telling you how to survive disaster situations.',
 'It only aired a few episodes in my country (NZ) a few years ago but I found it super interesting.',
 'Each episode would feature a small case situation and a large scale one, eg.']

In [62]:
extract_top_p_percent_sentences(data[7],50)

{'Show about how to survive disaster situations.': 0.429,
 'American TV show telling you how to survive disaster situations.': 0.4,
 'Hour long episodes I think.': 0.4,
 'Each episode would feature a small case situation and a large scale one, eg.': 0.286}

In [50]:
extract_top_p_percent_sentences(data[7],25)

{'Show about how to survive disaster situations.': 0.429,
 'American TV show telling you how to survive disaster situations.': 0.4}

In [60]:
extract_top_bottom_x_sentences(data[7],2)

['Show about how to survive disaster situations.', 'American TV show telling you how to survive disaster situations.', 'There was an episode about a Pandemic, another about a home invasion.', 'Hosted by a guy.']


{'Show about how to survive disaster situations.': 0.429,
 'American TV show telling you how to survive disaster situations.': 0.4,
 'There was an episode about a Pandemic, another about a home invasion.': 0.083,
 'Hosted by a guy.': 0.0}

In [None]:
construct_instance_for_model_training(entries[5], 'title')

In [None]:
# Hand-written example record: a one-element list whose item has the same
# 'doc' -> 'title' and 'term_recall' keys that sentence_scores reads from a
# training instance.
instance = [{'doc': {'id': 1,
   'position': 1,
   'title': " A woman birthing monsters/aliens. I was re-watching Prometheus and during the c-section scene in the giant med-machine I had a little flashback to a movie or series (not sure) I saw as a child of a human woman giving birth to either monsters, mutants or aliens. It's not much to go on but the only thing i remember are a long medical table,  an orange light being cast on the table and a woman popping out what at the time I believed were aliens? It must have been an 80's early 90' production. \nThanks in advance :)\n\nEdit:  Everything had a dark atmosphere, like it was happening at night or in a dark room."},
  'term_recall': {'birth': 1.0,
   'birthing': 1.0,
   'c': 1.0,
   'scene': 1.0,
   'time': 1.0}}]

In [None]:
# Pull the question text and its scored terms out of the example record.
text = instance[0]['doc']['title']

# Deliberately not called `list`: rebinding the builtin in the kernel namespace
# makes every later `list(...)` call in this notebook fail with
# "TypeError: 'dict_keys' object is not callable".
terms = instance[0]['term_recall'].keys()

In [None]:
# Fetch archived pages for every loaded submission. download_page returns
# immediately for entries that already have a local page, and raises if a
# docker call fails, which stops the loop.
for entry in tqdm(entries):
    download_page(entry)

In [None]:
# Draw a random sample of 100000 lines from the full dump. `shuf` is not
# seeded here, so every run produces a different sample file.
!cat ./reddit-tomt-submissions.jsonl | shuf | head -100000 > json-tomt-sample.jsonl

In [None]:
!cat json-tomt-sample.jsonl | wc -l

In [None]:
df = pd.read_json("./json-tomt-sample.jsonl", lines=True)

In [None]:
# Category = content of the last bracketed tag in the title,
# e.g. "[TOMT][MOVIE] ..." -> "MOVIE".
# Computed inline from the title column so this cell no longer needs
# `extract_category`, which is only defined in a later cell and is therefore
# undefined here on a fresh kernel.
df['category'] = df['title'].map(lambda title: title.split('[')[-1].split(']')[0])

In [None]:
df.head(2)

In [None]:
df.iloc[0].to_dict()

In [None]:
def extract_category(tomt):
    """Return the category tag of a submission row.

    The category is the text between the last ``[`` of the title and the
    first ``]`` after it. A title without brackets is returned whole.

    Args:
        tomt: row object offering ``to_dict()`` with a ``'title'`` string
            (e.g. a pandas Series).

    Returns:
        The category string.
    """
    title = tomt.to_dict()['title']
    after_last_open = title.rpartition('[')[2]
    return after_last_open.partition(']')[0]

In [None]:
df['category'].value_counts().head(20)

In [None]:
df[df['category'].str.contains('site')].category.value_counts()

In [None]:
df[df['category'].str.contains('page')].category.value_counts()

In [None]:
sns.set_theme(style="ticks")

# Box plot of upvotes; the figure's y axis is switched to a logarithmic scale.
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.ups)
# Observation: zero, or ten and more, upvotes are outliers, hence we take questions with 1-9 upvotes.

In [None]:
# Box plot of downvotes; the figure's y axis is switched to a logarithmic scale.
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.downs)
# Observation: questions with one downvote are already outliers, hence we only take questions with zero downvotes.

In [None]:
# Box plot of comment counts; the figure's y axis is switched to a logarithmic scale.
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.num_comments)
# Observation: questions with zero, or 13 and more, comments are outliers, hence we only take questions with 1-12 comments.

In [None]:
df.num_comments.describe()

In [None]:
df.num_comments.quantile(q=0.95)

In [None]:
# Select topics.

# Fixed seed so that re-running the notebook draws the same samples.
SAMPLE_SEED = 42

# Define conditions: solved, 1-9 upvotes, no downvotes, 1-12 comments.
c1 = df["solved_utc"] != ""
c2 = (df["ups"]>0) & (df["ups"]<10)
c3 = df["downs"]==0
c4 = (df["num_comments"] > 0) & (df["num_comments"]<13)

# Apply conditions to dataframe.
topics = df[c1 & c2 & c3 & c4]

# Select samples by category (250 rows each; `sample` raises ValueError when a
# category has fewer rows than that).
books = topics[topics["category"].isin(["BOOK", "Book"])].sample(250, random_state=SAMPLE_SEED)
movies = topics[topics["category"].isin(["MOVIE", "Movie", "movie"])].sample(250, random_state=SAMPLE_SEED)
websites = topics[topics["category"].isin(["Website", "website"])] # fewer than 250 elements, so all are taken
songs = topics[topics["category"].isin(["SONG", "Song", "song", "MUSIC", "Music"])].sample(250, random_state=SAMPLE_SEED)

In [None]:
# Write every per-category selection as JSON Lines (one record per line).
for frame, path in (
    (books, "./books.json"),
    (movies, "./movies.json"),
    (websites, "./websites.json"),
    (songs, "./songs.json"),
):
    frame.to_json(path, orient="records", lines=True)

In [None]:
# Row/column counts of each selection.
# NOTE(review): the shapes in the trailing comments list more than 250 rows
# for books, movies and songs, although those frames are sampled down to 250
# above — presumably they were recorded before sampling; confirm or update.
print(books.shape) #(272, 127)
print(movies.shape) #(1029, 127)
print(websites.shape) #(110, 127)
print(songs.shape) #(1385, 127)