In [1]:
import json
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import datetime
import os
import subprocess
import nltk
import spacy
#nltk.download('stopwords')

from os.path import exists

import random
from resiliparse.parse.html import HTMLTree
from resiliparse.parse.encoding import detect_encoding
from resiliparse.extract.html2text import extract_plain_text
from get_training_query_term_recall import calculate_doc_term_recall
from passage_extraction_util import passage_calculate_doc_term_recall

nlp = spacy.load("en_core_web_sm")

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
c = 0

# Copy every submission that has a non-empty 'links_on_answer_path' list into
# a gzip-compressed JSON-lines file; all other submissions are dropped.
with open('./reddit-tomt-submissions.jsonl') as f:
    with gzip.open('./reddit-tomt-submissions-with-links.jsonl.gz', 'wt') as output_file:
        for l in tqdm(f):
            l = json.loads(l)
            if 'links_on_answer_path' not in l:
                continue
            if len(l['links_on_answer_path']) > 0:
                output_file.write(json.dumps(l) + '\n')


1279425it [04:47, 4454.40it/s]


In [4]:
# Load the link-bearing submissions into `entries`; the 'comments' field is
# removed from every record before it is kept in memory.
with gzip.open('./reddit-tomt-submissions-with-links.jsonl.gz') as f:
    entries = []
    for l in tqdm(f):
        l = json.loads(l)
        l.pop('comments')
        entries += [l]


326458it [00:57, 5700.86it/s]


In [5]:
def clean_urls(urls):
    """Strip Markdown link syntax and stray parentheses from raw link strings.

    For every input string the cleaned URL is determined as follows:

    1. If the string contains ``](`` (a Markdown link), only the text after
       the first ``](`` is considered, i.e. the link target.
    2. Everything in front of the first ``scheme://`` is discarded.
    3. The URL ends at the first closing parenthesis that has no matching
       opening parenthesis inside the URL. Balanced pairs such as in
       ``.../Mercury_(planet)`` are therefore kept.

    Strings without any ``scheme://`` fall back to the simple heuristic of
    cutting at the first ``)`` and keeping what follows the last ``(``.

    Args:
        urls: iterable of raw link strings.

    Returns:
        A list with one cleaned string per input, in input order.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    scheme = re.compile(r'[A-Za-z][A-Za-z0-9+.-]*://')

    def _clean(raw):
        # Prefer the target of a Markdown link "[text](target)".
        _, marker, target = raw.partition('](')
        candidate = target if marker else raw

        match = scheme.search(candidate)
        if match is None:
            # No recognisable URL: keep the simple split heuristic.
            return raw.split(')')[0].split('(')[-1]

        url = candidate[match.start():]
        depth = 0
        for position, char in enumerate(url):
            if char == '(':
                depth += 1
            elif char == ')':
                if depth == 0:
                    # Unmatched ")" closes the Markdown link / surrounding text.
                    return url[:position]
                depth -= 1
        return url

    return [_clean(url) for url in urls]
        
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'])
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/)'])
assert ['https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/'] == clean_urls(['[text](https://www.reddit.com/r/woahdude/comments/66wqhv/true_cyan_color_illusion/)nochmehrtext'])

In [6]:
def get_htmls(entry):
    """Map each cleaned answer link of ``entry`` to its downloaded HTML file.

    The expected location of a link is ``websites/<link without scheme>``.
    Links without anything at that location are left out of the result. If the
    location exists, the file itself or its ``/index.html`` is used, whichever
    is a regular file first.

    Raises:
        ValueError: the location exists but neither candidate is a file.
    """
    found = {}
    for link in clean_urls(entry['links_on_answer_path']):
        base_path = f"websites/{link.split('://')[-1]}"
        if not exists(base_path):
            continue

        for candidate in (base_path, base_path + '/index.html'):
            if os.path.isfile(candidate):
                found[link] = candidate
                break
        else:
            raise ValueError('todo')

    return found

assert get_htmls({'links_on_answer_path': ['http://www.louissachar.com/Wayside.htm']}) == {'http://www.louissachar.com/Wayside.htm': 'websites/www.louissachar.com/Wayside.htm'}
assert get_htmls({'links_on_answer_path': ['http://www.youtube.com/watch?v=NU75uz0b8EU']}) == {'http://www.youtube.com/watch?v=NU75uz0b8EU': 'websites/www.youtube.com/watch?v=NU75uz0b8EU/index.html'}
assert False, 'ToDo: I remember that I saw that the parsing above does not work for xy.'

AssertionError: 

In [7]:
def download_page(reddit_entry):
    """Download archived copies of the entry's answer links via Docker.

    Nothing is downloaded when ``get_htmls`` already finds at least one local
    file for the entry. Otherwise the ``hartator/wayback-machine-downloader``
    image is run once per cleaned link with ``--exact-url``, a ``--from`` /
    ``--to`` window of 365 days before and after ``solved_utc`` and
    ``--maximum-snapshot 2``. The ``websites`` directory below the current
    working directory is mounted as ``/websites`` in the container.

    Args:
        reddit_entry: mapping with ``'links_on_answer_path'`` and
            ``'solved_utc'`` (a value convertible with ``int``, interpreted as
            a POSIX timestamp).

    Raises:
        subprocess.CalledProcessError: a docker run exited with a non-zero
            status (``check=True``); remaining links are then not processed.
    """
    if get_htmls(reddit_entry):
        return

    links = clean_urls(reddit_entry['links_on_answer_path'])

    # Aware UTC datetime; `datetime.utcfromtimestamp` is deprecated since
    # Python 3.12. The strftime output below is unchanged.
    solved_at = datetime.datetime.fromtimestamp(
        int(reddit_entry['solved_utc']), tz=datetime.timezone.utc
    )
    one_year = datetime.timedelta(days=365)
    time_start = (solved_at - one_year).strftime("%Y%m%d%H%M%S")
    time_end = (solved_at + one_year).strftime("%Y%m%d%H%M%S")

    # Everything except the link itself is identical for all links.
    command_prefix = [
        "docker",
        "run",
        "-v",
        f"{os.getcwd()}/websites:/websites",
        "hartator/wayback-machine-downloader",
        "--exact-url",
        "--from",
        time_start,
        "--to",
        time_end,
        "--maximum-snapshot",
        "2",
    ]
    for link in links:
        subprocess.run(command_prefix + [link], check=True)


In [8]:
def extract_text(html_file, return_type):
    """Extract text from a downloaded HTML file.

    Args:
        html_file: path of the HTML file.
        return_type: ``'title'`` returns the text of the document's ``<head>``
            element; ``'main'`` returns that head text, a space and the plain
            text extracted with ``main_content=True``; any other value returns
            the text of the ``<body>`` element.

    Returns:
        The extracted text as a string.
    """
    # Read the file once and close the handle deterministically. Previously
    # the file was opened and read twice and neither handle was closed.
    with open(html_file, 'rb') as html_handle:
        raw_html = html_handle.read()

    tree = HTMLTree.parse_from_bytes(raw_html, detect_encoding(raw_html))
    if return_type == 'title':
        return tree.head.text
    if return_type == 'main':
        return tree.head.text + ' ' + extract_plain_text(tree, main_content=True)
    return tree.body.text

assert extract_text('websites/www.louissachar.com/Wayside.htm', 'title') == '\nLouis Sachar —  Wayside School Book Series\n\n'
assert extract_text('websites/www.louissachar.com/Wayside.htm', 'main').startswith("\nLouis Sachar —  Wayside School Book Series\n\n \n\nTo purchase any of Louis's books visit your local bookseller or make your selection online:\n\nSideways Stories from\nWayside School\n\n\xa0\n\nWayside School\nis Falling Down\n\n\xa0\n\nWayside School\n")

FileNotFoundError: [Errno 2] No such file or directory: 'websites/www.louissachar.com/Wayside.htm'

In [9]:
def extract_question(entry):
    """Build the question text of a submission.

    The part of ``entry['title']`` after its last ``]`` (the whole title if it
    contains none) is joined with ``entry['content']`` by ``'. '``.
    """
    title_tail = entry['title'].rpartition(']')[2]
    return title_tail + '. ' + entry['content']

In [10]:
class text_normalization():
    """Settings object passed to ``calculate_doc_term_recall``.

    Both switches always report ``True``.
    NOTE(review): presumably the consumer reads ``stem`` / ``stop`` as the
    stemming and stop-word-removal options; confirm against
    ``get_training_query_term_recall``.

    The methods are static so they work on the class and on instances. Without
    a ``self`` parameter, ``text_normalization().stem()`` raised ``TypeError``.
    """

    @staticmethod
    def stem():
        """Return ``True``."""
        return True

    @staticmethod
    def stop():
        """Return ``True``."""
        return True

def construct_instance_for_model_training(entry, text_type):
    """Compute term-recall training instances for one submission.

    The submission's question becomes the document title, and the texts
    extracted (as ``text_type``) from all locally available answer pages become
    the queries. The pair is serialised to JSON and handed to
    ``calculate_doc_term_recall``; its results are returned as a list.
    """
    question = extract_question(entry)
    target_texts = [
        extract_text(html_path, text_type)
        for html_path in get_htmls(entry).values()
    ]
    payload = json.dumps({"queries": target_texts, "doc":{"title": question}})
    return list(calculate_doc_term_recall([payload], text_normalization()))

---

In [2]:
# Read the exported training data: one JSON document per line.
data = []
with open('training-data-main-06-07-2023.jsonl') as f:
    for l in f:
        data.append(json.loads(l))

In [3]:
def sentence_scores(entry):
    """Score every sentence of a question by the term recall it contains.

    Only the first training instance of ``entry`` is used. Its title text is
    split into sentences with the spaCy pipeline ``nlp``. For each sentence the
    recall values of all terms that occur in it as a substring are summed; for
    sentences with at least one whitespace-separated word the sum is divided by
    the word count and rounded to three decimals.

    Returns:
        dict mapping the stripped sentence text to its score.
    """
    first_instance = entry['training_samples']['instances'][0]
    term_weights = first_instance['term_recall']
    question_text = first_instance['doc']['title']

    sentences = [span.text.strip() for span in nlp(question_text).sents]
    scores = dict.fromkeys(sentences, 0)

    for sentence in sentences:
        # Substring test: a term also counts when it is part of a longer word.
        for term, weight in term_weights.items():
            if term in sentence:
                scores[sentence] += weight

        # Length normalisation by number of whitespace-separated words.
        word_count = len(sentence.split())
        if word_count > 0:
            scores[sentence] = round(scores[sentence] / word_count, 3)

    return scores

In [4]:
def extract_top_k_sentences(entry, k):
    """Return the ``k`` best-scoring sentences of ``entry``, best first."""
    scored = sentence_scores(entry)
    ranked = sorted(scored, key=scored.get, reverse=True)
    return ranked[:k]
    

In [5]:
def extract_top_p_percent_sentences(scores, p):
    """Keep the best-scoring ``p`` percent of the sentences.

    Args:
        scores: dict mapping sentence to score.
        p: percentage of sentences to keep; the resulting count is truncated
            to an integer.

    Returns:
        dict of the kept sentences and their scores, highest score first.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    cut = int(len(scores) * p/100)
    return dict(ranked[:cut])


In [54]:
def extract_top_bottom_x_sentences(scores, x, min_positive_length=15):
    """Label the ``x`` best sentences with 1 and the ``x`` worst with 0.

    Sentences that fall into both groups (possible when fewer than ``2 * x``
    sentences exist) are left out. Top sentences are only kept when they are
    longer than ``min_positive_length`` characters; bottom sentences are kept
    regardless of length.

    Args:
        scores: dict mapping sentence to score.
        x: number of sentences taken from each end of the ranking. Values
            ``<= 0`` yield an empty result. Previously ``x == 0`` selected
            every sentence as "bottom" because ``seq[-0:]`` is the whole list.
        min_positive_length: length a top sentence has to exceed
            (default 15, the previously hard-coded value).

    Returns:
        dict mapping sentence to label (1 = top, 0 = bottom); top sentences
        come first, each group ordered by descending score.
    """
    if x <= 0:
        return {}

    ranked = sorted(scores, key=lambda sentence: scores[sentence], reverse=True)
    top_sentences = ranked[:x]
    bottom_sentences = ranked[-x:]
    # Sets give constant-time membership tests.
    top_lookup = set(top_sentences)
    bottom_lookup = set(bottom_sentences)

    labels = {}
    for sentence in top_sentences + bottom_sentences:
        is_top = sentence in top_lookup
        if is_top and sentence in bottom_lookup:
            continue  # ambiguous: member of both groups
        if is_top and len(sentence) <= min_positive_length:
            continue  # too short to serve as a positive example
        labels[sentence] = 1 if is_top else 0
    return labels

In [None]:
scores = [sentence_scores(entry) for entry in tqdm(data)]

In [24]:
def to_training_format(func, scored_sentences):
    """Build one text-to-text training example.

    ``input_text`` is all sentences of ``scored_sentences`` joined by spaces;
    ``target_text`` is the sentences kept by ``func(scored_sentences)``, joined
    the same way.
    """
    return {
        'input_text': ' '.join(scored_sentences.keys()),
        'target_text': ' '.join(func(scored_sentences).keys()),
    }

In [55]:
def to_classification_format(func, scored_sentences):
    """Turn the sentence-to-label mapping returned by ``func`` into records.

    Returns:
        A list with one ``{'text': sentence, 'labels': label}`` dict per item
        of ``func(scored_sentences)``, in that mapping's order.
    """
    return [
        {'text' : sentence, 'labels' : label}
        for sentence, label in func(scored_sentences).items()
    ]

In [20]:
# Reload the sentence scores from 'scores.json'. The file is treated as a
# sequence of JSON objects, each ending with "}" followed by a newline, so the
# content is split on that terminator and the brace is re-attached.
scores = []
with open('scores.json') as f:
    for l in f.read().split('}\n'):
        entry = l+'}'
        try:
            scores += [json.loads(entry)]
        except json.JSONDecodeError:
            # Chunks that are not valid JSON (e.g. the remainder after the
            # last terminator) are skipped. Only decoding errors are ignored;
            # any other exception, including KeyboardInterrupt, propagates.
            pass

In [21]:
scores

[{'Book I read in 3rd grade.': 0.167,
  'There was a book I read in third grade...': 0.222,
  'here I what I remember:\n\nThe book was about a class of students who went to a school with like thirty floors...': 0.182,
  'Except one floor was missing.': 0.4,
  'That is basically all I remember.': 0.0,
  'Except, each chapter focused around a different student/teacher.': 0.125},
 {'where one of the strips starts as a casual looking for an iron joke but ends in a transdimensional epic.': 0.2,
  "I don't remember any other strips of this webcomic, I must have seen it a year or two ago.": 0.105,
  'The art was very well detailed.': 0.333,
  'The very first panels were about two guys preparing to go to a wedding but needing an iron that was in a closet but the closet was a portal to a dark dimension, there are some kidnapping of the bride and twists in the end.': 0.022},
 {'Name of song in this video.': 0.5,
  'https://www.youtube.com/watch?v=HpSs2Zt5Fz4': 6.0},
 {"1990's Educational Cartoon

In [23]:
long_scores = [entry for entry in scores if len(entry) > 3]
long_scores[0:3]

[{'Book I read in 3rd grade.': 0.167,
  'There was a book I read in third grade...': 0.222,
  'here I what I remember:\n\nThe book was about a class of students who went to a school with like thirty floors...': 0.182,
  'Except one floor was missing.': 0.4,
  'That is basically all I remember.': 0.0,
  'Except, each chapter focused around a different student/teacher.': 0.125},
 {'where one of the strips starts as a casual looking for an iron joke but ends in a transdimensional epic.': 0.2,
  "I don't remember any other strips of this webcomic, I must have seen it a year or two ago.": 0.105,
  'The art was very well detailed.': 0.333,
  'The very first panels were about two guys preparing to go to a wedding but needing an iron that was in a closet but the closet was a portal to a dark dimension, there are some kidnapping of the bride and twists in the end.': 0.022},
 {"1990's Educational Cartoon for kids to learn French.": 0.0,
  'Hi all,\n\nWhen I was really young, 3-5, I remember watc

In [27]:
to_training_format(lambda i: extract_top_p_percent_sentences(i, 50),long_scores[1])

{'input_text': "where one of the strips starts as a casual looking for an iron joke but ends in a transdimensional epic. I don't remember any other strips of this webcomic, I must have seen it a year or two ago. The art was very well detailed. The very first panels were about two guys preparing to go to a wedding but needing an iron that was in a closet but the closet was a portal to a dark dimension, there are some kidnapping of the bride and twists in the end.",
 'target_text': 'The art was very well detailed. where one of the strips starts as a casual looking for an iron joke but ends in a transdimensional epic.'}

In [56]:
to_classification_format(lambda i: extract_top_bottom_x_sentences(i, 2),long_scores[1])

[{'text': 'The art was very well detailed.', 'labels': 1},
 {'text': 'where one of the strips starts as a casual looking for an iron joke but ends in a transdimensional epic.',
  'labels': 1},
 {'text': "I don't remember any other strips of this webcomic, I must have seen it a year or two ago.",
  'labels': 0},
 {'text': 'The very first panels were about two guys preparing to go to a wedding but needing an iron that was in a closet but the closet was a portal to a dark dimension, there are some kidnapping of the bride and twists in the end.',
  'labels': 0}]

In [29]:
len(long_scores)

74769

In [43]:
# Write one text-to-text training example per line: the input is the full
# question, the target its best-scoring 50% of sentences.
# NOTE(review): another cell of this notebook also opens './top_50.json' in
# 'w' mode and writes a different format (indented score dicts); whichever
# cell runs last determines the file content.
with open('./top_50.json', 'w') as f:
    for entry in long_scores:
        entry = to_training_format(lambda i: extract_top_p_percent_sentences(i, 50),entry)
        f.write(json.dumps(entry)+'\n')

In [57]:
# Write one classification record ({'text': ..., 'labels': ...}) per line,
# built from the two best and two worst sentences of every question.
with open('./best_worst_2.json', 'w') as f:
    for entry in long_scores:
        entry = to_classification_format(lambda i: extract_top_bottom_x_sentences(i, 2),entry)
        for i in entry:
            f.write(json.dumps(i)+'\n')

In [16]:
# Keep the best-scoring 50% of the sentences of every entry
# (dict: sentence -> score).
top_50 = [extract_top_p_percent_sentences(score, 50) for score in long_scores]

# Label the 3 best (1) and 3 worst (0) sentences of every entry
# (dict: sentence -> label).
best_worst_3 = [extract_top_bottom_x_sentences(score, 3) for score in long_scores]

In [17]:
# Dump each selection as an indented JSON object followed by a newline, so the
# files contain multi-line objects rather than one JSON document per line.
# NOTE(review): './top_50.json' is also written by the cell that exports the
# text-to-text training examples; the two cells overwrite each other's output.
with open('./top_50.json', 'w') as f:
    for entry in top_50:
        f.write(f'{json.dumps(entry, indent = 2)}\n')
        
with open('./best_worst_3.json', 'w') as f:
    for entry in best_worst_3:
        f.write(f'{json.dumps(entry, indent = 2)}\n')

In [18]:
from copy import deepcopy

# Compute the term-recall training instances for every entry (method 'title')
# and write the entries that produced at least one instance with a non-empty
# 'term_recall' as JSON lines.
training_method = 'title'
with open('training-data-title-06-07-2023.jsonl', 'w') as f:
    for entry in tqdm(entries):
        # Work on a copy so that the objects in `entries` stay unmodified.
        entry = deepcopy(entry)
        training_samples = []
        try:
            for i in construct_instance_for_model_training(entry, training_method):
                if i['term_recall']:
                    training_samples += [i]
        except Exception:
            # Best effort: an entry whose pages cannot be processed keeps the
            # samples collected so far (usually none). `Exception` instead of
            # a bare `except` lets KeyboardInterrupt stop this long-running
            # loop.
            pass

        entry['training_samples'] = {'method': training_method, 'instances': training_samples}

        if training_samples:
            f.write(json.dumps(entry) + '\n')


NameError: name 'entries' is not defined

In [None]:
construct_instance_for_model_training(entries[5], 'title')

In [None]:
instance = [{'doc': {'id': 1,
   'position': 1,
   'title': " A woman birthing monsters/aliens. I was re-watching Prometheus and during the c-section scene in the giant med-machine I had a little flashback to a movie or series (not sure) I saw as a child of a human woman giving birth to either monsters, mutants or aliens. It's not much to go on but the only thing i remember are a long medical table,  an orange light being cast on the table and a woman popping out what at the time I believed were aliens? It must have been an 80's early 90' production. \nThanks in advance :)\n\nEdit:  Everything had a dark atmosphere, like it was happening at night or in a dark room."},
  'term_recall': {'birth': 1.0,
   'birthing': 1.0,
   'c': 1.0,
   'scene': 1.0,
   'time': 1.0}}]

In [None]:
text = instance[0]['doc']['title']

# Deliberately not named `list`: rebinding that name hides the builtin for the
# whole kernel, after which every `list(...)` call in other cells fails with
# "TypeError: 'dict_keys' object is not callable".
term_recall_terms = instance[0]['term_recall'].keys()

In [None]:
# Download the archived answer pages of every entry. `download_page` runs its
# docker commands with check=True, so the first failing download raises and
# stops this loop.
for entry in tqdm(entries):
    download_page(entry)

In [None]:
!cat ./reddit-tomt-submissions.jsonl | shuf | head -100000 > json-tomt-sample.jsonl

In [None]:
!cat json-tomt-sample.jsonl | wc -l

In [None]:
df = pd.read_json("./json-tomt-sample.jsonl", lines=True)

In [None]:
# NOTE(review): `extract_category` is defined in a later cell of this
# notebook; on a fresh kernel that cell has to be executed before this one.
df['category'] = df.apply(lambda i: extract_category(i), axis=1)

In [None]:
df.head(2)

In [None]:
df.iloc[0].to_dict()

In [None]:
def extract_category(tomt):
    """Return the content of the last square-bracket tag in a title.

    ``tomt`` has to provide ``to_dict()`` returning a mapping with a
    ``'title'`` string. The result is the text between the last ``[`` and the
    next ``]``; a title without brackets is returned unchanged.
    """
    title = tomt.to_dict()['title']
    after_last_open = title.rpartition('[')[2]
    return after_last_open.partition(']')[0]

In [None]:
df['category'].value_counts().head(20)

In [None]:
df[df['category'].str.contains('site')].category.value_counts()

In [None]:
df[df['category'].str.contains('page')].category.value_counts()

In [None]:
sns.set_theme(style="ticks")

# Initialize the figure with a logarithmic y axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.ups)
# Observation: zero or ten and more upvotes are outliers, hence we take questions with 1-9 upvotes.

In [None]:
# Initialize the figure with a logarithmic y axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.downs)
# Observation: questions with a single downvote are already outliers, hence we only take questions with zero downvotes.

In [None]:
# Initialize the figure with a logarithmic y axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
sns.boxplot(df.num_comments)
# Observation: questions with zero or with 13 and more comments are outliers, hence we only take questions with 1-12 comments.

In [None]:
df.num_comments.describe()

In [None]:
df.num_comments.quantile(q=0.95)

In [None]:
# Select topics

# Fixed seed so that re-running this cell draws the same samples; without it
# every execution exported a different random selection.
SAMPLE_SEED = 42

# Define conditions
c1 = df["solved_utc"] != ""                                # solved_utc is not the empty string
c2 = (df["ups"]>0) & (df["ups"]<10)                        # 1-9 upvotes
c3 = df["downs"]==0                                        # no downvotes
c4 = (df["num_comments"] > 0) & (df["num_comments"]<13)    # 1-12 comments

# Apply conditions to the dataframe
topics = df[c1 & c2 & c3 & c4]

# Select samples by category (`sample(250)` raises a ValueError when a
# category has fewer than 250 rows).
books = topics[topics["category"].isin(["BOOK", "Book"])].sample(250, random_state=SAMPLE_SEED)
movies = topics[topics["category"].isin(["MOVIE", "Movie", "movie"])].sample(250, random_state=SAMPLE_SEED)
websites = topics[topics["category"].isin(["Website", "website"])] # fewer than 250 elements, so all are taken
songs = topics[topics["category"].isin(["SONG", "Song", "song", "MUSIC", "Music"])].sample(250, random_state=SAMPLE_SEED)

In [None]:
books.to_json("./books.json", orient = "records", lines = True)
movies.to_json("./movies.json", orient = "records", lines = True)
websites.to_json("./websites.json", orient = "records", lines = True)
songs.to_json("./songs.json", orient = "records", lines = True)

In [None]:
print(books.shape) #(272, 127)
print(movies.shape) #(1029, 127)
print(websites.shape) #(110, 127)
print(songs.shape) #(1385, 127)