## Generate Dataset

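Explore how to create a held-out dataset: load the raw articles and comments, attach each processed comment to its article, and split the articles into train and test sets.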

In [8]:
%load_ext autoreload
%autoreload 2

import json

# Read both raw dumps explicitly as UTF-8. Without `encoding=`, open() falls
# back to the locale's preferred encoding, which is not UTF-8 on every
# platform and can corrupt or reject non-ASCII text in the JSON files.
with open("../data/articles.json", encoding="utf-8") as f:
    raw_articles = json.load(f)

with open("../data/comments_not_anon.json", encoding="utf-8") as f:
    raw_comments = json.load(f)

# Quick sanity check on the size of both collections.
print(f"We have {len(raw_articles)} articles and {len(raw_comments)} comments")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
We have 616 articles and 28637 comments


In [9]:
# Index the raw articles by their "tweet_id" so comments can be attached by id.
# If two articles share an id, the later one replaces the earlier one.
articles = dict(
    (raw_article['tweet_id'], raw_article) for raw_article in raw_articles
)

# Give every indexed article an empty list to collect its comments in.
# NOTE: the values are the same dict objects held by `raw_articles`, so this
# adds the "comments" key to those raw records as well.
for art in articles.values():
    art.update({"comments": []})


In [10]:
from collections import defaultdict

counts = []

# Per-category label names; each one is also a key of a raw comment.
categories = [
    "WOMEN",
    "LGBTI",
    "RACISM",
    "CLASS",
    "POLITICS",
    "DISABLED",
    "APPEARANCE",
    "CRIMINAL",
]


def process_comment(raw_comment):
    """
    Turn a raw comment into a flat record of binary labels.

    The result holds, in this order: "text" (copied from the raw comment),
    "is_hateful", one key per entry of `categories`, and "calls".

    - "is_hateful" is 1 when raw_comment['HATE'] has at least two entries,
      otherwise 0 (presumably one entry per annotator -- confirm against the
      data source).
    - For a non-hateful comment every other label is 0, and neither 'CALLS'
      nor the category keys of the raw comment are read.
    - For a hateful comment, "calls" is 1 when raw_comment['CALLS'] has at
      least two entries, and each category is 1 when its list in the raw
      comment is non-empty.
    """
    text = raw_comment["text"]
    hateful = len(raw_comment['HATE']) >= 2

    processed = {"text": text, "is_hateful": int(hateful)}
    # Zero-fill every label first so the key order is fixed up front.
    processed.update(dict.fromkeys([*categories, "calls"], 0))

    if not hateful:
        return processed

    processed["calls"] = int(len(raw_comment['CALLS']) >= 2)
    processed.update(
        {name: int(len(raw_comment[name]) > 0) for name in categories}
    )
    return processed



## Attach processed comments to articles

In [11]:
from tqdm.auto import tqdm

# Start every article from an empty comment list so this cell is idempotent:
# re-running it (common in a notebook) would otherwise append each comment a
# second time and silently duplicate the data.
for article in articles.values():
    article["comments"] = []

for comment in tqdm(raw_comments):
    # Each comment names its parent article through "article_id", which is
    # looked up among the keys of `articles`; an unknown id raises KeyError.
    tweet_id = comment["article_id"]
    article = articles[tweet_id]
    article["comments"].append(
        process_comment(comment)
    )

  0%|          | 0/28637 [00:00<?, ?it/s]

## Naive approach

Split the articles at random: 80% go to the training set and the remaining 20% are held out as the test set.

In [12]:
from sklearn.model_selection import train_test_split

# Split at the article level, so all comments of one article land in the same
# split. The fixed random_state makes the split reproducible.
train_articles, test_articles = train_test_split(
    list(articles.values()),
    train_size=0.8,
    random_state=2021,
)

# Show the size of each split as the cell output.
(len(train_articles), len(test_articles))

(492, 124)

In [13]:
# Persist each split to its own JSON file, train first and then test.
for split_path, split_articles in (
    ("../data/train.json", train_articles),
    ("../data/test.json", test_articles),
):
    with open(split_path, "w") as f:
        json.dump(split_articles, f)