In [1]:
import numpy as np
import pandas as pd
import os
import json
import pickle
import random
import math
import re

In [2]:
# Load the URL -> CUI lookup used to label each tweet.
# The context manager closes the file handle as soon as parsing finishes.
with open('processed/url_cui_mapping.json') as f:
    url_cui_dict = json.load(f)

In [3]:
# Generated files go to ./processed, resolved relative to the current working directory.
root = "."
OUTPUT_DIR = os.path.join(root, 'processed')

In [4]:
# Load the raw train / test / validation splits into DataFrames.
# NOTE(review): read_json infers dtypes; confirm the 18-digit tweet IDs are read as exact integers.
df_train_all = pd.read_json('raw_data/train.json')
df_test_all = pd.read_json('raw_data/test.json')
df_val_all = pd.read_json('raw_data/val.json')

In [5]:
# Inspect which fields the raw training split provides.
df_train_all.columns

Index(['DTweetID', 'DguardianID', 'DguardianName', 'OriginalPosterID',
       'OriginalTweetID', 'ScreenNameOriginalPoster', 'dtweet_content',
       'label', 'otweet_content', 'url'],
      dtype='object')

In [6]:
# Fields read by build_mention_json: the tweet ID, the tweet text, and the url used as lookup key.
columns = ['OriginalTweetID', 'otweet_content', 'url']

In [7]:
# Project each split down to the selected columns; the *_all frames are left unchanged.
df_train = df_train_all[columns]
df_test = df_test_all[columns]
df_val = df_val_all[columns]

In [8]:
# Preview the first rows of the projected training split.
df_train.head()

Unnamed: 0,OriginalTweetID,otweet_content,url
0,780136889441456128,the families of these <number> should sit in t...,http://www.politifact.com/truth-o-meter/statem...
1,913075551212011520,"@user <person> didnt lower my my taxes , and h...",http://www.politifact.com/truth-o-meter/statem...
2,753045148775440384,@user like when white christian men commit ter...,http://www.politifact.com/punditfact/statement...
3,743830161393520640,@user don't believe the media ! they lie . loo...,http://www.snopes.com/donald-trumps-son-game-h...
4,818188352314900480,@user says ultra liberal <person> hating,http://www.politifact.com/truth-o-meter/statem...


In [9]:
# Report (rows, columns) for the train, test, and validation splits, in that order.
print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

(58562, 3)
(7322, 3)
(7319, 3)


In [10]:
# Look at a single training row as a Series.
df_train.iloc[0]

OriginalTweetID                                   780136889441456128
otweet_content     the families of these <number> should sit in t...
url                http://www.politifact.com/truth-o-meter/statem...
Name: 0, dtype: object

In [11]:
# Scratch variable: keep the first training row for the type check in the next cell.
x = df_train.iloc[0]

In [12]:
# Check that int() turns the tweet ID into a built-in int, the same conversion
# build_mention_json applies before the record is passed to json.dumps.
type(int(x['OriginalTweetID']))

int

In [13]:
def write_output(split: str, mentions: list):
    """Write mention records to ``<OUTPUT_DIR>/<split>.jsonl``.

    Each element of ``mentions`` is serialized with ``json.dumps`` and the
    results are joined with newlines (no trailing newline is written).
    An existing file with the same name is overwritten.

    Args:
        split: Base name of the output file, without the ``.jsonl`` suffix.
        mentions: JSON-serializable records, one per output line.
    """
    target_path = os.path.join(OUTPUT_DIR, split + '.jsonl')
    with open(target_path, 'w') as out_file:
        serialized_lines = map(json.dumps, mentions)
        out_file.write('\n'.join(serialized_lines))
    print(f'{split} set Done.')

In [14]:
def clean_up(s: str):
    """Normalize tweet text to alphanumeric characters and single spaces.

    Every character that is neither alphanumeric nor a literal space is
    dropped (not replaced), runs of whitespace are collapsed to one space,
    and leading/trailing whitespace is stripped.

    Args:
        s: The raw text. Missing values (``None`` or a float NaN) are accepted.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # isinstance (rather than an exact type check) also catches float
    # subclasses such as numpy.float64, which pandas can return for NaN cells.
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return ""

    # Compare by value: identity (`is`) on a str literal relies on interning.
    kept = ''.join(ch for ch in s if ch.isalnum() or ch == ' ')
    # Raw string so that \s reaches the regex engine as written.
    return re.sub(r"\s+", " ", kept).strip()

In [15]:
def build_mention_json(df: pd.DataFrame):
    """Turn tweet rows into a list of mention records.

    Rows whose ``url`` is not a key of ``url_cui_dict`` are skipped. For the
    remaining rows, the cleaned ``otweet_content`` becomes the mention text,
    ``OriginalTweetID`` (cast to ``int``) becomes the mention id, and the value
    stored in ``url_cui_dict`` for the row's ``url`` becomes ``label_id``.
    All other fields are emitted as empty strings.

    Args:
        df: Frame with ``OriginalTweetID``, ``otweet_content`` and ``url`` columns.

    Returns:
        A list of dicts, one per retained row, in row order.
    """
    records = []
    for _, tweet in df.iterrows():
        source_url = tweet['url']
        if source_url in url_cui_dict:
            cui = url_cui_dict[source_url]
            records.append({
                'mention': clean_up(tweet['otweet_content']),
                'mention_id': int(tweet['OriginalTweetID']),
                'context_left': "",
                'context_right': "",
                'context_doc_id': "",
                'type': "",
                'label_id': cui,
                'label': "",
                'label_title': "",
            })
    return records

In [16]:
# Build the training mention records and write them to the 'train' JSONL file.
train_mentions = build_mention_json(df_train)
write_output("train", train_mentions)

train set Done.


In [17]:
# Build the validation mention records and write them to the 'valid' JSONL file.
val_mentions = build_mention_json(df_val)
write_output("valid", val_mentions)

valid set Done.


In [18]:
# Build the test mention records and write them to the 'test' JSONL file.
test_mentions = build_mention_json(df_test)
write_output("test", test_mentions)

test set Done.
