In [1]:
import numpy as np
import pandas as pd
import os
import json
import pickle
import random
import math
import re

In [2]:
f = open('processed/url_cui_mapping.json')
url_cui_dict = json.load(f)

In [3]:
root = "."
OUTPUT_DIR = os.path.join(root, 'processed')
train_filename = "train"
val_filename = "valid"
test_filename = "test"

drop_all = True

In [4]:
df_train_all = pd.read_json('train_tweet.json')
df_test_all = pd.read_json('test_tweet.json')
df_val_all = pd.read_json('val_tweet.json')

In [5]:
df_train_all.columns

Index(['OriginalTweetID', 'otweet_content', 'url'], dtype='object')

In [6]:
columns = ['OriginalTweetID', 'otweet_content', 'url']

In [7]:
df_train = df_train_all[columns]
df_test = df_test_all[columns]
df_val = df_val_all[columns]

In [8]:
df_train.head()

Unnamed: 0,OriginalTweetID,otweet_content,url
0,780136889441456128,The families of these four should sit in the f...,http://www.politifact.com/truth-o-meter/statem...
1,913075551212011520,"@user <person> didnt lower my my taxes , and h...",http://www.politifact.com/truth-o-meter/statem...
2,753045148775440384,@JaredBerry316 @glennbeck like when white Chri...,http://www.politifact.com/punditfact/statement...
3,743830161393520640,@Adam_Kazda 60 diplomats were killed on Bush's...,http://www.snopes.com/donald-trumps-son-game-h...
4,818188352314900480,"@fight4women Wow, you got a source for all tha...",http://www.politifact.com/truth-o-meter/statem...


In [9]:
print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

(58562, 3)
(7322, 3)
(7319, 3)


In [10]:
df_train.iloc[0]

OriginalTweetID                                   780136889441456128
otweet_content     The families of these four should sit in the f...
url                http://www.politifact.com/truth-o-meter/statem...
Name: 0, dtype: object

In [11]:
x = df_train.iloc[0]

In [12]:
type(int(x['OriginalTweetID']))

int

In [13]:
def write_output(split: str, mentions: list):
    with open(os.path.join(OUTPUT_DIR, split + '.jsonl'), 'w') as f:
        f.write('\n'.join([json.dumps(m) for m in mentions]))
    print(f'{split} set Done.')

In [14]:
def clean_up(s: str):
    if type(s) is float and math.isnan(s):
        # handling nans
        return ""
    else:
        # remove URLs
        s = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', s)
        s = re.sub(r'http\S+', '', s)
        
        # remove content in <...>
        # s = re.sub(r'<[\w\d]+>', '', s)
        
        # remove special characters - ()#%^&*@! etc.
        s = ''.join(e for e in s if e.isalnum() or e is ' ')
        
        # remove multiple spaces
        s = re.sub("\s+"," ",s)
        
        return s.strip()

In [15]:
def drop_intersecting_tweets(train_df: pd.DataFrame, df: pd.DataFrame):
    df = df.drop(df[df['OriginalTweetID'].isin(train_df['OriginalTweetID'])].index)
    return df.drop(df[df['otweet_content'].isin(train_df['otweet_content'])].index)

In [16]:
def drop_duplicates(df: pd.DataFrame, drop_all: bool):
    # Drop Duplicate otweet_content
    df_int = df.drop_duplicates(subset=['otweet_content'], ignore_index=True, keep="first")
    
    if drop_all:
        keep=False
    else:
        keep="first"    
    return df_int.drop_duplicates(subset=['OriginalTweetID'], ignore_index=True, keep=keep)

In [17]:
def drop_noisy_rows(df: pd.DataFrame):
    df['me_lens']=df['otweet_content'].apply(lambda x: len(x))
    return df.drop(df[df['me_lens'] < 34].index)

In [18]:
def preprocess_mentions(df: pd.DataFrame, is_train: bool, drop_all: bool, train_df: pd.DataFrame=None):
    df_int = df.copy()
    df_int['otweet_content'] = df_int['otweet_content'].apply(lambda s: clean_up(s))
    print(df.shape)
    
    df_final = drop_noisy_rows(df_int)
    print(df_final.shape)
    
    df_final = drop_duplicates(df_final, drop_all)
    print(df_final.shape)
    
    if not is_train:
        df_final = drop_intersecting_tweets(train_df, df_final)
        print(df_final.shape)
       
    return df_final

In [19]:
def build_mention_json(df: pd.DataFrame):
    mentions = []
    for idx in range(df.shape[0]):
        row = df.iloc[idx]

        if row['url'] not in url_cui_dict:
            continue

        label_id = url_cui_dict[row['url']]

        transformed_mention = {}
        
        transformed_mention['mention'] = row['otweet_content']
        transformed_mention['mention_id'] = int(row['OriginalTweetID'])
        transformed_mention['context_left'] = ""
        transformed_mention['context_right'] = ""
        transformed_mention['context_doc_id'] = ""
        transformed_mention['type'] = ""
        transformed_mention['label_id'] = label_id
        transformed_mention['label'] = ""
        transformed_mention['label_title'] = ""
        mentions.append(transformed_mention)
    
    return mentions

In [20]:
# Creating Training Json file
df_train_processed = preprocess_mentions(df_train, is_train=True, drop_all=drop_all)
train_mentions = build_mention_json(df_train_processed)
write_output(train_filename, train_mentions)

(58562, 3)
(49941, 4)
(43123, 4)
train set Done.


In [21]:
df_train_processed

Unnamed: 0,OriginalTweetID,otweet_content,url,me_lens
0,780136889441456128,The families of these four should sit in the f...,http://www.politifact.com/truth-o-meter/statem...,134
1,913075551212011520,user person didnt lower my my taxes and he rai...,http://www.politifact.com/truth-o-meter/statem...,136
2,753045148775440384,JaredBerry316 glennbeck like when white Christ...,http://www.politifact.com/punditfact/statement...,124
3,743830161393520640,AdamKazda 60 diplomats were killed on Bushs wa...,http://www.snopes.com/donald-trumps-son-game-h...,121
4,818188352314900480,fight4women Wow you got a source for all that ...,http://www.politifact.com/truth-o-meter/statem...,134
...,...,...,...,...
43118,969486554417811456,user a troll response number turn us all n2 sl...,http://www.politifact.com/new-york/statements/...,123
43119,990227999999094784,user you should be ashamed of yourself defendi...,https://www.snopes.com/fact-check/how-to-creat...,276
43120,763168655416975360,user he borrowed money from the russians and c...,http://www.politifact.com/punditfact/statement...,106
43121,987398087458074624,user citation please you dont get to just make...,http://www.politifact.com/new-york/statements/...,105


In [22]:
# Creating Validation Json file
df_val_processed = preprocess_mentions(df_val, is_train=False, drop_all=drop_all, train_df=df_train_processed)
val_mentions = build_mention_json(df_val_processed)
write_output(val_filename, val_mentions)

(7319, 3)
(6158, 4)
(5816, 4)
(5214, 4)
valid set Done.


In [23]:
# Creating Test Json file
df_test_processed = preprocess_mentions(df_test, is_train=False, drop_all=drop_all, train_df=df_train_processed)
test_mentions = build_mention_json(df_test_processed)
write_output(test_filename, test_mentions)

(7322, 3)
(6162, 4)
(5761, 4)
(5202, 4)
test set Done.


In [24]:
print(f"Train: Out of {df_train.shape[0]} records, {len(train_mentions)} have been retained after processing")
print(f"Validation: Out of {df_val.shape[0]} records, {len(val_mentions)} have been retained after processing")
print(f"Test: Out of {df_test.shape[0]} records, {len(test_mentions)} have been retained after processing")

Train: Out of 58562 records, 42655 have been retained after processing
Validation: Out of 7319 records, 5214 have been retained after processing
Test: Out of 7322 records, 5202 have been retained after processing


## Sanity

In [25]:
df_train_final = pd.DataFrame(train_mentions)

In [26]:
df_train_final.head()

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title
0,The families of these four should sit in the f...,780136889441456128,,,,,272,,
1,user person didnt lower my my taxes and he rai...,913075551212011520,,,,,273,,
2,JaredBerry316 glennbeck like when white Christ...,753045148775440384,,,,,274,,
3,AdamKazda 60 diplomats were killed on Bushs wa...,743830161393520640,,,,,14,,
4,fight4women Wow you got a source for all that ...,818188352314900480,,,,,275,,


In [27]:
count_df = pd.DataFrame(df_train_final.groupby('mention')['mention_id'].count()).reset_index(level=['mention'])

In [28]:
count_df[count_df['mention_id'] > 1]

Unnamed: 0,mention,mention_id


In [29]:
count = 0
total_length = 0
for i in range(df_train_final.shape[0]):
    total_length += len(df_train_final['mention'].iloc[i])
    if len(df_train_final['mention'].iloc[i]) < 34:
        count += 1
        print(df_train_final['mention'].iloc[i])

print(count)      

0


In [30]:
df_val_final = pd.DataFrame(val_mentions)

In [31]:
df_val_final[df_val_final['mention'].isin(df_train_final['mention'])]

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title


In [32]:
df_test_final = pd.DataFrame(test_mentions)

In [33]:
df_test_final[df_test_final['mention'].isin(df_train_final['mention'])]

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title
