### Fakeddit

### Resources:
* [Paper](https://arxiv.org/pdf/1911.03854.pdf)

### Setup Environment:

In [1]:
import os
import pandas as pd

from src.classifiers import process_labels, split_data
from src.classifiers_base import preprocess_df

from transformers import BertTokenizer

from src.multimodal_data_loader import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_base_cpu_metrics import train_early_fusion, train_late_fusion

In [2]:
# Root folder of the Fakeddit subset, relative to the notebook's working directory
PATH = "datasets/fakeddit/"

In [3]:
# Image folder and label CSV, both resolved inside the dataset root
images_path = os.path.join(PATH, 'images')
text_path = os.path.join(PATH, 'labels_subset.csv')

## Get data

In [4]:
# Read the label/metadata table into a DataFrame
df = pd.read_csv(filepath_or_buffer=text_path)
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label,split
0,michaelconfoy,loose talk got there first keep it under your ...,1.429942e+09,i.imgur.com,True,33snxf.jpg,https://external-preview.redd.it/TfwZkzBSsqgq0...,,6.0,70,propagandaposters,"""Loose talk got there first! Keep it under you...",0.98,0,1,5,train
1,SaltMineForeman,this tree at my campsite naturally grew a hear...,1.554421e+09,i.imgur.com,True,b9k2ta.jpg,https://external-preview.redd.it/ulTUeOqBi2y7F...,,2.0,30,mildlyinteresting,This tree at my campsite naturally grew a hear...,0.89,1,0,0,train
2,Shawn_666,fdr addressing the nation,1.522982e+09,i.redd.it,True,8a5wy9.jpg,https://preview.redd.it/fyop34q987q01.png?widt...,,1.0,12,fakehistoryporn,FDR addressing the nation (1932),0.80,0,2,2,train
3,YOLO2THEMAX,face morph,1.448948e+09,,True,cxiu2qb.jpg,http://i.imgur.com/nLdvOJ5.jpg,3urf4a,,2,psbattle_artwork,Face morph,,0,2,4,train
4,FarMojo,a japanese kamikaze pilot of the sixth air arm...,1.559169e+09,i.redd.it,True,bul96p.jpg,https://preview.redd.it/3ro69vkv88131.jpg?widt...,,0.0,8,fakehistoryporn,A Japanese kamikaze pilot of the sixth air arm...,0.75,0,2,2,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68073,Anazron,giant flesheating spiders could soon be moving...,1.472864e+09,grimsbytelegraph.co.uk,True,50whzb.jpg,https://external-preview.redd.it/HEa-SZzApziHx...,,33.0,1587,savedyouaclick,Giant 'flesh-eating' spiders could soon be mov...,0.96,0,2,5,val
68074,CaptainBubblesMcGee,birdie fjord,1.514092e+09,i.redd.it,True,7lte89.jpg,https://preview.redd.it/pbzko489ys501.jpg?widt...,,1.0,25,fakealbumcovers,Birdie - Fjord,1.00,0,2,1,val
68075,prohitman,spittin hot fire and hairballs,1.456299e+09,,True,d0bptg9.jpg,http://i.imgur.com/aABgTcA.jpg,479m43,,99,psbattle_artwork,Spittin' hot fire... and hairballs,,0,2,4,val
68076,lux514,supreme court lets full trump travel ban take ...,1.512429e+09,amp.cnn.com,True,7hlb60.jpg,https://external-preview.redd.it/Cqtqrno3dAQVv...,,1.0,3,neutralnews,Supreme Court lets full Trump travel ban take ...,1.00,1,0,0,val


## Data Preparation

In [5]:
# Select features and labels vectors.
# Each of these holds a single column name (a string, not a list), despite the plural variable names.
text_columns = 'title'
image_columns = 'id'
label_columns = '2_way_label'

# Prepare the frame for image loading. Judging by the train_df displayed further down, the
# 'id' column ends up holding full image paths under images_path — TODO confirm in preprocess_df.
# NOTE(review): this rebinds `df`, so re-running this cell passes the already-processed frame
# back into preprocess_df; reload `df` from the CSV before re-running.
df = preprocess_df(df, image_columns, images_path)

# Split the data
train_df, test_df = split_data(df)

# Process and one-hot encode labels for training set
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
# Encode the test labels against the columns obtained from the training set
# (presumably so both label matrices share one layout — confirm in process_labels).
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)

100%|██████████| 68078/68078 [00:02<00:00, 25218.54it/s]
100%|██████████| 68078/68078 [00:34<00:00, 1960.58it/s]


Train Shape: (56214, 17)
Test Shape: (5911, 17)


In [6]:
# Display the training split produced by split_data (pandas truncates the rendering)
train_df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label,split
0,michaelconfoy,loose talk got there first keep it under your ...,1.429942e+09,i.imgur.com,True,datasets/fakeddit/images/33snxf.jpg,https://external-preview.redd.it/TfwZkzBSsqgq0...,,6.0,70,propagandaposters,"""Loose talk got there first! Keep it under you...",0.98,0,1,5,train
1,SaltMineForeman,this tree at my campsite naturally grew a hear...,1.554421e+09,i.imgur.com,True,datasets/fakeddit/images/b9k2ta.jpg,https://external-preview.redd.it/ulTUeOqBi2y7F...,,2.0,30,mildlyinteresting,This tree at my campsite naturally grew a hear...,0.89,1,0,0,train
2,Shawn_666,fdr addressing the nation,1.522982e+09,i.redd.it,True,datasets/fakeddit/images/8a5wy9.jpg,https://preview.redd.it/fyop34q987q01.png?widt...,,1.0,12,fakehistoryporn,FDR addressing the nation (1932),0.80,0,2,2,train
3,YOLO2THEMAX,face morph,1.448948e+09,,True,datasets/fakeddit/images/cxiu2qb.jpg,http://i.imgur.com/nLdvOJ5.jpg,3urf4a,,2,psbattle_artwork,Face morph,,0,2,4,train
4,FarMojo,a japanese kamikaze pilot of the sixth air arm...,1.559169e+09,i.redd.it,True,datasets/fakeddit/images/bul96p.jpg,https://preview.redd.it/3ro69vkv88131.jpg?widt...,,0.0,8,fakehistoryporn,A Japanese kamikaze pilot of the sixth air arm...,0.75,0,2,2,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56241,crazeecatlady801,father and son,1.388601e+09,imgur.com,True,datasets/fakeddit/images/1u64tc.jpg,https://external-preview.redd.it/SSY_cSjrDTbTo...,,0.0,3,pareidolia,Father and son.,0.81,0,2,2,train
56242,dirtydaversfg,well hidden perfect fit owl,1.501643e+09,i.imgur.com,True,datasets/fakeddit/images/6r1e9l.jpg,https://external-preview.redd.it/CrOdO-iprQvYL...,,133.0,21307,photoshopbattles,PsBattle: Well hidden perfect fit owl,0.93,1,0,0,train
56243,mealzer,bought a craft gingerale and it had this under...,1.553030e+09,i.imgur.com,True,datasets/fakeddit/images/b330ti.jpg,https://external-preview.redd.it/8sn7r4Ajyz1aJ...,,7.0,12,mildlyinteresting,Bought a craft gingerale and it had this under...,0.88,1,0,0,train
56244,,nineeleven terror organization operating under...,1.536688e+09,imgur.com,True,datasets/fakeddit/images/9ezs3y.jpg,https://external-preview.redd.it/5FADs-75THyBG...,,0.0,10,fakehistoryporn,NineEleven - Terror Organization Operating Und...,0.75,0,2,2,train


In [7]:
# Build the BERT tokenizer from the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [8]:
# Arguments shared by both splits: column names, label encoder/columns and tokenizer
dataset_args = (text_columns, image_columns, label_columns, mlb, train_columns, tokenizer)
train_dataset = VQADataset(train_df, *dataset_args)
test_dataset = VQADataset(test_df, *dataset_args)

# Both loaders use the same batch size and worker count; only the training loader shuffles
loader_kwargs = dict(batch_size=64, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

### Models

In [9]:
# Each sample carries exactly one label value, so this is not a multilabel task
multilabel = False

# One output unit for the '2_way_label' task; otherwise one unit per distinct
# label value found in the training split
output_size = (
    1
    if label_columns == '2_way_label'
    else len(pd.unique(train_df[label_columns]))
)

In [None]:
# Train early fusion model
print("Training Early Fusion Model:")
train_early_fusion(
    train_loader,
    test_loader,
    output_size,
    lr=0.001,
    num_epochs=10,
    report=True,
    multilabel=multilabel,
)

Training Early Fusion Model:
The number of parameters of the model are: 197121


STAGE:2023-12-30 00:23:22 127940:127940 ActivityProfilerController.cpp:312] Completed Stage: Warm Up


In [None]:
# Train late fusion model
# NOTE(review): unlike the early-fusion call, no lr is passed here, so whatever learning
# rate train_late_fusion falls back to is used — confirm that is intended.
print("Training Late Fusion Model:")
train_late_fusion(
    train_loader,
    test_loader,
    output_size,
    report=True,
    num_epochs=10,
    multilabel=multilabel,
)