In [30]:
import pandas as pd
from datetime import datetime

## Submissions

In [65]:
subs = pd.read_csv('../data/morocco/submissions_all.csv', low_memory=False)

# drop unnecessary columns
subs.drop(columns=['Unnamed: 0', 'media', 'media_embed', 'thumbnail'], inplace=True)

# delete rows with [deleted by user] or [ Removed by Reddit ] in title
subs = subs[subs['title'] != '[deleted by user]']
subs = subs[subs['title'] != '[ Removed by Reddit ]']
subs = subs[subs['title'] != '[image processing failed]']

# replace [deleted] and [removed] with Empty str in selftext and author
subs['selftext'] = subs['selftext'].replace(['[deleted]', '[removed]', 'Title', 'title'], '')
subs['title'] = subs['title'].replace(['[deleted]', '[removed]', 'Title', 'title'], '')
subs['author'] = subs['author'].replace('[deleted]', '')

# replace nan with Empty str in all columns
subs.fillna('', inplace=True)

# drop duplicates
subs.drop_duplicates(subset=['id'], inplace=True)
subs.reset_index(drop=True, inplace=True)

# change created_utc to datetime type
subs['created_utc'] = pd.to_datetime(subs['created_utc'])

# order by created_utc
subs.sort_values(by='created_utc', inplace=True)


In [66]:
subs

Unnamed: 0,id,author,author_flair_text,title,selftext,link_flair_text,created_utc,permalink,score,num_comments,over_18,hide_score
88836,83vri,taoufix,,Facebook is lost case [pic],,,2009-03-11 18:24:44,/r/Morocco/comments/83vri/facebook_is_lost_cas...,3,3,False,False
88837,c6u7c,,,Rabat Agdal At Night,,,2010-05-21 21:43:14,/r/Morocco/comments/c6u7c/rabat_agdal_at_night/,3,2,False,False
88838,c7162,taoufix,,Beach near Sidi Ifni at sunset [pic],,,2010-05-22 15:53:13,/r/Morocco/comments/c7162/beach_near_sidi_ifni...,4,0,False,False
88839,c71ir,,,"Medina de Rabat on a hazy, lazy friday",,,2010-05-22 16:43:48,/r/Morocco/comments/c71ir/medina_de_rabat_on_a...,3,1,False,False
88840,c727d,taoufix,,Tiznit traditional market street during lunch ...,,,2010-05-22 18:11:18,/r/Morocco/comments/c727d/tiznit_traditional_m...,3,1,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
86640,1d3ofx4,penelopelouiseb,:snoo_smile: Visitor,Beautiful Asilah!,Some of my photos from Asilah! It was on my Mo...,:art: Art &amp; Photography,2024-05-29 22:22:27,/r/Morocco/comments/1d3ofx4/beautiful_asilah/,1,1,False,
86639,1d3ohin,Time-Ad-8776,:snoo_smile: Visitor,aliexpress fake airpods,can anyone recommend me chi fakes free shippin...,:technology: Science &amp; Tech,2024-05-29 22:24:24,/r/Morocco/comments/1d3ohin/aliexpress_fake_ai...,1,1,False,
86638,1d3orn3,PotentialOrder5837,:snoo_smile: Visitor,Sending money to yourself for vacation,Hi \nI plan to send myself around 50k dirhams ...,:travel: Travel,2024-05-29 22:37:01,/r/Morocco/comments/1d3orn3/sending_money_to_y...,1,1,False,
86646,1d3pnbc,Leather_Alfalfa6519,:snoo_smile: Visitor,is it too late to leave? do I actually leave o...,I’m a 26 (turning 26 next month) y.o female wi...,:Discussion: Discussion,2024-05-29 23:17:22,/r/Morocco/comments/1d3pnbc/is_it_too_late_to_...,1,1,False,


In [67]:
subs.to_csv('../data/cleaned/submissions.csv', index=False)

## Comments

In [101]:
comments = pd.read_csv('../data/morocco/all_comments.csv', low_memory=False)

# drop unnecessary columns
comments.drop(columns=['Unnamed: 0', 'subreddit_id', 'subreddit'], inplace=True)

# replace [deleted] and [removed] with Empty str in author
comments['author'] = comments['author'].replace('[deleted]', None)

# remove AutoModerator comments
comments = comments[comments['author'] != 'AutoModerator']

# drop rows with [deleted] or [removed] in body
comments = comments[comments['body'] != '[deleted]']
comments = comments[comments['body'] != '[removed]']

# drop duplicates
comments.drop_duplicates(subset=['id'], inplace=True)
comments.reset_index(drop=True, inplace=True)

# change created_utc to datetime type
comments['created_utc'] = pd.to_datetime(comments['created_utc'])

# format link_id and parent_id to match submission id
comments['link_id'] = comments['link_id'].str.split('_').str[1]
comments['parent_id'] = comments['parent_id'].str.split('_').str[1]


In [103]:
comments.to_csv('../data/cleaned/comments.csv', index=False)