## Combining All Labelled Datasets into a Single Corpus

### Import Libraries

In [18]:
import pandas as pd
import ast
import warnings
from tqdm.notebook import tqdm
# Install a blanket 'ignore' filter so no warnings are shown for the rest of the session.
# NOTE(review): this also hides pandas dtype/deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# Register tqdm's pandas integration; this is what makes `.progress_apply` available below.
tqdm.pandas()

CSS override so the tqdm progress-bar widget follows the VS Code editor theme (transparent background, editor foreground color and font size).

In [2]:
%%html
<style>
/* Let the widget output cell show the editor background instead of its own. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Map the Jupyter widget text color and font size onto the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [3]:
# Read the cleaned Reddit CSV in 1000-row chunks so tqdm can report progress,
# using the file's first column as the index, then join the chunks into one frame.
rdd_reader = pd.read_csv(
    '../data/reddit_depression_dataset_cleaned.csv',
    index_col=0,
    chunksize=1000,
)
rdd_df = pd.concat(list(tqdm(rdd_reader, desc='Loading data')))
rdd_df.head()

Loading data: 0it [00:00, ?it/s]

Unnamed: 0,label,date,upvotes,num_comments,combined_text,tokenized_text,alphanum_text,stopword_removed_text,stemmed_text,non_stopword_removed_stemmed_text,combined_stemmed_text,combined_non_stopword_removed_stemmed_text
0,0.0,2014-07-14 03:35:09,4.0,0.0,Deep thoughts underdog Only when we start cons...,"['deep', 'thoughts', 'underdog', 'only', 'when...","['deep', 'thoughts', 'underdog', 'only', 'when...","['deep', 'thoughts', 'underdog', 'start', '99'...","['deep', 'thought', 'underdog', 'start', '99',...","['deep', 'thought', 'underdog', 'onli', 'when'...",deep thought underdog start 99 underdog start ...,deep thought underdog onli when we start consi...
1,0.0,2014-09-13 00:31:19,4.0,1.0,"I like this sub, there's only two posts yet I ...","['i', 'like', 'this', 'sub', ',', ""there's"", '...","['i', 'like', 'this', 'sub', 'only', 'two', 'p...","['posts', 'coming', 'human', 'morality', 'joke...","['post', 'come', 'human', 'moral', 'joke', 'lo...","['i', 'like', 'thi', 'sub', 'onli', 'two', 'po...",post come human moral joke long abscenc hope d...,i like thi sub onli two post yet i keep come b...
2,0.0,2014-11-20 04:31:58,6.0,1.0,Rebirth! Hello. \nI am the new guy in charge h...,"['rebirth', '!', 'hello', '.', 'i', 'am', 'the...","['rebirth', 'hello', 'i', 'am', 'the', 'new', ...","['rebirth', 'guy', 'charge', 'thegood', 'ofc',...","['rebirth', 'guy', 'charg', 'thegood', 'ofc', ...","['rebirth', 'hello', 'i', 'am', 'the', 'new', ...",rebirth guy charg thegood ofc bring weirdpinea...,rebirth hello i am the new guy in charg here b...
3,0.0,2014-11-20 19:38:05,25.0,2.0,"""I want to be like water. I want to slip throu...","['""', 'i', 'want', 'to', 'be', 'like', 'water'...","['i', 'want', 'to', 'be', 'like', 'water', 'i'...","['water', 'slip', 'fingers', 'hold', 'ship', '...","['water', 'slip', 'finger', 'hold', 'ship', 'm...","['i', 'want', 'to', 'be', 'like', 'water', 'i'...",water slip finger hold ship michel william,i want to be like water i want to slip through...
5,0.0,2014-11-22 19:17:39,8.0,23.0,What is the limit of the knowledge and power a...,"['what', 'is', 'the', 'limit', 'of', 'the', 'k...","['what', 'is', 'the', 'limit', 'of', 'the', 'k...","['limit', 'knowledge', 'power', 'human', 'pers...","['limit', 'knowledg', 'power', 'human', 'perso...","['what', 'is', 'the', 'limit', 'of', 'the', 'k...",limit knowledg power human person infinit grow,what is the limit of the knowledg and power a ...


In [5]:
# Read the LogReg-labelled 1.6-million dataset in 1000-row chunks (for the
# tqdm progress bar) and join the chunks into a single frame.
osm_reader = pd.read_csv(
    '../data/1.6_million_dataset_labelled_LogReg.csv',
    chunksize=1000,
)
osm_df = pd.concat(list(tqdm(osm_reader, desc='Loading data')))
osm_df.head()

Loading data: 0it [00:00, ?it/s]

Unnamed: 0,date,stemmed_text,predicted_label
0,2009-04-06 22:19:45,"['awww', 'bummer', 'shoulda', 'david', 'carr',...",0
1,2009-04-06 22:19:49,"['upset', 'updat', 'facebook', 'text', 'result...",0
2,2009-04-06 22:19:53,"['dive', 'time', 'ball', 'manag', 'save', '50'...",0
3,2009-04-06 22:19:57,"['bodi', 'feel', 'itchi']",0
4,2009-04-06 22:19:57,"['behav', 'mad']",0


In [6]:
# Read the LogReg-labelled mental-health dataset in 1000-row chunks (for the
# tqdm progress bar) and join the chunks into a single frame.
mhd_reader = pd.read_csv(
    '../data/mental_health_dataset_labelled_LogReg.csv',
    chunksize=1000,
)
mhd_df = pd.concat(list(tqdm(mhd_reader, desc='Loading data')))
mhd_df.head()

Loading data: 0it [00:00, ?it/s]

Unnamed: 0,Date,location,followers,engagement,stemmed_tweet,predicted_label
0,2023-02-15 13:48:52+00:00,Haringey,317,0.022082,"['peopl', 'ocd', 'experi', 'intens', 'neg', 'r...",1
1,2023-02-09 22:37:56+00:00,unknown,9,0.0,"['erika', 'iocdf', 'grassroot', 'advoc', 'love...",0
2,2022-10-20 11:30:07+00:00,Gloucester,905,0.0,"['light', 'week', 'ocd', 'awar', 'week', 'want...",1
3,2022-10-18 19:42:14+00:00,"Boston, MA",19257,0.000467,"['gif', 'view', 'giphi', 'hope', 'campaign', '...",0
4,2022-10-18 19:42:13+00:00,"Boston, MA",19257,0.000727,"['ocdweek', 'event', 'activ', 'livestream', 'l...",0


### Data Processing

In [7]:
# Rename the model output column to `label`, the name the combined corpus selects on.
osm_column_map = {'predicted_label': 'label'}
osm_df = osm_df.rename(columns=osm_column_map)
osm_df.head()

Unnamed: 0,date,stemmed_text,label
0,2009-04-06 22:19:45,"['awww', 'bummer', 'shoulda', 'david', 'carr',...",0
1,2009-04-06 22:19:49,"['upset', 'updat', 'facebook', 'text', 'result...",0
2,2009-04-06 22:19:53,"['dive', 'time', 'ball', 'manag', 'save', '50'...",0
3,2009-04-06 22:19:57,"['bodi', 'feel', 'itchi']",0
4,2009-04-06 22:19:57,"['behav', 'mad']",0


In [9]:
# Bring this dataset's column names in line with the shared
# date / stemmed_text / label names used when building the corpus.
mhd_column_map = {
    'Date': 'date',
    'stemmed_tweet': 'stemmed_text',
    'predicted_label': 'label',
}
mhd_df = mhd_df.rename(columns=mhd_column_map)
mhd_df.head()

Unnamed: 0,date,location,followers,engagement,stemmed_text,label
0,2023-02-15 13:48:52+00:00,Haringey,317,0.022082,"['peopl', 'ocd', 'experi', 'intens', 'neg', 'r...",1
1,2023-02-09 22:37:56+00:00,unknown,9,0.0,"['erika', 'iocdf', 'grassroot', 'advoc', 'love...",0
2,2022-10-20 11:30:07+00:00,Gloucester,905,0.0,"['light', 'week', 'ocd', 'awar', 'week', 'want...",1
3,2022-10-18 19:42:14+00:00,"Boston, MA",19257,0.000467,"['gif', 'view', 'giphi', 'hope', 'campaign', '...",0
4,2022-10-18 19:42:13+00:00,"Boston, MA",19257,0.000727,"['ocdweek', 'event', 'activ', 'livestream', 'l...",0


In [15]:
# Columns selected from every source frame; defined once so the three
# selections cannot drift apart.
CORPUS_COLUMNS = ['date', 'stemmed_text', 'label']

# Stack the three sources row-wise. ignore_index=True already gives the result
# a fresh 0..n-1 index, so no separate reset_index call is needed.
# NOTE(review): the Reddit labels display as floats (0.0) while the other two
# sources display ints, and the mental-health dates carry a "+00:00" offset the
# others lack — confirm downstream code is happy with both.
corpus = pd.concat(
    [source_df[CORPUS_COLUMNS] for source_df in (rdd_df, osm_df, mhd_df)],
    ignore_index=True,
)
print("RDD DataFrame Shape:", rdd_df.shape)
print("OSM DataFrame Shape:", osm_df.shape)
print("MHD DataFrame Shape:", mhd_df.shape)
print(corpus.shape)
corpus.head()

RDD DataFrame Shape: (1987011, 12)
OSM DataFrame Shape: (1600000, 3)
MHD DataFrame Shape: (724745, 6)
(4311756, 3)


Unnamed: 0,date,stemmed_text,label
0,2014-07-14 03:35:09,"['deep', 'thought', 'underdog', 'start', '99',...",0.0
1,2014-09-13 00:31:19,"['post', 'come', 'human', 'moral', 'joke', 'lo...",0.0
2,2014-11-20 04:31:58,"['rebirth', 'guy', 'charg', 'thegood', 'ofc', ...",0.0
3,2014-11-20 19:38:05,"['water', 'slip', 'finger', 'hold', 'ship', 'm...",0.0
4,2014-11-22 19:17:39,"['limit', 'knowledg', 'power', 'human', 'perso...",0.0


In [16]:
# Class balance: shape of the rows carrying label 1, then label 0.
for label_value in (1, 0):
    print(corpus.loc[corpus['label'] == label_value].shape)

(424032, 3)
(3887724, 3)


In [19]:
def _parse_token_list(value):
    """Return `value` parsed with ast.literal_eval when it is a string, else unchanged.

    The non-string pass-through makes this cell safe to re-run: once the column
    already holds parsed objects, a second run leaves them alone instead of
    raising inside ast.literal_eval.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Convert each stringified token list (as stored in the CSVs) into a real Python object.
corpus['stemmed_text'] = corpus['stemmed_text'].progress_apply(_parse_token_list)
# Positional lookup so the check does not depend on the index containing the label 0.
type(corpus['stemmed_text'].iloc[0])

  0%|          | 0/4311756 [00:00<?, ?it/s]

list

In [20]:
# text_frequency = len() of each stemmed_text entry, computed with a progress bar.
corpus['text_frequency'] = corpus['stemmed_text'].progress_apply(len)
corpus.head()

  0%|          | 0/4311756 [00:00<?, ?it/s]

Unnamed: 0,date,stemmed_text,label,text_frequency
0,2014-07-14 03:35:09,"[deep, thought, underdog, start, 99, underdog,...",0.0,8
1,2014-09-13 00:31:19,"[post, come, human, moral, joke, long, abscenc...",0.0,11
2,2014-11-20 04:31:58,"[rebirth, guy, charg, thegood, ofc, bring, wei...",0.0,11
3,2014-11-20 19:38:05,"[water, slip, finger, hold, ship, michel, will...",0.0,7
4,2014-11-22 19:17:39,"[limit, knowledg, power, human, person, infini...",0.0,7


### Export Data

In [21]:
# Write the combined corpus to disk; index=False keeps the row index out of the CSV.
corpus.to_csv(path_or_buf='../data/labelled_corpus.csv', index=False)