In [54]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [55]:
# Load the per-term reference exports from the working directory and stack
# them into one frame, tagging each row with the search term taken from its
# file name.
REFERENCE_PREFIX = '2022_12_11_inceldom_'
REFERENCE_SUFFIX = '_references.tsv'

# Match only the reference exports. A bare '*.tsv' would also pick up
# 'usage_summary.tsv', which a later cell writes into this same directory,
# and re-running the notebook would then add a bogus "term" to the statistics.
# Sorting makes the row order independent of the filesystem's listing order.
reference_paths = sorted(Path('.').glob(f'{REFERENCE_PREFIX}*{REFERENCE_SUFFIX}'))
if not reference_paths:
    raise FileNotFoundError(
        f"No '{REFERENCE_PREFIX}*{REFERENCE_SUFFIX}' files found in {Path('.').resolve()}"
    )

term_frames = []
for reference_path in reference_paths:
    # The term is whatever sits between the fixed prefix and suffix.
    term = reference_path.name[len(REFERENCE_PREFIX):-len(REFERENCE_SUFFIX)]
    term_frame = pd.read_csv(reference_path, sep='\t')
    term_frame['term'] = term
    term_frames.append(term_frame)

dataset = pd.concat(term_frames)
# 'Unnamed: 0' is presumably the index column saved when the exports were
# written (TODO confirm); tolerate exports that do not carry it.
dataset = dataset.drop(columns='Unnamed: 0', errors='ignore')

In [56]:
# Per-term usage statistics: distinct authors, distinct thread URLs, and the
# number of rows recorded for each term.
by_term = dataset.groupby('term')
unique_users = by_term['author'].nunique().rename("Unique Users Using Term")
unique_threads = by_term['URL'].nunique().rename("Unique Threads Using Term")
unique_posts = by_term.size().rename("Unique Posts Using Term")

# Display labels for the file-name terms; the parenthesised alternatives
# presumably mirror the search patterns used to build each export (TODO confirm).
tidy_names = {
    'rape': 'rape(s|ed)',
    'raping': 'raping',
    'sex_slave': 'sex slave(s|d)',
    'sexual_slavery': 'sexual slavery',
    'sexual_assault': 'sexual assault(s|ing|ed)',
    'sexual_assaulting': 'sexual assaulting',
    'sexually_assaulting': 'sexually assaulting',
    'harem': 'harem(s)',
}

# Combine the three counts side by side (indexed by term), relabel the terms,
# and persist the table as a TSV next to the notebook.
usage_summary = (
    pd.concat([unique_users, unique_threads, unique_posts], axis=1)
    .rename(index=tidy_names)
)
usage_summary.to_csv('usage_summary.tsv', sep='\t')

In [57]:
# Show the per-term usage table as this cell's output.
usage_summary

Unnamed: 0_level_0,Unique Users Using Term,Unique Threads Using Term,Unique Posts Using Term
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
harem(s),1324,3944,4736
rape(s|ed),3986,25835,43621
raping,1409,3377,3882
sex slave(s|d),681,1224,1421
sexual assault(s|ing|ed),564,958,1103
sexual assaulting,5,5,5
sexual slavery,53,58,61
sexually assaulting,53,65,65


In [59]:
# Scan every scraped thread file to collect the set of distinct authors and
# the total number of post rows across the whole corpus.
SCRAPE_DIR = Path('../2022_12_11_inceldom_discussion_scrape')

# One row per scraped thread; the file has no header row, so name the columns.
index = pd.read_csv(SCRAPE_DIR / 'complete_submissions_index.txt', sep='\t',
                    names=['URL', 'Comment_Data', 'User_Data'])

users = set()
post_count = 0
skipped_scrapes = []  # (file name, error) for every file that could not be counted
for scrape in tqdm((SCRAPE_DIR / 'submissions').glob('*.tsv'), total=index.shape[0]):
    try:
        scrape_df = pd.read_csv(scrape, sep='\t', comment='#')
        # A missing author is not a user, so drop NaN before collecting names.
        authors = scrape_df['author'].dropna().unique()
    except Exception as err:
        # Stay best-effort on unreadable files, but record them instead of
        # hiding them. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop this long-running loop.
        skipped_scrapes.append((scrape.name, repr(err)))
        continue
    # In-place update avoids copying the whole set on every iteration.
    users.update(authors)
    post_count += scrape_df.shape[0]

if skipped_scrapes:
    print(f"Skipped {len(skipped_scrapes)} unreadable scrape file(s); "
          "inspect `skipped_scrapes` for details.")

100%|████████████████████████████████████████████| 234615/234615 [05:42<00:00, 684.76it/s]


In [61]:
# Report the corpus-wide totals: distinct authors seen in the scan, post rows
# counted, and threads listed in the submissions index.
summary_lines = [
    f"Total Users: {len(users)}",
    f"Total Posts: {post_count}",
    f"Total Threads: {index.shape[0]}",
]
print("\n".join(summary_lines))

Total Users: 11678
Total Posts: 4974826
Total Threads: 234615
