In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from glob import glob
import numpy as np
from datetime import datetime
from collections import defaultdict, OrderedDict

import term_hiearchy
import re
import tqdm.notebook


# Apply seaborn's 'whitegrid' style and 'colorblind' palette as the plotting defaults.
sns.set_style('whitegrid')
sns.set_palette('colorblind')

# Key term hit processing

Parse and clean the regular-expression hits derived from the qualitative glossary, then use the term hierarchy to resolve overlapping hits.


## Data Description

Parse dataframes generated by `generate_datasets_hierarchy.py -i 2021_04_15_scrape/2021_04_15_inceldom_discussion_scrape/complete_submissions_index.txt`
These scripts take the scraped forum text data and search forum text using a set of regular expressions.

- `2021_04_15_scrape/2021_04_15_inceldom_discussion_scrape/hierarchy_query_data.tsv.gz`: contains all hits to search terms in the qualitative glossary and encoded in `generate_datasets.py`

- `2021_04_15_scrape/2021_04_15_inceldom_discussion_scrape/hierarchy_user_data.tsv.gz`: contains a database of all the users

- `2021_04_15_scrape/2021_04_15_inceldom_discussion_scrape/hierarchy_users_per_thread.tsv.gz`: count of number of unique users in each thread

- `term_hiearchy.hierarchy` contains the regular expressions used by `generate_datasets_hiearchy.py`, stored in a series of `OrderedDict`s so that the priority order of the terms is retained


In [2]:
# Regular-expression hits, one row per hit.
# NOTE(review): this filename is spelled 'hiearchy' while the sibling files below
# are spelled 'hierarchy' -- confirm it matches the file on disk.
query_data = pd.read_csv(
    '../2021_04_15_inceldom_discussion_scrape/hiearchy_query_data.tsv.gz',
    sep='\t',
    lineterminator='\n',
)

# userdata will include more users than post data usernames as it includes users with deleted posts.
# Where a username occurs more than once, keep the row with the largest number_of_posts.
user_data = (
    pd.read_csv(
        '../2021_04_15_inceldom_discussion_scrape/hierarchy_user_data.tsv.gz',
        sep='\t',
        lineterminator='\n',
    )
    .sort_values(['username', 'number_of_posts'], ascending=False)
    .drop_duplicates(subset=["username"], keep="first")
)

post_data = pd.read_csv(
    '../2021_04_15_inceldom_discussion_scrape/hierarchy_post_data.tsv.gz',
    sep='\t',
    lineterminator='\n',
)

term_hiearchy_dict = term_hiearchy.hierarchy

# Flatten the per-category term dictionaries in their stored order; a term's position in
# that flattened sequence is its priority (a smaller number means a higher priority).
ordered_terms = [
    term
    for category_terms in term_hiearchy_dict.values()
    for term in category_terms
]
term_priorities = OrderedDict(
    (term, rank) for rank, term in enumerate(ordered_terms)
)

# Attach the priority to every hit; an unknown query_term raises KeyError.
query_data['term_priority'] = query_data['query_term'].apply(
    lambda query_term: term_priorities[query_term]
)

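Now we need to clean up false-positive hits, for example hits to "JAP" that are really part of the words "japan" and "japanese". The cell below applies similar rules to the other ambiguous terms (e.g. "JB", "trans", "hole", "hog") and drops a few terms ("CO", "skirt", "PJ", "gasher") entirely.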

In [3]:
def drop_false_positive_hits(hits):
    """Return ``hits`` without the rows flagged by the false-positive rules below.

    Each rule is a boolean mask that depends only on the row itself (its
    ``query_term`` plus the word after the match, the sentence, or the tidied
    match), so the rules are OR-ed into a single mask and removed in one pass.
    This selects the same rows as dropping them rule by rule, provided the
    index labels are unique (true for a default ``read_csv`` index).

    Every ``str.contains`` / ``str.startswith`` test passes ``na=False`` so a
    missing value counts as "does not contain". For the negated rules this
    means a hit with a missing sentence/match is treated as a false positive
    instead of raising when the mask is inverted with ``~``.

    Parameters
    ----------
    hits : pandas.DataFrame
        Needs the columns ``query_term``, ``query_word_after_match``,
        ``query_sentence`` and ``query_tidied_match``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``hits`` that no rule flagged, in their original order.
    """
    term = hits['query_term']
    word_after = hits['query_word_after_match']
    word_after_lower = word_after.str.lower()
    sentence = hits['query_sentence']
    sentence_lower = sentence.str.lower()
    tidied_match = hits['query_tidied_match']

    false_positive = (
        # "JAP" followed by "an" / "anese" / "cel"
        ((term == 'JAP') & word_after_lower.isin(['an', 'anese', 'cel']))
        # "JB" followed by a single character or by something containing "wmax"
        | ((term == 'JB') & ((word_after.str.len() == 1)
                             | word_after_lower.str.contains('wmax', na=False)))
        # "trans" is kept only when followed by one of these continuations
        | ((term == 'trans') & ~word_after_lower.isin(
            ['sexuals', 'exual', 'sexual', 'phobia', 'men', 'man', 'rights', 'sexualism',
             'exualism', 'vestites', 'sexuality', 'exuality', 'cels']))
        # "hole" is kept only as a space-delimited word (case-sensitive check)
        | ((term == 'hole') & ~sentence.str.contains(' hole ', na=False))
        # "puta" is kept only when preceded by a space (case-sensitive check)
        | ((term == 'puta') & ~sentence.str.contains(' puta', na=False))
        # "bim" is kept only when the following text starts with "b"
        | ((term == 'bim') & ~word_after_lower.str.startswith('b', na=False))
        # "gash" followed by "i"
        | ((term == 'gash') & (word_after_lower == 'i'))
        # "scag" is kept only when the tidied match contains "skag"
        | ((term == 'scag') & ~tidied_match.str.contains('skag', na=False))
        # "lez" / "hog" are kept only when preceded by a space (case-insensitive)
        | ((term == 'lez') & ~sentence_lower.str.contains(' lez', na=False))
        | ((term == 'hog') & ~sentence_lower.str.contains(' hog', na=False))
        # "hog" followed by "wash" (case-sensitive, as before)
        | ((term == 'hog') & (word_after == 'wash'))
        # terms that are discarded outright
        | term.isin(['CO', 'skirt', 'PJ', 'gasher'])
    )
    return hits[~false_positive]


query_data = drop_false_positive_hits(query_data)

Then we need to tidy up the terms that only count as "Racist Misogyny" if followed by another term lower in the term hierarchy. We do this by testing the neighbouring words (the word before and the word after the hit) against the regular expressions: if either one matches, the category is renamed to "Racist Misogyny"; otherwise it is left as "Racist Misogyny (if followed by other term)". After this runs we can drop all the remaining "Racist Misogyny (if followed by other term)" hits, as that label indicates they didn't have a neighbouring word that matched one of the regular expressions.

In [4]:
# tidy up racist misogyny terms that aren't definitely racist misogyny (i.e., not followed by another term)
def check_neighbouring_words(row):
    """Promote a conditional hit to 'Racist Misogyny' when a neighbouring word is a glossary term.

    Rows whose ``query_category`` is not
    'Racist Misogyny (if followed by other term)' are returned untouched.
    For the conditional rows, the word before and the word after the match are
    tested with ``re.match`` (anchored at the start of the word) against every
    regular expression in ``term_hiearchy_dict``; on the first match the row's
    ``query_category`` is set to 'Racist Misogyny'.

    Parameters
    ----------
    row : pandas.Series
        One hit, with ``query_category``, ``query_word_before_match`` and
        ``query_word_after_match`` entries. Modified in place when promoted.

    Returns
    -------
    pandas.Series
        The same ``row`` object, promoted or unchanged.
    """
    if row['query_category'] != 'Racist Misogyny (if followed by other term)':
        return row

    # Missing neighbours are NaN; work them out once rather than once per regex.
    neighbours = [
        word
        for word in (row['query_word_before_match'], row['query_word_after_match'])
        if not pd.isna(word)
    ]
    if not neighbours:
        return row

    for category_terms in term_hiearchy_dict.values():
        for term_regex in category_terms.values():
            if any(re.match(term_regex, word) for word in neighbours):
                row['query_category'] = 'Racist Misogyny'
                return row
    return row

# Only rows carrying the conditional label can be changed by check_neighbouring_words,
# so run the (slow, Python-level) row-wise check on just those rows instead of every hit.
conditional_category = 'Racist Misogyny (if followed by other term)'
is_conditional = query_data['query_category'] == conditional_category
if is_conditional.any():
    resolved_category = query_data.loc[is_conditional].apply(
        lambda row: check_neighbouring_words(row)['query_category'], axis=1
    )
    # Keep the existing category for all other rows; take the resolved one for conditional rows.
    query_data = query_data.assign(
        query_category=query_data['query_category'].where(~is_conditional, resolved_category)
    )

# drop any remaining "Racist Misogyny (if followed by other term)" because if it's still there after
# checking for neighbouring words then it isn't valid
query_data = query_data[query_data['query_category'] != conditional_category]

Then we need to look at hits neighbouring each other and keep the term with the highest priority (from the term hierarchy dictionaries).

In [5]:
# apply hierarchy: where hits in the same sentence sit on the same or adjacent word
# positions, keep only the hit whose term has the best (lowest) term_priority
sentence_key = ['thread_url', 'post_position', 'sentence_position']
indices_to_remove = set()

# A sentence with a single hit can never lose a hit, so skip those before grouping.
hits_sharing_sentence = query_data[query_data.duplicated(subset=sentence_key, keep=False)]

for _, hits_in_same_sentence in hits_sharing_sentence.groupby(sentence_key):
    word_positions = hits_in_same_sentence['word_position']
    for position in word_positions:
        # the hit itself plus any hit one word to either side of it
        adjacent_hits = hits_in_same_sentence[word_positions.between(position - 1, position + 1)]
        if adjacent_hits.shape[0] > 1:
            # everything after the top-priority hit in this window is removed
            outranked = adjacent_hits.sort_values('term_priority').index[1:]
            indices_to_remove.update(outranked)

query_data = query_data.drop(index=list(indices_to_remove))

Finally we can save the processed query data for quantitative analysis in notebook 1.

In [6]:
# Write the cleaned hits as a gzip-compressed, tab-separated file (the index is written too).
processed_query_path = '../2021_04_15_inceldom_discussion_scrape/hierarchy_query_data_PROCESSED.tsv.gz'
query_data.to_csv(processed_query_path, sep='\t', compression='gzip')