In [1]:
import pandas as pd
from scripts import scrape_reddit as sr
from datetime import datetime
from scripts import extract_pairs as ep
from scripts import extract_patterns as ept
from scripts import link_google_id as gid
import swifter

# Pair extraction with base pattern

## Scrape reddit data and create 'banned from' corpus 

In [None]:
# Rebuild the 'banned from' corpus by scraping Reddit for the base pattern.

# Scraping window as Unix timestamps. '%m-%y' parsing yields the first day of the
# month at midnight; the datetimes are naive, so .timestamp() uses the local timezone.
min_date, max_date = (
    int(datetime.strptime(month, '%m-%y').timestamp())
    for month in ('01-10', '12-20')
)
folder_path = 'banned_from_corpus/'

# The trailing 0 is the starting value of the counter returned by the scraper
# (presumably the index of the first file written — confirm in scripts/scrape_reddit).
count = sr.scrape_pattern(
    'banned from',
    '<ent> banned from <plat>',
    min_date,
    max_date,
    folder_path,
    0,
)

In [11]:
# Reduced run of the 'banned from' scrape: same pattern, but a shorter window and
# the 'test/' folder instead of the corpus folder.
# NOTE(review): looks like a smoke test of the scraper — confirm before relying on its output.

folder_path = 'test/'
min_date = int(datetime.strptime('01-18', '%m-%y').timestamp())
max_date = min_date + 20000000  # window length in seconds (20,000,000 s is about 231 days)

count = sr.scrape_pattern(
    'banned from',
    '<ent> banned from <plat>',
    min_date,
    max_date,
    folder_path,
    0,
)

The 'banned from' corpus files are pandas DataFrames with the following columns:

text|date|pattern|id|score|nb_comments
---|---|---|---|---|---

## Extract entity/platform pairs from the 'banned from' corpus 

In [None]:
# Placeholder: replace with the 'banned from' corpus loaded as a pandas DataFrame
# (it must at least expose a `text` column, which is all this cell reads).
banned_from_corpus = ''' Complete here '''

# Raw strings: `\w` is a regex character class, not a Python string escape. In a
# non-raw literal it is an invalid escape sequence and triggers a warning.
regex_entity = r"(.+?)(?:| (?:am|was|is|are|were|got|get|will be|getting|being|has been|have been|been)) banned from"
regex_platform = r"banned from(?: the)? (\w+)"

# Keep only the posts for which ep.has_platform is truthy (posts naming a platform).
has_platform = banned_from_corpus.text.swifter.apply(lambda x: ep.has_platform(x))
banned_from_corpus = banned_from_corpus[has_platform]

# .assign() returns a new frame instead of writing a column into the filtered
# slice above, which avoids pandas' SettingWithCopyWarning.
banned_from_corpus = banned_from_corpus.assign(
    platform=banned_from_corpus.text.swifter.apply(lambda x: ep.extract_platform(x, regex_platform))
)
# Remove the rows for which no platform was extracted.
# Note: dropna() without `subset` also drops rows with a missing value in any other column.
banned_from_corpus = banned_from_corpus.dropna()

banned_from_corpus = banned_from_corpus.assign(
    entity=banned_from_corpus.text.swifter.apply(lambda x: ep.extract_entity(x, regex_entity))
)
# Remove the rows for which no entity was extracted (same dropna() caveat as above).
banned_from_corpus = banned_from_corpus.dropna()

The result is in the form of:

entity | platform | text | date | pattern | id | score | nb_comments
---|---|---|---|---|---|---|---

## Scrape reddit posts containing the base pairs 

In [None]:
# Build a dictionary mapping each base entity to the list of platforms it is paired with.

base_pairs = pd.read_csv('results/base_pairs.csv')

# groupby(...).agg(list) gathers the platforms of each entity into one list directly.
# This replaces wrapping every value in a single-element list and summing the lists,
# which concatenated repeatedly and overwrote base_pairs.platform in place.
base_pairs_dict = base_pairs.groupby('entity')['platform'].agg(list).to_dict()
entities = list(base_pairs_dict.keys())
base_pairs_dict

In [None]:
# Scrape the posts for every base entity, one numbered sub-folder per entity.
folder_path = 'posts_base_pairs/'

# Scraping window as Unix timestamps (first day of each month, local timezone).
min_date, max_date = (
    int(datetime.strptime(month, '%m-%y').timestamp())
    for month in ('01-10', '12-20')
)

count = 0  # running index of the files being written, threaded through every call
for folder_idx, entity in enumerate(entities):
    entity_folder = f'{folder_path}{folder_idx}/'
    count = sr.scrape_pairs(entity, base_pairs_dict[entity], min_date, max_date, entity_folder, count)

The result is in the form of:

entity | platforms | text | date | id | score | nb_comments
---|---|---|---|---|---|---

# Pattern inference with base pairs 

## Pattern inference from posts with base pairs

In [None]:
# Placeholder: replace with the posts scraped for the base pairs, as a DataFrame
# exposing the `text`, `entity` and `platforms` columns read below.
posts_base_pairs = '''Complete here'''

# One call per post (row-wise apply): the text and the entity are lower-cased
# before being handed to the pattern extractor; the platforms are passed as-is.
patterns = posts_base_pairs.swifter.apply(
    lambda post: ept.extract_patterns(
        post.text.lower(),
        post.entity.lower(),
        post.platforms,
    ),
    axis = 1,
)

In [6]:
# Count how often each extracted pattern occurs and keep only the frequent ones.
# The texts were lower-cased before extraction; patterns with fewer than
# 14 occurrences are discarded (0.2% of the list is kept).

# Flatten the per-post lists of patterns into a single list, one item per occurrence.
flat_patterns = [p for sublist in patterns for p in sublist]
df_patterns = pd.DataFrame(data = flat_patterns, columns = ['patterns'])

# Counting a constant column per group gives the number of occurrences of each pattern.
df_patterns['cnt'] = 1
pattern_counts = df_patterns.groupby('patterns', as_index = False).count()
df_patterns = pattern_counts[pattern_counts.cnt >= 14]

## Scrape posts with inferred patterns 

In [8]:
# Scrape the posts matching each inferred pattern listed in data/raw_patterns.csv
# (tab-separated; the `substring` and `pattern` columns are read below).
# NOTE(review): this rebinds `patterns`, which the pattern-inference cells use for the
# per-post lists of extracted patterns; re-running those cells after this one would
# iterate this table instead.
patterns = pd.read_csv('data/raw_patterns.csv', sep = '\t')

folder_path = 'patterns_posts/'
min_date, max_date = (
    int(datetime.strptime(month, '%m-%y').timestamp())
    for month in ('01-10', '12-20')
)

count = 0  # running counter threaded through successive scrape calls
for search_substring, pattern_template in zip(patterns['substring'], patterns['pattern']):
    count = sr.scrape_pattern(search_substring, pattern_template, min_date, max_date, folder_path, count)

The result is in the form of:

text | date | pattern | id | score | nb_comments
---|---|---|---|---|---

# Pair extraction from inferred patterns 

In [5]:
def regex_platform(pattern, entity_first):
    """Build the regex that captures the platform next to `pattern`.

    Args:
        pattern: text expected between the entity and the platform. It is inserted
            verbatim, so any regex metacharacter it contains is interpreted as regex syntax.
        entity_first: True when the entity precedes `pattern` (the platform then follows it).

    Returns:
        A regex string whose single group captures one word (`\\w+`) right after
        `pattern` when `entity_first` is true, right before it otherwise.
    """
    # Raw f-strings: `\w` is a regex class, not a valid Python string escape.
    return rf"{pattern}(\w+)" if entity_first else rf"(\w+){pattern}"


def regex_entity(pattern, entity_first):
    """Build the regex that captures the entity next to `pattern`.

    Args:
        pattern: text expected between the entity and the platform, inserted verbatim.
        entity_first: True when the entity precedes `pattern`.

    Returns:
        A regex string whose single group captures, non-greedily, everything before
        `pattern` when `entity_first` is true, or everything from `pattern` to the
        end of the string otherwise.
    """
    return rf"(.+?){pattern}" if entity_first else rf"{pattern}(.*$)"

In [None]:
# Placeholder: replace with the posts scraped for the inferred patterns, as a
# DataFrame exposing the `text` and `pattern` columns read below.
posts_inferred_patterns = ''' Complete here with posts containing the infered patterns '''


def _split_pattern(pattern):
    """Return (literal, entity_first) for a pattern containing '<ent>' and '<plat>'.

    `literal` is the pattern with both placeholders removed; `entity_first` tells
    whether the pattern starts with the '<ent>' placeholder.
    """
    literal = pattern.replace('<ent>', '').replace('<plat>', '')
    # startswith() states the intent explicitly; the previous `pattern[1] == 'e'`
    # check also matched any pattern whose second character happened to be 'e'.
    return literal, pattern.startswith('<ent>')


def _platform_of(post):
    """Extract the platform from one post using the regex built from its pattern."""
    literal, entity_first = _split_pattern(post.pattern)
    return ep.extract_platform(post.text, regex_platform(literal, entity_first))


def _entity_of(post):
    """Extract the entity from one post using the regex built from its pattern."""
    literal, entity_first = _split_pattern(post.pattern)
    return ep.extract_entity(post.text, regex_entity(literal, entity_first), entity_first)


# Keep only the posts for which ep.has_platform is truthy (posts naming a platform).
has_platform = posts_inferred_patterns.text.swifter.apply(lambda x: ep.has_platform(x))
posts_inferred_patterns = posts_inferred_patterns[has_platform]

# .assign() returns a new frame instead of writing a column into the filtered
# slice above, which avoids pandas' SettingWithCopyWarning.
posts_inferred_patterns = posts_inferred_patterns.assign(
    platform=posts_inferred_patterns.swifter.apply(_platform_of, axis = 1)
)
# Remove the rows for which no platform was extracted.
# Note: dropna() without `subset` also drops rows with a missing value in any other column.
posts_inferred_patterns = posts_inferred_patterns.dropna()

posts_inferred_patterns = posts_inferred_patterns.assign(
    entity=posts_inferred_patterns.swifter.apply(_entity_of, axis = 1)
)
# Remove the rows for which no entity was extracted (same dropna() caveat as above).
posts_inferred_patterns = posts_inferred_patterns.dropna()

The result is in the form of:

entity | platform | text | date | pattern | id | score | nb_comments
---|---|---|---|---|---|---|---

##  Pattern precision

In [None]:
# TODO: not a priority for now, not that important

## Ban dates extraction 

In [1]:
# TODO: specify that, after this step, the extraction also has to be done by hand

# Google Knowledge Graph linking 

In [None]:
# Placeholder: replace with the filtered dataset as a DataFrame exposing an `entity` column.
final_dataset = ''' Complete here with (manually and automatically) filtered dataset '''

# Look up one id per entity through gid.link_google_id and store it in `g_id`.
google_ids = final_dataset['entity'].swifter.apply(gid.link_google_id)
final_dataset['g_id'] = google_ids

# Drop the rows containing a missing value in any column, including rows left without an id.
final_dataset = final_dataset.dropna()

The resulting DataFrame needs to be manually checked to verify that each collected Google ID actually refers to the entity under consideration.