In [2]:
# Standard library
import html
import re

# Data handling
import numpy as np
import pandas as pd

# PySpark
import pyspark
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, BooleanType, StructType, StructField, FloatType, ArrayType, MapType

# String handling
import tldextract
from dateutil.parser import parse

# Calculating scores
import profanity_check
from empath import Empath
lexicon = Empath()

In [None]:
# Root directory of the input data. The trailing slash is required because the
# paths below are built by plain string concatenation.
DATA_DIR = '../data/'
SPK_ATTR_PATH = DATA_DIR + 'speaker_attributes.parquet'
SPK_ATTR_WITH_LABELS_PATH = DATA_DIR + 'speaker_attributes_with_labels.parquet'

# Preprocessing
In this notebook we carry out all the steps needed for the analysis of profanity in quotations. This includes cleaning the data, identifying profane quotations, calculating scores for Empath categories, and enriching the data.

In [3]:
# Settings of the local Spark session (24 local worker threads, see setMaster below).
spark_settings = [
    ('spark.driver.memory', '40G'),
    ('spark.driver.maxResultSize', '40G'),
    ('spark.ui.port', 4866),
    ('spark.sql.execution.arrow.pyspark.enabled', True),
]
conf = pyspark.SparkConf().setMaster("local[24]").setAll(spark_settings)

# The Spark NLP package is added on top of the settings above when the session is built.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4')
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel('ERROR')  # ERROR, WARN, INFO, DEBUG, ...
spark

21/12/17 00:18:25 WARN Utils: Your hostname, iccluster039 resolves to a loopback address: 127.0.1.1; using 10.90.38.15 instead (on interface ens786f0)
21/12/17 00:18:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/culjak/.conda/envs/speaker-disambiguation/lib/python3.9/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/culjak/.ivy2/cache
The jars for the packages stored in: /home/culjak/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7ea28b9e-aa19-4b8d-8b2b-8fe4b50c7aec;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.3.4 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lomb

In [None]:
# Load the quotations dataset and print its schema.
DATA_DIR = '../data/'
quotes_path = DATA_DIR + 'quotes.parquet'
quotes = spark.read.parquet(quotes_path)
quotes.printSchema()

In [None]:
def url_list_to_tlds(urls):
    """
    Maps a list of URLs to the list of their domain names.

    Each URL is reduced to the `domain` field of `tldextract.extract(url)`.

    Returns None if `urls` is None or empty, or if the extraction fails for any URL
    (best effort: a single malformed value should not abort the whole job).

    NOTE(review): `.domain` does not include the public suffix (it is 'wikia' for a wikia.com URL),
    while the filters in this notebook compare the `domains` column against 'wikia.com' - confirm
    which tldextract field is intended where this function is applied.
    """
    try:
        if urls is None or len(urls) == 0:
            return None
        domains = [tldextract.extract(url).domain for url in urls]
        return domains or None
    except Exception:
        # Deliberately broad, but unlike a bare `except:` it lets KeyboardInterrupt and
        # SystemExit propagate so that a running job can still be cancelled.
        return None

## Cleaning

In [13]:
# Show a sample of 100 quotations whose `domains` array contains wikia.com.
is_wikia = F.array_contains('domains', 'wikia.com')
quotes.select('quotation').where(is_wikia).show(100, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
# Number of wikia.com quotations, followed by the 50 speakers they are most often attributed to.
wikia_quotes = quotes.where(F.array_contains('domains', 'wikia.com'))
print("Total quotes on Wikia:", wikia_quotes.count())
(
    wikia_quotes
    .groupby('speaker')
    .count()
    .sort('count', ascending=False)
    .show(50)
)

                                                                                

Total quotes on Wikia: 595558




+--------------------+------+
|             speaker| count|
+--------------------+------+
|                None|401619|
|         Hisao Nakai|  2270|
|          John Tracy|  2247|
|         Roger Ebert|  2110|
| Alexander the Great|  1962|
|         Casey Jones|  1098|
|        April O'Neil|  1069|
|     Queen of Hearts|   958|
|           Lady Gaga|   636|
|        Peter Parker|   601|
|             Big Mac|   591|
|    Vlad the Impaler|   553|
|   Dennis the Menace|   550|
|     Jenkins , David|   508|
|           Ruby Rose|   491|
|            la porte|   481|
|        Harry Potter|   466|
|       Jessica Jones|   436|
|      Captain Marvel|   397|
|          Sean Nolan|   396|
|Louisa Connolly-B...|   388|
|         Arthur Read|   385|
|     Cassandra Clare|   379|
|              Shen ?|   350|
|          Poison Ivy|   340|
|       Ebert , Roger|   326|
|             Peter ?|   317|
|           Adam West|   316|
|           Volume 10|   304|
|         Jeff Probst|   296|
|        S

                                                                                

We decided to exclude quotes from Wikia.com since
 - the majority of them are attributed to no speaker at all, and some of the others are attributed to fictional characters, e.g. Hisao Nakai, John Tracy, Casey Jones
 - some quotations are quite strange, as can be seen from the sample of 100 quotations from the wikia.com domain shown above

In [59]:
# Start the removal list with the IDs of all the quotations coming from wikia.com.
wikia_filter = F.array_contains('domains', 'wikia.com')
to_remove = quotes.select('quoteID').where(wikia_filter)

Furthermore, we decided to remove all the quotations that (listed in order of appearance in the regex):
 - contain HTML tags
 - start with an equals sign
 - end with an equals sign
 - start with "Bilde?"
 - start with a URL, i.e. with the "https:" or "http:" string
 - start with the "file:" string
 - contain an equality with non-letter characters (quotations containing an equality where letters are on both sides may be valid, e.g. Sharing = Caring)
 - contain an unopened closing bracket or an unclosed opening bracket
 - contain CSS tags
 - contain HTML entities, e.g. &nbsp;
 - can be parsed as dates
 
5 examples for each of the issues are listed below the following cell

In [None]:
# CSS tags are fetched from https://www.w3.org/TR/CSS2/propidx.html
css_tags = '(azimuth|background-attachment|background-color|background-image|background-position|background-repeat|background|border-collapse|border-color|border-spacing|border-style|border-top|border-right|border-bottom|border-left|border-top-color|border-right-color|border-bottom-color|border-left-color|border-top-style|border-right-style|border-bottom-style|border-left-style|border-top-width|border-right-width|border-bottom-width|border-left-width|border-width|border|bottom|caption-side|clip|color|content|counter-increment|counter-reset|cue-after|cue-before|cue|cursor|direction|display|elevation|empty-cells|float|font-family|font-size|font-style|font-variant|font-weight|font|height|left|letter-spacing|line-height|list-style-image|list-style-position|list-style-type|list-style|margin-right|margin-left|margin-top|margin-bottom|margin|max-height|max-width|min-height|min-width|orphans|outline-color|outline-style|outline-width|outline|overflow|padding-top|padding-right|padding-bottom|padding-left|padding|page-break-after|page-break-before|page-break-inside|pause-after|pause-before|pause|pitch-range|pitch|play-during|quotes|richness|right|speak-header|speak-numeral|speak-punctuation|speak|speech-rate|stress|table-layout|text-align|text-decoration|text-indent|text-transform|top|unicode-bidi|vertical-align|visibility|voice-family|volume|white-space|widows|width|word-spacing|z-index):'

# One alternative per kind of spurious quotation. Raw strings keep the regex escapes
# (\[, \{, \?, ...) away from Python's own escape-sequence handling.
regex = '|'.join([
    r'<.*>',             # contains an HTML tag
    r'^=',               # starts with an equals sign
    r'=$',               # ends with an equals sign
    r'^Bilde\?',         # starts with the literal "Bilde?" (the question mark must be escaped)
    r'^https?:',         # starts with a URL
    r'^file:',           # starts with "file:"
    r'= [^ a-zA-Z]',     # equality followed by a non-letter character
    r'^[^\[\{\(]*\]',    # "]" that is not preceded by any opening bracket
    r'\[[^\]\}\)]*$',    # "[" that is not followed by any closing bracket
    r'^[^\{\[\(]*\}',    # "}" that is not preceded by any opening bracket
    r'\{[^\}\]\)]*$',    # "{" that is not followed by any closing bracket
    r'^[^<]*>',          # ">" that is not preceded by "<"
    r'<[^>]*$',          # "<" that is not followed by ">"
    css_tags,            # contains a CSS property followed by a colon
    r'&.*;',             # contains an HTML entity
])

# Add the IDs of all the quotations matching the clean-up regex to the removal list.
regex_matches = quotes.select('quoteID').where(F.col('quotation').rlike(regex))
to_remove = to_remove.union(regex_matches)

def is_date(string):
    """
    Returns True if the whole `string` can be parsed as a date (strict, non-fuzzy parsing), False otherwise.

    None is never a date; this keeps a missing value from raising inside the parser.

    Source: https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
    """
    if string is None:
        return False
    try:
        parse(string, fuzzy=False)
        return True
    except (ValueError, OverflowError):
        # ValueError: the string is not a date.
        # OverflowError: the parser found numeric fields too large to be represented.
        return False


is_date_udf = F.udf(is_date, BooleanType())
date_like_ids = quotes.select('quoteID').where(is_date_udf('quotation'))
# Save all the quote IDs that should be removed
to_remove.union(date_like_ids).write.parquet('to_remove.parquet')

In [44]:
# Show 5 example quotations for each kind of issue.
# The bracket regex must not end with '|': an empty alternative matches every quotation.
regexs = [r'<.*>', r'^=', r'^Bilde\?', r'^https?:', r'^file:', r'= [^ a-zA-Z]+',
          r'^[^\[\{\(]*\]|\[[^\]\}\)]*$|^[^\{\[\(]*\}|\{[^\}\]\)]*$', css_tags, r'&.*;']
for rexp in regexs:
    print('Regular expression:', rexp)
    quotes.select('quotation').where(F.col('quotation').rlike(rexp)).show(5, truncate = False)
    print()

Regular expression: <.*>
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                                                                                                                         |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|x E9:? < t5 H@F=5 36 E96 7: CDE E @ E6 = = J@F[ E @ 86E E96; @ 3 =: < 6 E92E [ 7@C E92E C62D@?[: D?' E H92E J@F’C6 ECJ:? 8 E @ > 2 < 6 92AA6? ]

                                                                                

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|= = = = Categoría: YG Entertainment Categoría: Warner Music Taiwan Categoría: KRapero Categoría: KSolista Categoría: KCantante [ [ Categoría: KBailar...         |
|= @@ <:? 8:? E @ A@DD:3=6 = 625D ]                                                                                                                               |
|= Hector is the lead mechanic at Sevenjet on the Piaggio aircraft. Extremely knowledgeable and relentless on keeping the aircraft in perfect operating condition.|
|= / = discrimin

                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                                                       |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Bilde? site = du & amp; date = 20170309 & amp; category = news01 & amp; artno = 309009999 & amp; ref = ph & amp; item = 1 & amp; newtbl = 1 & amp; maxw = 650 & amp; ts = 1489116838 & amp; imageversion = 16by9|
|Bilde? site = cj & amp; date = 20170318 & amp; category = news01 & amp; artno = 318009999 & amp; ref = ph & amp; item = 1 & amp; newtbl = 1 & amp; maxw = 6

                                                                                

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                 |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|file: / / 10.86.178.62 / Users/Gary% 20Dvorchak/Desktop/Blueshirt% 20Group% 20Asia/Clients/China% 20Rapid% 20Finance% 20 (XRF) / Earnings/2018-1Q/ralph@blueshirtgroup.com|
|file: / / / C | / Users/mexic/AppData / Roaming/Adobe/Dreamweaver CC 2017/en _ US/Configuration/Temp / Assets/eamD3F6. tmp/images/profile. png                            |
|file: / / localhost/Users/mmccarty / Library/Caches/TemporaryItems / msoclip/0/clip _ image002. png                                   

[Stage 200:>                                                        (0 + 1) / 1]

+---------------------------------------------------------+
|quotation                                                |
+---------------------------------------------------------+
|background-color: # 808080;                              |
|font-family: Cambria; font-size:16 px; line-height:22 px;|
|background: #C 0C0C0; width:140 px                       |
|vertical-align: top; width:150 px;                       |
|text-align: right; background: # 2B9AAA;                 |
+---------------------------------------------------------+
only showing top 5 rows


Regular expression: &.*;
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                  

                                                                                

In [4]:
# Reload the removal list persisted as 'to_remove.parquet' and drop the listed quote IDs from `quotes`.
to_remove = spark.read.parquet('to_remove.parquet')
quotes = quotes.join(to_remove, on='quoteID', how='leftanti') # We use left anti join to remove spurious quotations

## Censorship removal

The list of the words considered for censorship removal was taken from https://www.cs.cmu.edu/~biglou/resources/. The list was filtered by running profanity_check on all the words, selecting those with profanity probability higher than 0.9 and then manually removing ones which are unlikely to be censored or appear in the quotations. The method for censorship removal is based on regular expressions and consists of two stages. In the first stage we detect censorship by constructing a large regular expression that covers two types of censorship that we consider, the "middle" censorship and the "end" censorship:
 - The "middle" censorship includes all the censorship styles that keep the first and the last letter of the word and add an arbitrary number of censorship characters in the middle, e.g. F**k.
 - In the censorship styles that we categorize as the "end" censorship, first $n$ characters of the word are visible, while the other $l - n$ ($l$ is the number of characters in the word) characters, are replaced with less than or equal to $l - n$ censorship characters e.g. F--.

In the second stage we first construct a dictionary where the keys are profanities and the values are regular expressions that represent the profanity in different censorship styles. We then iterate through the keys of the dictionary, replacing each match of a dictionary value (in this case the values are regular expressions) with the corresponding key.

In [7]:
# Read the candidate word list: one word per line, stripped of newlines and spaces.
with open('bad-words.txt', 'r') as f:
    bad_words = [line.strip('\n ') for line in f]

In [8]:
# Keep only the words whose profanity probability is higher than 0.9.
scores = profanity_check.predict_prob(bad_words)
bad_words_filtered = []
for word, score in zip(bad_words, scores):
    if score > 0.9:
        bad_words_filtered.append(word)
print(' '.join(bad_words_filtered))

arse ass asses asshole assholes bastard bitch bitches bitching boobs bullshit butt butt-fuck butt-fucker butt-fuckers cock cocksucker crack-whore crap cum cunt damn dick dickhead die dipshit dumb dumbass dyke fag faggot fuck fucked fucker fuckers fuckhead fuckin fucking fucks fucktard fuk gay goddamn hell hoes homosexual idiot jackass kill looser loser moron motherfucker negro negro's nig nigga niggah nigger niggers nigger's penis piss pussies pussy queer racist rapist retard retarded scum semen shit shitty stupid suck twat wanker whore wtf


In [9]:
# Words removed manually from the filtered list.
words_to_remove = ['butt', 'butt-fuck', 'butt-fucker', 'butt-fuckers', 'crack-whore', 'cum', 'die', 'dumb', 'dyke', 'fuk', 'gay',
                   'hell', 'hoes','homosexual', 'idiot', 'jackass', 'kill', 'looser', 'loser', 'moron', "negro's", 'piss', 'queer',
                   'racist', 'rapist', 'retard', 'retarded', 'scum', 'semen', 'stupid', 'suck', 'wanker', 'wtf']
# Build a new list instead of calling list.remove(): the cell can then be re-run safely,
# whereas remove() raises ValueError once the words are already gone.
bad_words_filtered = [word for word in bad_words_filtered if word not in words_to_remove]

We firstly define some helper functions for censorship removal regex construction

In [153]:
def possible_positions(string):
    """
    Wraps the regular expression `string` so that it only matches a whole word of a text.

    Returns a tuple of four regular expressions, one for each position the word can take:
    in the middle of the text, at its start, at its end, or being the whole text.
    Each of them contains exactly one capturing group, which captures the optional
    punctuation that follows the word.
    """
    # The word may be followed by a single punctuation character (including a dash, an apostrophe
    # or a quotation mark), or by three or more of '.', '!' and '?', before a space or the end of the text.
    # Raw string: the regex escapes must not be interpreted as Python escape sequences.
    punctuation = r'''(["'?!,\.\-]|[\.!?]{3,})?'''
    return (f' {string}{punctuation} ',  # The string is in the middle of a text
            f'^{string}{punctuation} ',  # A text starts with the string
            f' {string}{punctuation}$',  # A text ends with the string
            f'^{string}{punctuation}$')  # Text contains only the string

def upper_lower(word):
    """
    Generates a regular expression that covers all the possible casings of the word.

    Every cased character becomes a two-character class, e.g. 'a' -> '[aA]'. Characters without
    a distinct other case (apostrophes, dashes, digits, ...) are emitted as escaped literals, so
    that regex metacharacters such as '^' or ']' cannot corrupt a character class.
    """
    parts = []
    for char in word:
        swapped = char.swapcase()
        if swapped != char:
            parts.append(f'[{char}{swapped}]')
        else:
            parts.append(re.escape(char))
    return ''.join(parts)

# Example: every letter is expanded to a class holding both of its casings ([Aa][Dd][Aa])
print(upper_lower('ADA'))

def middle_censorship(word):
    """
    Returns the regular expressions for all the "middle" censorship variants of the word, i.e. the cases where
    at least the first and the last character of the word are preserved and the characters between the visible
    prefix and suffix are replaced by one or more censorship characters. e.g f**k

    Each variant is expanded with `possible_positions`, so it contributes four regular expressions.
    """
    censorship = []
    for hidden in range(1, len(word)):  # number of censored characters
        for prefix_len in range(1, len(word) - hidden):  # number of visible leading characters
            prefix = upper_lower(word[:prefix_len])
            suffix = upper_lower(word[prefix_len + hidden:])
            # Raw f-string: the backslashes belong to the regex, not to Python.
            censorship.extend(possible_positions(rf'{prefix} ?[_\-\.\*@#%]+ ?{suffix}'))
    return censorship
    
    
def end_censorship(word):
    """
    Returns the regular expressions for all the "end" censorship variants of the word, i.e. the cases where
    the first character is always preserved, but the last one is never preserved. e.g. f***

    The hidden tail is replaced by at most as many censorship characters as it has letters. Dots only count
    as censorship when at least two of them are used, so they are not accepted for a one-letter tail.
    Each variant is expanded with `possible_positions`, so it contributes four regular expressions.
    """
    censorship = []
    for visible in range(1, len(word)):  # number of visible leading characters
        prefix = upper_lower(word[:visible])
        hidden = len(word) - visible
        # Raw f-strings: the backslashes belong to the regex; doubled braces are literal regex braces.
        if hidden == 1:
            pattern = rf'{prefix} ?[_\-\*@#%]'
        else:
            pattern = rf'{prefix} ?([_\-\*@#%]{{1,{hidden}}}|\.{{2,{hidden}}}) ?'
        censorship.extend(possible_positions(pattern))
    return censorship
    
# Example: the regular expressions generated for "Fuck", first for "middle" and then for "end" censorship
print(*middle_censorship('Fuck'), sep='\n')
print()
print(*end_censorship('Fuck'), sep='\n')

[Aa][Dd][Aa]
 [Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})?$
 [Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
 [Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$

 [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
 [Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})? 
^[Ff][uU]

In [155]:
# "Middle" censorship regexes, grouped by the word they resolve to.
middle_censorship_dict = {}
for word in bad_words_filtered:
    middle_censorship_dict[word] = middle_censorship(word)

# Flat list of all the censorship regexes, used to detect censored quotations.
censorships = [pattern for patterns in middle_censorship_dict.values() for pattern in patterns]

For "end censorship", we consider a smaller list of words in order to reduce ambiguity and false positives. We heuristically remove the words that we assume are unlikely to be censored, also taking into account their frequency in the uncensored quotes, which is computed below.

In [11]:
# Number of quotations in which each profanity appears uncensored, as a whole word.
profanity_freqs = {
    word: quotes.where(F.col('quotation').rlike('|'.join(possible_positions(word)))).count()
    for word in bad_words_filtered
}


for word in profanity_freqs:
    print(word, profanity_freqs[word])



arse 3400
ass 37174
asses 3527
asshole 3517
assholes 1338
bastard 3113
bitch 14618
bitches 3349
bitching 830
boobs 6106
bullshit 8069
cock 1631
cocksucker 92
crap 20457
cunt 663
damn 45679
dick 5758
dickhead 466
dipshit 67
dumbass 686
fag 908
faggot 671
fuck 27332
fucked 6540
fucker 453
fuckers 398
fuckhead 36
fuckin 4516
fucking 40680
fucks 951
fucktard 4
goddamn 3770
motherfucker 1786
negro 1157
nig 58
nigga 3403
niggah 5
nigger 768
niggers 336
nigger's 11
penis 7075
pussies 336
pussy 3073
shit 54585
shitty 3483
twat 288
whore 2339


                                                                                

In [194]:
# Words that are not considered for "end" censorship:
#  - fucker and fucked would have the same end_censorship regex, so we remove fucker since fucked appears more frequently
#  - damn is quite frequent, but we believe it is unlikely to be censored. Thus, we remove it in this stage.
words_to_remove = {'boobs', 'fag', 'goddamn', 'fucker', 'damn', 'dipshit', 'niggah', 'nig'}
bad_words_end = list(bad_words_filtered)
for word in words_to_remove:
    bad_words_end.remove(word)

end_censorship_dict = dict((word, end_censorship(word)) for word in bad_words_end)

# Add the "end" censorship regexes to the list used for detecting censored quotations.
for patterns in end_censorship_dict.values():
    censorships += patterns

In [159]:
censorship_blended = '|'.join(censorships)

# Pre-filter with a small regex before applying the large blended one.
# If dot was used as a censorship character, we consider only censorships that include more than one dot.
preliminary_filtered_quotes = quotes.where(F.col('quotation').rlike(r'[_\-\*@#%]|\.\.'))
# `quotation` has to be kept next to `quoteID`: the following cells select, filter and rewrite that column.
censored_quotes = preliminary_filtered_quotes.select('quoteID', 'quotation').where(F.col('quotation').rlike(censorship_blended))

In [313]:
# Collect the censored quotations into a pandas DataFrame.
censored_quotes_pd = (
    censored_quotes
    .select('quoteID', 'quotation')
    .toPandas()
)

We noticed that, in some cases, the "a little bit" syntagm is followed by " --" which makes our method wrongly identify it as a censored quotation. Thus, we filter out all such quotations.

In [316]:
# Drop the quotations in which "a little bit" is followed by " --" (wrongly detected as censorship).
is_false_positive = F.col('quotation').rlike(' a little bit -- ')
censored_quotes = censored_quotes.where(~is_false_positive)

**Resolving "middle" censorship.** Here, we sort the obscene words by their frequency in the uncensored quotes in descending order. This is done so that, in case of ambiguity, some censored string is replaced with a word that appears more frequently. However, although e.g. "boobs" appear more frequently in the quotes than "bitches", we firstly resolve "bitches" because it is more likely to be censored. A change in order is performed by modifying word frequencies.

In [320]:
# Raise the frequency of "bitches" above the one of "boobs" for the frequency-based ordering.
# NOTE: in-place and not idempotent - re-running this cell adds the "boobs" count once more.
profanity_freqs['bitches'] += profanity_freqs['boobs']

In [322]:
# Resolve the most frequent profanities first, so that an ambiguous censored string is replaced
# with the word that appears more frequently.
for word, _ in sorted(profanity_freqs.items(), key=lambda x: -x[1]):
    word_censorships = middle_censorship_dict[word]
    if not word_censorships:
        # No "middle" variant exists for this word; an empty pattern would match at every position.
        continue
    # Each regex contains exactly one capturing group (the optional trailing punctuation). Only the
    # alternative that matched has a non-empty group, so referencing all of them ($1$2...$n) re-inserts
    # the punctuation. Built inline because `n_groups` is only defined in a later cell.
    punctuation_groups = ''.join(f'${i}' for i in range(1, len(word_censorships) + 1))
    censored_quotes = censored_quotes.withColumn(
        'quotation',
        F.regexp_replace('quotation', '|'.join(word_censorships), ' ' + word + punctuation_groups + ' '))

**Resolving "end" censorship.** We again sort the obscene words by their frequency in the uncensored quotes in descending order. Here, we swap the order of "fucking" and "fuck" even though "fucking" appears more frequently. We do that because otherwise, short censored words starting with f will incorrectly be replaced with "fucking" (although it may not always be incorrect).

In [323]:
# Make "fuck" outrank "fucking" in the frequency-based ordering.
# NOTE: both statements mutate profanity_freqs in place, so this cell is not idempotent.
profanity_freqs['fuck'] += profanity_freqs['fucking']
profanity_freqs['bitches'] -= profanity_freqs['boobs'] # Return bitches frequency to its original count, since now we ignore "boobs"

In [324]:
# This function looks very strange and it's a bit hard to explain so I will start with an example.
# Let '(aab|bc)|(bdd|dc)' be some regex. Let's say we want to append 'a' to all the substrings matching
# this regex. We would naturally start by calling re.sub('(aab|bc)|(bdd|dc)', ... but then we stop
# because we don't know which group should we reference to preserve the part of the string in brackets.
# If we call re.sub('(aab|bc)|(bdd|dc)', '\1a', string) we will always be referring to the group '(aab|bc)',
# in which case if in the 'string' variable we have e.g. 'bdd', we will not be able to preserve the string,
# and the result would be just the letter a since the first group is an empty string because it wasn't matched.
# The fact that the unmatched group is empty leads us to the solution. We have to refer to all the groups in
# the regular expression so the solution would be re.sub('(aab|bc)|(bdd|dc)', '\\1\\2a', string). Now we
# cover all the possible cases. This is illustrated in the cell below. In our case, something similar happens,
# if a profanity ends in punctuation, we want to preserve it so for each regular expression that we concatenated
# when building the one for censorship removal, we have to refer to a group that matches punctuation.
# Essentially, n_groups generates n groups: \1\2...\n. Thus, n has to be specified so that it matches the number
# of groups in the regular expression. Furthermore, we may not want to preserve all the groups so we allow
# the possibility of defining a function that selects only the groups that we need. If we build a regular expression
# by concatenating the regular expressions shown few cells above:
# [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
# ^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
#  [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
# ^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
#  [Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})? 
# ^[Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})? 
#  [Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})?$
# ^[Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})?$
#  [Ff][uU][cC] ?[_\-\*@#%](["'?!,\.\-]|[\.!?]{3,})? 
# ^[Ff][uU][cC] ?[_\-\*@#%](["'?!,\.\-]|[\.!?]{3,})? 
#  [Ff][uU][cC] ?[_\-\*@#%](["'?!,\.\-]|[\.!?]{3,})?$
# ^[Ff][uU][cC] ?[_\-\*@#%](["'?!,\.\-]|[\.!?]{3,})?$
# we do not want to preserve the censorship characters captured by the first group in each regular expression
# so we have to specify a filtering function that filters out the unwanted group indices. In this example
# we want to keep groups at indices 2, 4, 6, 8, 10, 12, 14, 16 and 17, 18, 19, 20. In a more general situation,
# according to the definition of our regex, for a word of length n we would want to keep all the even indices
# less than or equal to (n - 2) * 8, and all the indices greater than (n - 2) * 8.

def end_censorship_regex_only_last_groups_per_row(n):
    """
    Returns a predicate over group indices for the "end" censorship regex of a word of length n.

    The predicate rejects the odd indices up to (n - 2) * 8 and accepts every other index.
    """
    threshold = (n - 2) * 8

    def keep_group(x):
        return x > threshold or x % 2 != 1

    return keep_group

def n_groups(n, referencing_char='$', filter_func=None):
    """
    Builds the concatenated references to the groups 1..n, e.g. '$1$2$3' for n = 3.

    referencing_char defaults to '$' because pyspark uses it to reference the groups.
    If filter_func is given, only the indices for which it returns a truthy value are referenced.
    """
    references = []
    for index in range(1, n + 1):
        if filter_func is None or filter_func(index):
            references.append(referencing_char + str(index))
    return ''.join(references)

In [21]:
# Referencing only the first group loses a match made by the second group ('a'),
# while referencing both groups preserves the matched text ('bdda').
# `re` comes from the import cell at the top of the notebook.
pattern = '(aab|bc)|(bdd|dc)'
for replacement in ('\\1a', '\\1\\2a'):
    print(re.sub(pattern, replacement, 'bdd'))

a
bdda


In [325]:
# Resolve "end" censorship, starting with the most frequent words.
words_by_frequency = sorted(profanity_freqs.items(), key=lambda x: -x[1])
for word, _ in words_by_frequency:
    if word in words_to_remove:
        continue
    word_censorships = end_censorship_dict[word]
    # Number of capturing groups in the joined regex of this word
    n = (len(word) - 2) * 8 + 4
    keep_group = end_censorship_regex_only_last_groups_per_row(len(word))
    replacement = ' ' + word + n_groups(n, filter_func=keep_group) + ' '
    censored_quotes = censored_quotes.withColumn(
        'quotation',
        F.regexp_replace('quotation', '|'.join(word_censorships), replacement))
censored_quotes.select('quotation').show(100, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                    

# Calculating scores
## Profanity scores

In [345]:
# Profanity scores of the censored quotations, calculated on the text with the censorship removed.
censored_quotes_pd = censored_quotes.select('quoteID', 'quotation').toPandas()
uncensored_text_scores = profanity_check.predict_prob(censored_quotes_pd['quotation'])
censored_quotes_pd['scores'] = uncensored_text_scores

                                                                                

In [5]:
# Calculating profanity scores for the quotations. We split the dataset for each year in half to save RAM.
dfs = []
for year in range(2015, 2021):
    print(str(year) + '...')
    # The two regexes select the months 01-06 and 07-12 of the year.
    for i, months in enumerate(('0[1-6]', '(0[7-9]|1[0-2])')):
        print('\t' + str(i + 1) + '. half...')
        partition = quotes.where(F.col('date').rlike(f'{year}-{months}')).select('quoteID', 'quotation').toPandas()
        partition['scores'] = profanity_check.predict_prob(partition['quotation'])
        # Only the IDs and the scores are kept; the quotation text is dropped.
        dfs.append(partition[['quoteID', 'scores']])

In [45]:
def update_with_censored_scores(all_scores, censored_scores):
    """
    Overwrite the rows of `all_scores` with the matching rows of `censored_scores`.

    Rows are matched on `quoteID`. Values computed on the quotations before censorship
    was removed are replaced by the values computed after censorship removal.
    Rows of `censored_scores` whose `quoteID` is absent from `all_scores` are ignored.
    Returns a new frame with `quoteID` restored as a regular column.
    """
    merged = all_scores.set_index('quoteID')
    # DataFrame.update modifies `merged` in place, aligning on the quoteID index
    merged.update(censored_scores.set_index('quoteID'))
    return merged.reset_index()

In [352]:
# Keep only the identifier and the profanity score of the quotations in `censored_quotes_pd`;
# the quotation text itself is not needed for merging the scores.
scores_censored = censored_quotes_pd.loc[:, ['quoteID', 'scores']]

In [357]:
# Stack the per-partition score frames and flag their rows with censored = 0;
# the scores in `scores_censored` are flagged with censored = 1.
scores_all = pd.concat(dfs).assign(censored=0)
scores_censored['censored'] = 1

In [359]:
# Rows present in `scores_censored` replace the matching rows of `scores_all`
# (both the score and the `censored` flag), then the result is persisted with Spark.
scores_all = update_with_censored_scores(scores_all, scores_censored)
scores_spark = spark.createDataFrame(scores_all)
scores_spark.write.parquet('scores.parquet')

## Expanding the set of profane quotations
Profanity check is not perfect. Although the author reports high scores on the Wikipedia comments dataset in the [documentation](https://pypi.org/project/profanity-check/), we have no guarantee that the model performs well on data from a different distribution. Thus, we conservatively set the profanity score threshold to 0.9 to reduce the number of false positives as much as possible. We also label all the censored quotes as profane, even though our censorship removal method is also not perfect. Finally, we use the same bad words list as before and keep only the words whose profanity score, as calculated by profanity_check, exceeds 0.5. We then filter this set manually, removing implausible words, and label every quotation that contains one of the remaining words as profane.

In [10]:
# Score every entry of the bad words list on its own and keep the ones
# whose profanity probability is above 0.5.
scores = profanity_check.predict_prob(bad_words)
bad_words_filtered = [
    candidate
    for candidate, probability in zip(bad_words, scores)
    if probability > 0.5
]
print(' '.join(bad_words_filtered))

anal anus arse arsehole ass asses asshole assholes balls bastard beaner bitch bitches bitching blacks blow boobs bugger bullshit butt butt-bang butt-fuck butt-fucker butt-fuckers cancer chink cock cocksucker commie coon crack-whore crap crappy cum cunt dammit damn destroy dick dickhead die dildo dipshit dirty dumb dumbass dyke fag faggot fart fat feces fraud fu fuc fuck fucked fucker fuckers fuckhead fuckin fucking fucks fucktard fuk gay goddamn hell ho hoes hole homo homosexual idiot jackass jew kill killed looser loser masturbate masturbating molester moron motherfucker motherfucking naked nazi negro negro's nig nigga niggah niggaz nigger niggers nigger's nipple penis penises piss pissed pisses pissing poop porn prick prostitute puke pussies pussy queer racist rape rapist redneck retard retarded screw scum semen sex sexual shit shite shithead shits shitting shitty shoot sick slave slut sluts spic spit stupid suck swallow tit tits titties tosser tranny twat vagina vomit wanker whitey 

In [17]:
# Manually selected words that are excluded from the filtered bad words list.
to_remove = {
    'balls', 'blacks', 'blow', 'cancer', 'destroy', 'die', 'dirty', 'fat', 'feces',
    'fraud', 'gay', 'hole', 'homo', 'homosexual', 'jew', 'kill', 'killed', 'molester',
    'naked', 'nazi', 'nipple', 'porn', 'prostitute', 'puke', 'queer', 'racist', 'rape',
    'rapist', 'sex', 'sexual', 'shoot', 'sick', 'slave', 'spit', 'swallow', 'vomit',
}

# From here on `bad_words_filtered` is a set without the excluded words.
bad_words_filtered = {word for word in bad_words_filtered if word not in to_remove}

In [22]:
# Collect the regexes that `possible_positions` produces for every remaining bad word
# and combine them into a single alternation.
bad_words_regexes = []
for word in bad_words_filtered:
    bad_words_regexes.extend(possible_positions(word))

bad_words_regex = '|'.join(bad_words_regexes)

# A quotation is labelled profane (profanity = 1) when any of the following holds:
#   * its profanity score is at least 0.9,
#   * it is flagged as censored,
#   * it matches the bad words regex.
is_profane = (
    (F.col('scores') >= 0.9)
    | (F.col('censored') == 1)
    | (F.col('quotation').rlike(bad_words_regex))
)
(
    quotes.join(profanity, on='quoteID')
    .withColumn('profanity', F.when(is_profane, 1).otherwise(0))
    .select('quoteID', 'profanity', 'censored')
    .write.parquet(DATA_DIR + 'profanity_expanded.parquet')
)


                                                                                

## Empath

In [37]:
# Load the stored quotations from the `censorship_removed` dataset.
censored_quotes = spark.read.parquet(f'{DATA_DIR}censorship_removed.parquet')

In [34]:
lexicon = Empath()


def _empath_scores(text):
    """Return the normalized Empath analysis of `text` as computed by `lexicon.analyze`."""
    return lexicon.analyze(text, normalize=True)


# Spark UDF wrapping the Empath analysis; the result is declared as a string -> float map.
empath_udf = F.udf(_empath_scores, MapType(StringType(), FloatType()))

In [4]:
# Load the stored Empath datasets: `empath.parquet` and `empath_censored.parquet`.
empath_all = spark.read.parquet(f'{DATA_DIR}empath.parquet')
empath_censored = spark.read.parquet(f'{DATA_DIR}empath_censored.parquet')

                                                                                

In [None]:
# One output dataset per year: keep the quotations whose date starts with that year,
# then compute the Empath map for each of them.
for year in range(2015, 2021):
    quotes_of_year = quotes.where(F.col('date').rlike(f'^{year}'))
    empath_of_year = quotes_of_year.select('quoteID', empath_udf('quotation').alias('empath'))
    empath_of_year.write.parquet(DATA_DIR + f'empath_{year}.parquet')

In [38]:
# Same per-year Empath computation, applied to the quotations in `censored_quotes`.
for year in range(2015, 2021):
    censored_of_year = censored_quotes.where(F.col('date').rlike(f'^{year}'))
    empath_censored_of_year = censored_of_year.select('quoteID', empath_udf('quotation').alias('empath'))
    empath_censored_of_year.write.parquet(DATA_DIR + f'empath_censored_{year}.parquet')

                                                                                

In [42]:
# Alphabetically sorted keys of the mapping that Empath returns for an empty text.
categories = sorted(lexicon.analyze(''))

In [55]:
# Projection that turns the `empath` map column into one column per category.
# It does not depend on the year, so it is built once.
category_columns = [F.col('empath').getItem(name).alias(name) for name in categories]

for year in range(2015, 2021):
    original_path = f'{DATA_DIR}empath_{year}.parquet'
    censored_path = f'{DATA_DIR}empath_censored_{year}.parquet'

    empath_original = spark.read.parquet(original_path).select('quoteID', *category_columns)
    empath_censored = spark.read.parquet(censored_path).select('quoteID', *category_columns)

    # Merge in pandas: rows from the censored dataset replace the matching rows (by quoteID).
    empath_all_pd = empath_original.toPandas()
    empath_censored_pd = empath_censored.toPandas()
    empath_all_pd = update_with_censored_scores(empath_all_pd, empath_censored_pd)
    empath_all_pd.to_parquet(f'{DATA_DIR}empath_all_{year}.parquet')

                                                                                

In [None]:
# Append the merged per-year Empath datasets one after another and store them as one dataset.
df_ultimate = None
for year in range(2015, 2021):
    empath_of_year = spark.read.parquet(f'{DATA_DIR}empath_all_{year}.parquet')
    df_ultimate = empath_of_year if df_ultimate is None else df_ultimate.union(empath_of_year)
df_ultimate.write.parquet(f'{DATA_DIR}empath_ultimate.parquet')



# Speaker data preprocessing
## Speaker disambiguation
We have observed that lower QIDs are generally assigned to more famous entities, so in cases where there are multiple QIDs for a speaker we use the lowest QID.

In [None]:
def _lowest_qid(qids):
    """
    Return the QID with the smallest numeric part, e.g. ['Q42', 'Q7'] -> 'Q7'.

    Returns None when `qids` is null or empty.
    """
    # `not qids` covers both a null array and an empty one
    if not qids:
        return None
    return 'Q' + str(min(int(qid[1:]) for qid in qids))


get_lowest_qid_udf = F.udf(_lowest_qid, StringType())

# `drop` returns a new DataFrame instead of modifying the existing one,
# so it has to be part of the expression that is assigned and written.
quotes_disamb = quotes.withColumn('qid', get_lowest_qid_udf('qids')).drop('qids')
quotes_disamb.write.parquet(DATA_DIR + 'quotes_disamb.parquet')

In [21]:
speaker_attributes = spark.read.parquet('speaker_attributes.parquet')

# QID -> label/description lookup table, indexed by QID.
# The path is built by plain string concatenation with DATA_DIR.
labels = pd.read_csv(DATA_DIR + 'wikidata_labels_descriptions_quotebank.csv.bz2')
labels = labels.set_index("QID")

In [22]:
def qids_to_attr_labels(qids):
    """
    Returns the list of labels corresponding to each unique qid in the given list.

    A qid that is not present in the index of `labels` is returned unchanged.
    Returns an empty list when `qids` is None or empty.
    """
    if qids is None or len(qids) == 0:
        return []
    # We have noticed that in some cases there are two identical genders listed e.g. https://www.wikidata.org/wiki/Q18643315
    # That's why we first reduce the qids to unique values; dict.fromkeys (unlike set) keeps their original order.
    unique_qids = dict.fromkeys(qids)
    return [labels.Label[qid] if qid in labels.index else qid for qid in unique_qids]

def expand_spk_attr_with_labels():
    """
    Adds a `<column>_labels` column next to every QID-valued speaker attribute column.

    Reads the speaker attributes from SPK_ATTR_PATH, maps the QIDs of each listed column
    through `qids_to_attr_labels`, and writes the result to SPK_ATTR_WITH_LABELS_PATH,
    overwriting any existing output.
    """
    labels_udf = F.udf(qids_to_attr_labels, ArrayType(StringType()))
    spk_attr = spark.read.parquet(SPK_ATTR_PATH)
    for col in ['nationality', 'gender', 'ethnic_group', 'occupation', 'party', 'academic_degree', 'candidacy', 'religion']:
        print(col)
        spk_attr.printSchema()
        spk_attr = spk_attr.withColumn(col + '_labels', labels_udf(F.col(col)))
    spk_attr.write.parquet(SPK_ATTR_WITH_LABELS_PATH, 'overwrite')

In [23]:
# Spark UDF that maps an array of QIDs to an array of labels.
qids_to_labels_udf = F.udf(qids_to_attr_labels, ArrayType(StringType()))

# Speaker attribute columns whose QIDs are replaced by labels in place.
attribute_columns = [
    'nationality', 'gender', 'ethnic_group', 'occupation',
    'party', 'academic_degree', 'candidacy', 'religion',
]
for col in attribute_columns:
    speaker_attributes = speaker_attributes.withColumn(col, qids_to_labels_udf(col))

speaker_attributes.write.parquet(DATA_DIR + 'speaker_attributes_labels.parquet', 'overwrite')

                                                                                