In [1]:
# Data handling and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

# PySpark
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType, BooleanType, StructType, StructField

# String handling
import html
import tld
from dateutil.parser import parse

# Helpers
import sys
sys.path.append('/home/culjak/speaker-disambiguation-quotebank/')

# Calculating scores
import profanity_check
from sparknlp.pretrained import PretrainedPipeline

In [2]:
conf = pyspark.SparkConf().setMaster("local[24]").setAll([
    ('spark.driver.memory','40G'),
    ('spark.driver.maxResultSize', '24G'),
    ('spark.ui.port', 4866),
    ('spark.sql.execution.arrow.pyspark.enabled', True),
])


spark = SparkSession.builder.config(conf=conf).config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4').getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR') # ERROR, WARN, INFO, DEBUG, ...
spark


21/12/11 13:56:11 WARN Utils: Your hostname, iccluster039 resolves to a loopback address: 127.0.1.1; using 10.90.38.15 instead (on interface ens786f0)
21/12/11 13:56:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/culjak/.conda/envs/speaker-disambiguation/lib/python3.9/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/culjak/.ivy2/cache
The jars for the packages stored in: /home/culjak/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8b7a44b3-36f2-4981-9f42-0fa8355c1ad6;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.3.4 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lomb

In [3]:
quotes = spark.read.parquet('/scratch/culjak/recent.parquet') #TODO make it more modular
# TODO add code for extracting domains
quotes.printSchema()

root
 |-- numOccurrences: long (nullable = true)
 |-- phase: string (nullable = true)
 |-- probas: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- qids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quotation: string (nullable = true)
 |-- quoteID: string (nullable = true)
 |-- speaker: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



## Cleaning

In [13]:
quotes.select('quotation').where(F.array_contains('domains', 'wikia.com')).show(100, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
print("Total quotes on Wikia:", quotes.where(F.array_contains('domains', 'wikia.com')).count())
quotes.where(F.array_contains('domains', 'wikia.com'))\
    .groupby('speaker')\
    .count()\
    .sort('count', ascending=False).show(50)

                                                                                

Total quotes on Wikia: 595558




+--------------------+------+
|             speaker| count|
+--------------------+------+
|                None|401619|
|         Hisao Nakai|  2270|
|          John Tracy|  2247|
|         Roger Ebert|  2110|
| Alexander the Great|  1962|
|         Casey Jones|  1098|
|        April O'Neil|  1069|
|     Queen of Hearts|   958|
|           Lady Gaga|   636|
|        Peter Parker|   601|
|             Big Mac|   591|
|    Vlad the Impaler|   553|
|   Dennis the Menace|   550|
|     Jenkins , David|   508|
|           Ruby Rose|   491|
|            la porte|   481|
|        Harry Potter|   466|
|       Jessica Jones|   436|
|      Captain Marvel|   397|
|          Sean Nolan|   396|
|Louisa Connolly-B...|   388|
|         Arthur Read|   385|
|     Cassandra Clare|   379|
|              Shen ?|   350|
|          Poison Ivy|   340|
|       Ebert , Roger|   326|
|             Peter ?|   317|
|           Adam West|   316|
|           Volume 10|   304|
|         Jeff Probst|   296|
|        S

                                                                                

We decided to exclude quotes from Wikia.com since
 - a majority of them are attributed either to no speaker at all and some of them are attributed to fictional characters e.g. Hisao Nakai, John Tracy, Casey Jones
 - some quotations are quite strange as seen from sample 100 quotations from wikia.com domain

In [59]:
to_remove = quotes.select('quoteID').where(F.array_contains('domains', 'wikia.com'))

Furthermore, we decided to remove all the quotations that (listed in order of appearing in the regex):
 - HTML tags
 - start with an equals sign
 - end with an equals sign
 - Start with "Bilde?"
 - start with an url i.e. with "https" or "http" string
 - start with "file:" string
 - contain equality with non-letter characters (Quotations containing equality where letters are on both sides may be valid. e.g. Sharing = Caring)
 - contains that contain an unopened closed bracket or an unclosed open bracket
 - contain CSS tags
 - contain HTML entities e.g. &nbsp;
 - can be parsed as dates
 
5 examples for each of the issues are listed below the following cell

In [62]:
# CSS tags are fetched from https://www.w3.org/TR/CSS2/propidx.html
css_tags = '(azimuth|background-attachment|background-color|background-image|background-position|background-repeat|background|border-collapse|border-color|border-spacing|border-style|border-top|border-right|border-bottom|border-left|border-top-color|border-right-color|border-bottom-color|border-left-color|border-top-style|border-right-style|border-bottom-style|border-left-style|border-top-width|border-right-width|border-bottom-width|border-left-width|border-width|border|bottom|caption-side|clip|color|content|counter-increment|counter-reset|cue-after|cue-before|cue|cursor|direction|display|elevation|empty-cells|float|font-family|font-size|font-style|font-variant|font-weight|font|height|left|letter-spacing|line-height|list-style-image|list-style-position|list-style-type|list-style|margin-right|margin-left|margin-top|margin-bottom|margin|max-height|max-width|min-height|min-width|orphans|outline-color|outline-style|outline-width|outline|overflow|padding-top|padding-right|padding-bottom|padding-left|padding|page-break-after|page-break-before|page-break-inside|pause-after|pause-before|pause|pitch-range|pitch|play-during|quotes|richness|right|speak-header|speak-numeral|speak-punctuation|speak|speech-rate|stress|table-layout|text-align|text-decoration|text-indent|text-transform|top|unicode-bidi|vertical-align|visibility|voice-family|volume|white-space|widows|width|word-spacing|z-index):'
regex = '<.*>|^=|=$|^Bilde?|^https?:|^file:|= [^ a-zA-Z]|^[^\[\{\(]*\]|\[[\]\}\)]*$|^[^\{\[\(]*\}|\{[^\}\]\)]*$|^[^<]*>|<[^>]*$' + '|' + css_tags + '&.*;'

to_remove = to_remove.union(quotes.select('quoteID').where(F.col('quotation').rlike(regex)))

def is_date(string):
    """Source: https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format"""
    try:
        parse(string, fuzzy=False)
        return True
    except ValueError:
        return False


is_date_udf = F.udf(is_date, BooleanType())
to_remove.union(quotes.select('quoteID').where(is_date_udf('quotation'))).write.parquet('to_remove.parquet') # Save all the QIDS that should be remove

                                                                                

In [44]:
regexs = ['<.*>', '^=', '^Bilde\?', '^https?:', '^file:', '= [^ a-zA-Z]+', '^[^\[\{\(]*\]|\[[\]\}\)]*$|^[^\{\[\(]*\}|\{[^\}\]\)]*$|', css_tags, '&.*;']
for rexp in regexs:
    print('Regular expression:', rexp)
    quotes.select('quotation').where(F.col('quotation').rlike(rexp)).show(5, truncate = False)
    print()

Regular expression: <.*>
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                                                                                                                         |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|x E9:? < t5 H@F=5 36 E96 7: CDE E @ E6 = = J@F[ E @ 86E E96; @ 3 =: < 6 E92E [ 7@C E92E C62D@?[: D?' E H92E J@F’C6 ECJ:? 8 E @ > 2 < 6 92AA6? ]

                                                                                

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|= = = = Categoría: YG Entertainment Categoría: Warner Music Taiwan Categoría: KRapero Categoría: KSolista Categoría: KCantante [ [ Categoría: KBailar...         |
|= @@ <:? 8:? E @ A@DD:3=6 = 625D ]                                                                                                                               |
|= Hector is the lead mechanic at Sevenjet on the Piaggio aircraft. Extremely knowledgeable and relentless on keeping the aircraft in perfect operating condition.|
|= / = discrimin

                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                                                       |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Bilde? site = du & amp; date = 20170309 & amp; category = news01 & amp; artno = 309009999 & amp; ref = ph & amp; item = 1 & amp; newtbl = 1 & amp; maxw = 650 & amp; ts = 1489116838 & amp; imageversion = 16by9|
|Bilde? site = cj & amp; date = 20170318 & amp; category = news01 & amp; artno = 318009999 & amp; ref = ph & amp; item = 1 & amp; newtbl = 1 & amp; maxw = 6

                                                                                

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                 |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|file: / / 10.86.178.62 / Users/Gary% 20Dvorchak/Desktop/Blueshirt% 20Group% 20Asia/Clients/China% 20Rapid% 20Finance% 20 (XRF) / Earnings/2018-1Q/ralph@blueshirtgroup.com|
|file: / / / C | / Users/mexic/AppData / Roaming/Adobe/Dreamweaver CC 2017/en _ US/Configuration/Temp / Assets/eamD3F6. tmp/images/profile. png                            |
|file: / / localhost/Users/mmccarty / Library/Caches/TemporaryItems / msoclip/0/clip _ image002. png                                   

[Stage 200:>                                                        (0 + 1) / 1]

+---------------------------------------------------------+
|quotation                                                |
+---------------------------------------------------------+
|background-color: # 808080;                              |
|font-family: Cambria; font-size:16 px; line-height:22 px;|
|background: #C 0C0C0; width:140 px                       |
|vertical-align: top; width:150 px;                       |
|text-align: right; background: # 2B9AAA;                 |
+---------------------------------------------------------+
only showing top 5 rows


Regular expression: &.*;
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                  

                                                                                

In [4]:
to_remove = spark.read.parquet('to_remove.parquet')
quotes = quotes.join(to_remove, on='quoteID', how='leftanti') # We use left anti join to remove spurious quotations

## Censorship removal

The list of the words considered for censorship removal was taken from https://www.cs.cmu.edu/~biglou/resources/. The list was filtered by running profanity_check on all the words, selecting those with profanity probability higher than 0.9 and then removing ones which are unlikely to be censored or appear in the quotations by hand.

In [7]:
with open('bad-words.txt', 'r') as f:
    bad_words = f.readlines()
    bad_words = [word.strip('\n ') for word in bad_words]

In [8]:

scores = profanity_check.predict_prob(bad_words)
bad_words_filtered = [word for word, score in zip(bad_words, scores) if score > 0.9]
print(*bad_words_filtered, sep=' ')

arse ass asses asshole assholes bastard bitch bitches bitching boobs bullshit butt butt-fuck butt-fucker butt-fuckers cock cocksucker crack-whore crap cum cunt damn dick dickhead die dipshit dumb dumbass dyke fag faggot fuck fucked fucker fuckers fuckhead fuckin fucking fucks fucktard fuk gay goddamn hell hoes homosexual idiot jackass kill looser loser moron motherfucker negro negro's nig nigga niggah nigger niggers nigger's penis piss pussies pussy queer racist rapist retard retarded scum semen shit shitty stupid suck twat wanker whore wtf


In [9]:
words_to_remove = ['butt', 'butt-fuck', 'butt-fucker', 'butt-fuckers', 'crack-whore', 'cum', 'die', 'dumb', 'dyke', 'fuk', 'gay',
                   'hell', 'hoes','homosexual', 'idiot', 'jackass', 'kill', 'looser', 'loser', 'moron', "negro's", 'piss', 'queer',
                   'racist', 'rapist', 'retard', 'retarded', 'scum', 'semen', 'stupid', 'suck', 'wanker', 'wtf']
for word in words_to_remove:
    bad_words_filtered.remove(word)

We firstly define some helper functions for censorship removal regex construction

In [153]:
def possible_positions(string):
    """
    For a given string returns all the positions such that the string is not a substring of any other word in the text.
    """
    # It is possible for a string to be followed by one or three punctuation characters, a dash, an apostrophe or a quotation mark characters before the end of the string or a space
    return (f' {string}(["\'?!,\.\-]|[\.!?]{{3,}})? ', # The string is in the middle of a text
            f'^{string}(["\'?!,\.\-]|[\.!?]{{3,}})? ', # A text starts with the string
            f' {string}(["\'?!,\.\-]|[\.!?]{{3,}})?$', # A text ends with a string
            f'^{string}(["\'?!,\.\-]|[\.!?]{{3,}})?$') # Text contains only the string

def upper_lower(word):
    """
    Generates a regular expression that covers all the possible casings of the word.
    """
    string = ''
    for i in word:
        string += f'[{i}{i.swapcase()}]'
        
    return string

# For example
print(upper_lower('ADA'))

def middle_censorship(word):
    """
    Returns all the possible censorship cases where at least the first and the last character of the word are preserved. e.g f**k
    """
    censorship = []
    for i in range(1, len(word)):
        for j in range(1, len(word) - i):
            censorship.extend(possible_positions(f'{upper_lower(word[:j])} ?[_\-\.\*@#%]+ ?{upper_lower(word[j + i:])}'))
    return censorship
    
    
def end_censorship(word):
    """
    Returns all the possible censorship cases where the first character is always preserved, but the last one is never preserved. e.g. f***
    """
    
    censorship = []
    for i in range(1, len(word)):
        if i + 1 == len(word):
            censorship.extend(possible_positions(f'{upper_lower(word[:i])} ?[_\-\*@#%]'))
        else:
            censorship.extend(possible_positions(f'{upper_lower(word[:i])} ?([_\-\*@#%]{{1,{len(word) - i}}}|\.{{2,{len(word) - i}}}) ?'))
    return censorship
    
# For example
print(*middle_censorship('Fuck'), sep='\n')
print()
print(*end_censorship('Fuck'), sep='\n')

[Aa][Dd][Aa]
 [Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?[_\-\.\*@#%]+ ?[cC][kK](["'?!,\.\-]|[\.!?]{3,})?$
 [Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff][uU] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
 [Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?[_\-\.\*@#%]+ ?[kK](["'?!,\.\-]|[\.!?]{3,})?$

 [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})? 
 [Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
^[Ff] ?([_\-\*@#%]{1,3}|\.{2,3}) ?(["'?!,\.\-]|[\.!?]{3,})?$
 [Ff][uU] ?([_\-\*@#%]{1,2}|\.{2,2}) ?(["'?!,\.\-]|[\.!?]{3,})? 
^[Ff][uU]

In [155]:
middle_censorship_dict = {word: middle_censorship(word) for word in bad_words_filtered}

censorships = []
for censorship_arr in middle_censorship_dict.values():
    censorships.extend(censorship_arr)

For "end censorship", we consider a smaller list of words to account for ambigutity and false positives by heuristically removing the words we assume would likely not going to be censored and their frequency in the uncensored quotes which is computed below.

In [11]:
profanity_freqs = {}
for word in bad_words_filtered:
    profanity_freqs[word] = quotes.where(F.col('quotation').rlike('|'.join(possible_positions(word)))).count()
    
    
for word, count in profanity_freqs.items():
    print(word, count)



arse 3400
ass 37174
asses 3527
asshole 3517
assholes 1338
bastard 3113
bitch 14618
bitches 3349
bitching 830
boobs 6106
bullshit 8069
cock 1631
cocksucker 92
crap 20457
cunt 663
damn 45679
dick 5758
dickhead 466
dipshit 67
dumbass 686
fag 908
faggot 671
fuck 27332
fucked 6540
fucker 453
fuckers 398
fuckhead 36
fuckin 4516
fucking 40680
fucks 951
fucktard 4
goddamn 3770
motherfucker 1786
negro 1157
nig 58
nigga 3403
niggah 5
nigger 768
niggers 336
nigger's 11
penis 7075
pussies 336
pussy 3073
shit 54585
shitty 3483
twat 288
whore 2339


                                                                                

In [156]:
words_to_remove = ['boobs', 'fag', 'goddamn', 'fucker', 'damn', 'dipshit', 'niggah', 'nig'] 
# fucker and fucked would have the same end_censorship regex, so we remove fucker since fucked appears more frequently
# damn is quite frequent, but we believe it is unlikely to be censored. Thus, we remove it in this stage.
bad_words_end = bad_words_filtered.copy()
for word in words_to_remove:
    bad_words_end.remove(word)

end_censorship_dict = {word: end_censorship(word) for word in bad_words_end}

for censorship_arr in end_censorship_dict.values():
    censorships.extend(censorship_arr)

In [159]:
censorship_blended = '|'.join(censorships)

preliminary_filtered_quotes = quotes.where(F.col('quotation').rlike('[_\-\*@#%]|\.\.'))
censored_quotes = preliminary_filtered_quotes.select('quoteID').where(F.col('quotation').rlike(censorship_blended))# If dot was used as a censorship character, we consider only only censorships that include more that one dot.

In [None]:
censored_quotes.select('quoteID').write.parquet('/scratch/culjak/censorship2.parquet')

[Stage 285:>                                                     (0 + 24) / 148]

In [65]:
censored_quotes = spark.read.parquet('censorship.parquet').join(quotes, on='quoteID')
# censored_quotes_pd = censored_quotes.select('quoteID', 'quotation').toPandas()

                                                                                

Resolving "middle" censorship. Here, we sort the obscene words by their frequency in the uncensored quotes in descending order. This is done so that, in case of ambiguity, some censored string is replaced with a word that appears more frequently. However, although e.g. "boobs" appear more frequently in the quotes than "bitches", we firstly resolve "bitches" because it is more likely to be censored. A change in order is performed by modifying word frequencies.

In [120]:
profanity_freqs['bitches'] += profanity_freqs['boobs']
for word, _ in sorted(profanity_freqs.items(), key=lambda x: -x[1]):
    word_censorships = middle_censorship_dict[word]
    if word == 'assholes':
        print(word_censorships)
        rx = '|'.join(word_censorships)
    censored_quotes = censored_quotes.withColumn('quotation', F.regexp_replace('quotation', '|'.join(word_censorships), ' ' + word + n_groups(len(word_censorships)) + ' '))

censored_quotes.select('quotation').show(100, truncate=False)

[' [aA] ?[_\\-\\.\\*@#%]+ ?[sS][hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', '^[aA] ?[_\\-\\.\\*@#%]+ ?[sS][hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', ' [aA] ?[_\\-\\.\\*@#%]+ ?[sS][hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', '^[aA] ?[_\\-\\.\\*@#%]+ ?[sS][hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', ' [aA][sS] ?[_\\-\\.\\*@#%]+ ?[hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', '^[aA][sS] ?[_\\-\\.\\*@#%]+ ?[hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', ' [aA][sS] ?[_\\-\\.\\*@#%]+ ?[hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', '^[aA][sS] ?[_\\-\\.\\*@#%]+ ?[hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', ' [aA][sS][sS] ?[_\\-\\.\\*@#%]+ ?[oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', '^[aA][sS][sS] ?[_\\-\\.\\*@#%]+ ?[oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})? ', ' [aA][sS][sS] ?[_\\-\\.\\*@#%]+ ?[oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', '^[aA][sS][sS] ?[_\\-\\.\\*@#%]+ ?[oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$', ' [aA][sS][sS][hH] ?[_\\-\\.\\*@#%]+ ?[

[Stage 244:>                                                        (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|quotation                                                                                                                                                                   

                                                                                

In [None]:
censored_quotes_pd['censored'] = 1
censored_quotes_pd['scores'] = profanity_check.predict_prob(censored_quotes_pd['quotation'])

In [5]:
dfs = []
for year in range(2015, 2021):
    print(str(year) + '...')
    for i, months in enumerate(('0[1-6]', '(0[7-9]|1[0-2])')):
        print('\t' + str(i + 1) + '. half...')
        partition = quotes.where(F.col('date').rlike(f'{year}-{months}')).select('quoteID', 'quotation').toPandas()
        partition['scores'] = profanity_check.predict_prob(partition['quotation'])
        dfs.append(partition[['quoteID', 'scores']])
        
# scores = pd.union(dfs)
# spark.write.parquet()

In [19]:
scores_all = pd.concat(dfs)

In [20]:
scores_all['censored'] = 0

15557151

error: invalid group reference 3 at position 6

In [111]:
def n_groups(n, referencing_char='$'):
    return ''.join([referencing_char + str(i) for i in range(1, n + 1)])

In [112]:
n_groups(5)

'$1$2$3$4$5'

In [121]:
# regex = '[aA] ?[_\-\.\*@#%]+ [hH][oO][lL][eE][sS](["\'?!,\\-]|[\\.!?]{3,})?$'
re.search(rx, 'a -- holes')

<re.Match object; span=(0, 10), match='a -- holes'>

In [125]:
spark.createDataFrame((' a -- holes. ', 'a'))

TypeError: Can not infer schema for type: <class 'str'>

In [146]:
data2 = [("James"," a -- holes. ",)]

schema = StructType([ \
    StructField("firstname",StringType(),True), \
    StructField("middlename",StringType(),True),
  ])
 
df = spark.createDataFrame(data=data2,schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)

+---------+-------------+
|firstname|middlename   |
+---------+-------------+
|James    | a -- holes. |
+---------+-------------+



In [147]:
df.withColumn('replacement', F.regexp_replace('middlename', rx, 'asshole' + n_groups(84))).show()

+---------+-------------+-------------+
|firstname|   middlename|  replacement|
+---------+-------------+-------------+
|    James| a -- holes. | a -- holes. |
+---------+-------------+-------------+



In [138]:
re.search(rx, ' a ?? hole ')

In [144]:
len(middle_censorship_dict['assholes'])

84