In [1]:
# Install the notebook's dependencies into the kernel's environment.
# Only pyspark is version-pinned here; pyarrow and alt-profanity-check resolve to whatever is current.
%pip install pyspark==3.1.1 pyarrow alt-profanity-check

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
# Display the path of the Python interpreter that runs this kernel
sys.executable

'/usr/local/anaconda3/envs/ada/bin/python'

In [3]:
# Make sure that spark uses the same python distribution to avoid serialization issues due to missing packages
%env SPARK_PYTHON=/usr/local/anaconda3/envs/ada/bin/python
%env SPARK_DRIVER=/usr/local/anaconda3/envs/ada/bin/python

env: SPARK_PYTHON=/usr/local/anaconda3/envs/ada/bin/python
env: SPARK_DRIVER=/usr/local/anaconda3/envs/ada/bin/python


In [2]:
# Data handling
import pandas as pd

# PySpark
import pyspark
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

# Profanity check
import profanity_check

# Helpers
import os

In [4]:
# Settings for a Spark session running in local mode on this machine
spark_settings = [
    # Driver heap and the cap on results collected back to the driver
    ('spark.driver.memory', '8G'),
    ('spark.driver.maxResultSize', '8G'),
    # Arrow-based exchange between Spark and pandas, with the records-per-batch limit
    ('spark.sql.execution.arrow.pyspark.enabled', True),
    ('spark.sql.execution.arrow.maxRecordsPerBatch', 10000),
    # Scratch directory used by Spark
    ('spark.local.dir', '/tmp'),
]

conf = pyspark.SparkConf()
conf.setMaster("local[*]")
conf.setAll(spark_settings)

# Reuse a running session if there is one, otherwise start it with the settings above
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2')
    .getOrCreate()
)

# Keep a handle on the context and silence everything below ERROR
sc = spark.sparkContext
sc.setLogLevel('ERROR')

# Last expression: render the session summary as the cell output
spark

In [6]:
# Root data folder; every path below keeps its trailing slash so names can be appended directly
DATA_DIR = 'data/'
# Folder the Quotebank parquet input is read from
QUOTEBANK_DATA_DIR = f'{DATA_DIR}quotebank_data/'
# Folder the computed profanity scores are written to
PROFANITY_SCORES_DIR = f'{DATA_DIR}profanity_scores/'

In [7]:
def compute_profanity(iterator):
    """Score the quotations of each pandas DataFrame in a stream.

    Written to be passed to ``DataFrame.mapInPandas``, which feeds it an
    iterator of pandas DataFrames.

    Args:
        iterator: iterable of pandas DataFrames, each with ``quoteID`` and
            ``quotation`` columns.

    Yields:
        One pandas DataFrame per input frame with two columns: ``quoteID``
        (taken from the input) and ``profanity`` (the values returned by
        ``profanity_check.predict_prob`` for the quotations).
    """
    for batch in iterator:
        scores = profanity_check.predict_prob(batch["quotation"])
        scored = pd.DataFrame({"quoteID": batch["quoteID"], "profanity": scores})
        yield scored

In [8]:
def compute_all_profanity_scores(file_list=None):
    """Compute profanity scores for Quotebank parquet files and store them as parquet.

    Each input is read from ``QUOTEBANK_DATA_DIR + 'parquet/'``, scored with
    ``compute_profanity`` through ``mapInPandas``, and written under
    ``PROFANITY_SCORES_DIR``. An existing output of the same name is overwritten.

    Args:
        file_list: names of the entries in the parquet input directory to
            process. ``None`` (the default) processes every entry of that
            directory, in sorted order. An explicitly empty list processes
            nothing.

    Output naming:
        ``quotes<rest>`` becomes ``profanity<rest>`` (for example
        ``quotes-2020.parquet`` -> ``profanity-2020.parquet``). A name without
        the ``quotes`` prefix is kept whole as ``profanity-<name>``.
    """
    input_prefix = 'quotes'
    parquet_dir = QUOTEBANK_DATA_DIR + 'parquet/'

    # Only fall back to "everything" when no list was given at all; an empty
    # list from the caller must not silently trigger a full run.
    if file_list is None:
        # os.listdir returns entries in arbitrary order; sort for reproducible runs
        file_list = sorted(os.listdir(parquet_dir))

    for fn in file_list:
        # Swap the 'quotes' prefix for 'profanity' instead of blindly dropping six characters
        if fn.startswith(input_prefix):
            out_name = 'profanity' + fn[len(input_prefix):]
        else:
            out_name = 'profanity-' + fn

        quotes = spark.read.parquet(parquet_dir + fn)
        scores = quotes.mapInPandas(compute_profanity, schema="quoteID string, profanity double")
        scores.write.parquet(PROFANITY_SCORES_DIR + out_name, 'overwrite')

In [None]:
%%time
# Score a single file only; calling without file_list lists the parquet input directory instead
compute_all_profanity_scores(file_list=['quotes-2020.parquet'])

                                                                                