# Quotation-level analysis

In [196]:
# Data handling and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

# PySpark
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType, BooleanType, StructType, StructField, FloatType, ArrayType

# String handling
import html
import tld
from dateutil.parser import parse

# Helpers
import sys
sys.path.append('/home/culjak/speaker-disambiguation-quotebank/')

# Calculating scores
import profanity_check
from sparknlp.pretrained import PretrainedPipeline
from empath import Empath
import plotly as pl
import plotly.express as px
import plotly.graph_objects as go

from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import chart_studio.plotly as py

chart_studio.tools.set_credentials_file(username='maculjak', api_key='FYjtIeggchZuO0fdfRPz')

In [6]:
# Starting the spark session
conf = pyspark.SparkConf().setMaster("local[24]").setAll([
    ('spark.driver.memory','20G'),
    ('spark.driver.maxResultSize', '20G'),
    ('spark.ui.port', 4866),
    ('spark.sql.execution.arrow.pyspark.enabled', True)
])

spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('WARN') # ERROR, WARN, INFO, DEBUG, ...
spark

21/12/14 13:04:49 WARN Utils: Your hostname, iccluster039 resolves to a loopback address: 127.0.1.1; using 10.90.38.15 instead (on interface ens786f0)
21/12/14 13:04:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/12/14 13:04:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/14 13:04:50 WARN Utils: Service 'SparkUI' could not bind on port 4866. Attempting port 4867.
21/12/14 13:04:50 WARN Utils: Service 'SparkUI' could not bind on port 4867. Attempting port 4868.


In [142]:
quotes = spark.read.parquet('/dlabdata1/culjak/scratch/recent.parquet') #TODO make it more modular
# TODO add code for extracting domains
quotes.printSchema()

root
 |-- numOccurrences: long (nullable = true)
 |-- phase: string (nullable = true)
 |-- probas: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- qids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quotation: string (nullable = true)
 |-- quoteID: string (nullable = true)
 |-- speaker: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [110]:
profanity = spark.read.parquet('/dlabdata1/culjak/scratch/scores.parquet')
sentiment = spark.read.parquet('/dlabdata1/culjak/scratch/sentiments_final.parquet')
empath = spark.read.parquet('/dlabdata1/culjak/scratch/empath_final.parquet')
censored_quotes_without_censorship = spark.read.parquet('/dlabdata1/culjak/scratch/censorship_removed.parquet')

                                                                                

In [56]:
def get_hist(data, column, n_bins):
    """
    A function for getting histograms of a large dataset
    source: https://stackoverflow.com/questions/39154325/pyspark-show-histogram-of-a-data-frame-column
    """
    
    hist = data.select(column).rdd.flatMap(lambda x: x).histogram(20)

    # Loading the Computed Histogram into a Pandas Dataframe for plotting
    hist = pd.DataFrame(
        list(zip(*hist)), 
        columns=['bin', 'frequency']
    ).set_index(
        'bin'
    )
    
    return hist

In [57]:
profanity_hist = get_hist(profanity, 'scores', 20)

                                                                                

In [82]:
pl.offline.init_notebook_mode(connected=True) 
data=[go.Bar(
    x=profanity_hist.index,
    y=profanity_hist.frequency,
)]

layout = go.Layout(
    title="Profanity score distribution",
    xaxis_title="Profanity score",
    yaxis_title="Frequency",
    bargap=0,
)

figure = go.Figure(data, layout=layout)
figure.update_yaxes(type='log')
py.iplot(figure, filename='prof_score_dist')

In [76]:
profanity_hist.index

Float64Index([4.337957715976558e-05,   0.05004121059830178,
                0.10003904161944378,    0.1500368726405858,
                0.20003470366172782,   0.25003253468286984,
                 0.3000303657040118,   0.35002819672515384,
                0.40002602774629586,    0.4500238587674379,
                 0.5000216897885799,    0.5500195208097218,
                 0.6000173518308638,    0.6500151828520059,
                 0.7000130138731478,    0.7500108448942899,
                 0.8000086759154319,    0.8500065069365739,
                  0.900004337957716,    0.9500021689788579],
             dtype='float64', name='bin')

## Selecting the appropriate threshold for identifying profanity
Here we perform a manual evaluation of 100 quotes for each 0.1-wide range of profanity scores from 0.5 to 1.


In [102]:
for threshold in (0.5, 0.6, 0.7, 0.8, 0.9):
    sample = profanity.join(quotes, on='quoteID')\
        .where(F.col('scores') <= threshold + 0.1)\
        .where(F.col('scores') > threshold)\
        .select('quotation', 'scores')\
        .limit(100).collect()
    print(f'Threshold: {threshold}')
    for i in sample:
        print(i.quotation, i.scores)
        print()
    print()
    print()

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

Threshold: 0.5
I'll stuff it up a hell of a number of times and looked like an absolute dill. I've [ been telling myself I've ] just got to breathe and be as relaxed as you possibly can... and try to enjoy it, because it is a hell of a lot of fun, playing in front of a lot of people. 0.570556903273014

This is the land of opportunity, not of hate, and all I've heard tonight is hate, 0.5434425186400857

And then they're going to eat me! OH MY GOOOOOOOD! 0.5204966967520461

First, Wayand got hot. And then we got a little emotional and lost some assignments on defense. But, hey, we were able to knock down some foul shots to finish it off. It's a Wasaren League win on the road. We'll take it. 0.5426337751886801

When I was a boy, I grew up surrounded by strong women, like my mother and neighbors, the kind of generation of women who saved our country from the war. I'm talking about the women of La Mancha, without discriminating, who discussed everything out in the yard and in the streets, 0

                                                                                

Threshold: 0.6
It's sacrilegious. It's horrible, 0.6657661306358644

Symphony Masses: Ho Drakon Ho Megas, 0.6464166411059751

We fought our ass off again tonight. We just didn't make enough plays down the stretch. 0.6827753721427734

We want Jadda to shoot whenever she can, 0.6681073031221053

It was disrespectful for Mexicans and Mexican-Americans, 0.6595227555902524

I feel as if I've seen this before but that's probably because Miranda's half naked, red lips, white background... your typical Richardson, 0.6009269279988242

I still think you're an evil SOB, 0.6190855305458819

What has six balls and screws Texans? The Texas Lottery. 0.6140312766410176

Don't go into self-censorship even though it's your knee-jerk reaction, 0.6593514042803723

Barbra Boxer is the dumbest person in the senate, 0.6705062799591557

My death needs to mean something. My death needs to be counted in the number of transgender people who commit suicide this year. I want someone to look at that number and say 

                                                                                

Threshold: 0.7
There's already a buzz up there, 0.7568970859793362

I want to deal with those, acknowledge our mistakes, try to have a fortress controlled balance sheet, try to stop stepping in dog s *** which we do every now and then. 0.7098612886471241

We woke up like this! Smuchy frenchy face! #frenchy #frenchbulldog #bull #bully 0.7325710520006907

I'm in the kitchen, and I come back where she was and I see her legs jumping and not keeping time frantically. I was, like, `Oh shit, Granny's having one!' I'm like, `Granny, you all right?' And she said, `Yeah baby, I'm just dancing. ' 0.7730969590226954

It's like white people's `Truth or Dare,' except we don't have the dare, 0.7202339384502922

I hope he doesn't take his baseball bat and intimidate any of the legislators. I'd hate to think that he was a thug, 0.7256198367970206

Seth... I like your films, but right now, I wan na kick your ass. 0.7502556573369059

It just flat out stinks, 0.7866084129204685

Jealousy is an ugly emotio

                                                                                

Threshold: 0.8
a stupid rule and shouldn't be in the book. 0.8902521996539203

I don't know why you can't come here yet,'cause I am dying to kill NYPD. I am waiting for them. 0.8102046119454597

Just 'Cause I Wear Tight Pants Don't Mean I'm Gay 0.874972955046829

Well, we weren't going to win anyway so let's just play stupid, carefree football. 0.8107373549338153

Go ahead, fool. Keep going! 0.8443757913319192

They are just so ridiculous, so ridiculous. 0.8699791583778513

everything is corrupt, deformed and hypocritical 0.8416981482381971

Cami Li and Katie Hopkins are b ** ches to @AliciaDouvall, hope Alicia stays tomorrow and beats the bullies, 0.8335547984892082

Has this ever happened to you? You get sucked into Hell to marry Satan's daughter? Then you need Saints Row: Gat out of Hell. 0.8832934176013097

You will all die! I am a Muslim! 0.8550208348696227

OMG. The Pens are so dirty. 0.8672345052758674

This car is 22 years old, why the hell do you want to sit on this... 0.80998



Threshold: 0.9
We've worked really f *** ing hard to get to [ this ] point. But we're very, very close, and it's so nice. Follow topic 0.911469560677447

we should just grease this rat - **** son-of-a-bitch right now. 0.9974329882208443

ended the life of a man who was too much of a coward to do it himself. 0.9208400881094974

I'm not buying her shoes, I bought her a f *** ing career. 0.9548396560442687

Yes! Azealia is ranting again and all you motherf *** erS who pretend not to care are STILL LISTENING. What should I say next? 0.9175791640798693

The article sucks. Maybe. The show sucks. Absolutely. I didn't know A.M. wasn't writing until weeks... 0.9756386250870934

You make one mistake, one little fucking mistake, and the whole world comes crashing down around you. 0.925384932686649

Dirty Jew, you should stop working for Charlie Hebdo. Otherwise we're going to kill you, 0.9022300271012071

I'm going to bust my ass, 0.9715652136954949

When you hear that kind of polished pop and th

                                                                                

Clearly, thresholds lower than 0.9 allow for a large amount of false positives. Aditionally true positives for smaller thresholds contain mostly censored quotations for which we immediately set the profanity score to 1. Since we have a large mount of data, we aim for high precision so we set the threshold to 0.9.TODO: provide the numbers.

# Profanity and censorship analysis
## General stats

In [111]:
profanity = profanity.withColumn('profanity', F.when((F.col('scores') >= 0.9) | (F.col('censored') == 1), 1).otherwise(0)).select('quoteID', 'profanity', 'censored')

In [113]:
profanity_counts = profanity.groupby('profanity').count().toPandas()
profanity_counts

                                                                                

Unnamed: 0,profanity,count
0,1,643209
1,0,114321462


In [115]:
censored_counts = profanity.groupby('censored').count().toPandas()
censored_counts

Unnamed: 0,censored,count
0,0.0,114599181
1,1.0,365490


In [198]:
layout = go.Layout(
    title="Profanity in Quotebank",
    bargap=0,
)

figure = go.Figure(data=[go.Pie(labels=['Profane', 'Not profane'],
                             values=profanity_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='profanity_donut2')

In [134]:
layout = go.Layout(
    title="Censorship in Quotebank",
    bargap=0,
)

figure = go.Figure(data=[go.Pie(labels=['Censored', 'Not censored'],
                             values=censored_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='censorship_donut')

In [138]:
censored_profanity_counts = profanity.where(F.col('profanity') == 1).groupby('censored', 'profanity').count().toPandas()
censored_profanity_counts

                                                                                

Unnamed: 0,censored,profanity,count
0,0.0,1,277719
1,1.0,1,365490


In [140]:
layout = go.Layout(
    title="Censored profanities",
    bargap=0,
)

figure = go.Figure(data=[go.Pie(labels=['Censored profanities', 'Not censored profanities'],
                             values=censored_profanity_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='cen_prof_donut')

## Temporal analysis

In [144]:
profanity_date = profanity.join(quotes, on='quoteID').select('profanity', 'censored', 'date')
profanity_date = profanity_date.withColumn('day', F.dayofweek('date')).rolling(7)
profanity_date = profanity_date.withColumn('month', F.month('date'))
profanity_date = profanity_date.withColumn('year', F.year('date'))

In [161]:
w = (Window.orderBy(F.col("date").cast('long')).rangeBetween(-days(7), 0))

agg_per_day = profanity_date.groupby('date')\
    .agg(F.avg('profanity').alias('profanity_perc'),
         F.avg('censored').alias('censored_perc'),
         F.count('date').alias('n_quotes'))\
    .toPandas().sort_values('date')

moving_agg_per_day = profanity_date.withColumn('profanity_')

                                                                                

In [195]:
layout = go.Layout(
    title="Profanity through time",
)


figure = pl.subplots.make_subplots(rows=1, cols=1,
                    specs=[[{"secondary_y": True}]])



figure.add_trace(go.Scatter(x=agg_per_day['date'],
                            y=agg_per_day['profanity_perc'].rolling(30).mean(),
                            mode='lines', 
                            name='Profanity',
                            line_color='#fe4a49'), secondary_y=False)
                 
# figure.update_yaxes(type='log')
figure.add_trace(go.Scatter(x=agg_per_day['date'],
                            y=agg_per_day['censored_perc'].rolling(30).mean(),
                            mode='lines',
                            name='Censorship',
                            line_color='#2ab7ca'), secondary_y=False)

figure.add_trace(go.Scatter(x=agg_per_day['date'], 
                            y=agg_per_day['n_quotes'].rolling(30).sum(),
                            mode='lines',
                            name='Number of quotes',
                            line_color='#fed766'), secondary_y=True)


figure.update_layout(
        title_text="Profanity through time"
    )

py.iplot(figure, filename='prof_time')

In [191]:
sentiment.join(profanity, on='quoteID').where(F.col('scores') < 0.8).agg(F.avg('sentiment')).show()

AnalysisException: cannot resolve '`scores`' given input columns: [censored, confidence, profanity, quoteID, sentiment];
'Filter ('scores < 0.8)
+- Project [quoteID#2857, sentiment#2858L, confidence#2859, profanity#2927, censored#2853]
   +- Join Inner, (quoteID#2857 = quoteID#2851)
      :- Relation[quoteID#2857,sentiment#2858L,confidence#2859] parquet
      +- Project [quoteID#2851, profanity#2927, censored#2853]
         +- Project [quoteID#2851, scores#2852, censored#2853, CASE WHEN ((scores#2852 >= 0.9) OR (censored#2853 = cast(1 as double))) THEN 1 ELSE 0 END AS profanity#2927]
            +- Relation[quoteID#2851,scores#2852,censored#2853] parquet


In [None]:
sentiment.join(quotes, on='quoteID').rdd.map(x: lambda x: (x.quoteID, x.quotation.split(' '), sentiment, confidence))

In [22]:
spark.read.parquet('/scratch/culjak/removed_cen.parquet').join(profanity.where(F.col('censored') == 1), on='quoteID').count()

                                                                                

365490