In [4]:
# Data handling and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss

# PySpark
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType, BooleanType, StructType, StructField, FloatType, ArrayType, MapType

# String handling
import html
import tld

# Helpers
import sys
sys.path.append('/home/culjak/speaker-disambiguation-quotebank/')

# Calculating scores
import profanity_check
from sparknlp.pretrained import PretrainedPipeline
from empath import Empath
import plotly as pl
import plotly.express as px
import plotly.graph_objects as go

# Stats
import scipy.stats as ss
from statsmodels.stats.proportion import proportion_confint

import chart_studio.plotly as py
import chart_studio
from itertools import cycle
from plotly.offline import iplot
# Chart Studio credentials are taken from the environment so that the API key is
# never stored in the notebook. If either variable is missing, nothing is written
# and any credentials file previously saved on this machine is left untouched.
# NOTE(review): the key that used to be hard-coded here should be rotated.
import os

_chart_studio_username = os.environ.get('CHART_STUDIO_USERNAME')
_chart_studio_api_key = os.environ.get('CHART_STUDIO_API_KEY')
if _chart_studio_username and _chart_studio_api_key:
    chart_studio.tools.set_credentials_file(username=_chart_studio_username,
                                            api_key=_chart_studio_api_key)

In [3]:
# Create (or reuse) the Spark session used throughout the notebook.
# Settings: local master with 24 threads, driver memory and max result size of 20G,
# Spark UI requested on port 4866, Arrow-based pandas conversion switched on.
spark_settings = [
    ('spark.driver.memory', '20G'),
    ('spark.driver.maxResultSize', '20G'),
    ('spark.ui.port', 4866),
    ('spark.sql.execution.arrow.pyspark.enabled', True),
]

conf = pyspark.SparkConf()
conf = conf.setMaster("local[24]")
conf = conf.setAll(spark_settings)

spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
# Other accepted levels include ERROR, INFO, DEBUG, ...
sc.setLogLevel('WARN')
spark

21/12/16 00:08:25 WARN Utils: Your hostname, iccluster111 resolves to a loopback address: 127.0.1.1; using 10.90.36.41 instead (on interface eno1)
21/12/16 00:08:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/12/16 00:08:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/16 00:08:30 WARN Utils: Service 'SparkUI' could not bind on port 4866. Attempting port 4867.


# Quotation-level analysis
In this notebook we plot simple visualizations of how profanity is distributed through time, together with general profanity and censorship statistics.

In [4]:
# Directory holding all parquet inputs read by this notebook.
DATA_DIR = '../data/'

# Load the quotations table and print its schema.
# The file name used to be spelled 'quotse.parquet', which looks like a typo for
# 'quotes.parquet' (NOTE(review): confirm against the contents of DATA_DIR).
quotes = spark.read.parquet(DATA_DIR + 'quotes.parquet')
# TODO add code for extracting domains
quotes.printSchema()

                                                                                

root
 |-- numOccurrences: long (nullable = true)
 |-- phase: string (nullable = true)
 |-- probas: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- qids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quotation: string (nullable = true)
 |-- quoteID: string (nullable = true)
 |-- speaker: string (nullable = true)
 |-- domains: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [5]:
# Load the precomputed per-quote tables from DATA_DIR.
# `profanity` carries the `profanity` and `censored` flags keyed by `quoteID`
# (both are used by the cells below); the other two tables are only loaded here.
profanity = spark.read.parquet(DATA_DIR + 'profanity_expanded.parquet')
empath = spark.read.parquet(DATA_DIR + 'empath_ultimate.parquet')
# The `+` between DATA_DIR and the file name was missing, which is a SyntaxError.
censored_quotes_without_censorship = spark.read.parquet(DATA_DIR + 'censorship_removed.parquet')

# Profanity and censorship analysis
## General stats

In [282]:
# Number of quotes for each value of the `profanity` flag, collected to pandas.
profanity_counts = (
    profanity
    .groupby('profanity')
    .count()
    .toPandas()
)
profanity_counts

                                                                                

Unnamed: 0,profanity,count
0,1,1146168
1,0,113818503


In [283]:
# Number of quotes for each value of the `censored` flag, collected to pandas.
censored_counts = (
    profanity
    .groupby('censored')
    .count()
    .toPandas()
)
censored_counts

Unnamed: 0,censored,count
0,0.0,114599181
1,1.0,365490


In [284]:
# Donut chart: share of profane vs. non-profane quotes.
layout = go.Layout(
    title="Profanity in Quotebank",
    bargap=0,
)

# Build each label from the row's own `profanity` flag instead of assuming a fixed
# row order: the order of groups returned by a Spark groupby is not guaranteed,
# so positional labels can end up attached to the wrong count.
profanity_labels = ['Profane' if flag == 1 else 'Not profane'
                    for flag in profanity_counts['profanity']]

figure = go.Figure(data=[go.Pie(labels=profanity_labels,
                                values=profanity_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='profanity_donut')

In [285]:
# Donut chart: share of censored vs. uncensored quotes.
layout = go.Layout(
    title="Censorship in Quotebank",
    bargap=0,
)

# Build each label from the row's own `censored` flag. With positional labels the
# chart was mislabelled whenever the groupby returned the 0.0 row first, which
# attached 'Censored' to the count of uncensored quotes.
censorship_labels = ['Censored' if flag == 1 else 'Not censored'
                     for flag in censored_counts['censored']]

figure = go.Figure(data=[go.Pie(labels=censorship_labels,
                                values=censored_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='censorship_donut')

In [286]:
# Restrict to profane quotes, then count them per value of the `censored` flag.
profane_only = profanity.where(F.col('profanity') == 1)
censored_profanity_counts = (
    profane_only
    .groupby('censored', 'profanity')
    .count()
    .toPandas()
)
censored_profanity_counts

Unnamed: 0,censored,profanity,count
0,0.0,1,780678
1,1.0,1,365490


In [295]:
# Donut chart: among profane quotes, the share that was censored.
layout = go.Layout(
    title="Censored profanities",
    bargap=0,
)

# Build each label from the row's own `censored` flag. With positional labels the
# two slices were swapped whenever the groupby returned the 0.0 row first.
censored_profanity_labels = ['Censored profanities' if flag == 1 else 'Not censored profanities'
                             for flag in censored_profanity_counts['censored']]

figure = go.Figure(data=[go.Pie(labels=censored_profanity_labels,
                                values=censored_profanity_counts['count'], hole=.7)], layout=layout)

py.iplot(figure, filename='cen_prof_donut')

## Temporal analysis

In [6]:
# Join the profanity flags with `quotes` on `quoteID` and keep only the two flags
# plus the `date` column needed for the temporal analysis.
profanity_date = (
    profanity
    .join(quotes, on='quoteID')
    .select('profanity', 'censored', 'date')
)

In [47]:
# Per-date totals computed in Spark: summed `profanity` and `censored` flags and
# the number of rows with a non-null date (`n_quotes`).
daily_totals = profanity_date.groupby('date').agg(
    F.sum('profanity').alias('profanity'),
    F.sum('censored').alias('censored'),
    F.count('date').alias('n_quotes'),
)

# Collect to pandas, order by the (string) date, then parse the dates.
agg_per_day = daily_totals.toPandas().sort_values('date')
agg_per_day['date'] = pd.to_datetime(agg_per_day['date'])

                                                                                

In [49]:
def _sum_by_period(rule):
    """Sum the daily aggregates into bins given by a pandas offset alias, ordered by date."""
    binned = agg_per_day.resample(rule, on='date').sum()
    return binned.reset_index().sort_values(by='date')


# Weekly (weeks anchored on Monday), monthly and yearly roll-ups of the daily totals.
agg_per_week = _sum_by_period('W-Mon')
agg_per_month = _sum_by_period('M')
agg_per_year = _sum_by_period('Y')

In [250]:
def time_area_plot(fig, df, columns, fill, names, palette=px.colors.diverging.Portland, palette_indices=(1, 4, 2), log=True, visible=False):
    """Add one scatter trace per entry of `columns` to `fig`, plotted against `df['date']`.

    Parameters
    ----------
    fig : plotly figure the traces are added to (modified in place and returned).
    df : frame with a `date` column and every column listed in `columns`.
    columns, fill, names, palette_indices : parallel sequences giving, per trace,
        the y column, the plotly `fill` mode, the legend name and the index into
        `palette` used for the line colour. They are zipped together, so the
        shortest one decides how many traces are added.
    palette : sequence of colours indexed by `palette_indices`.
    log : when true, the y axes of `fig` are switched to a log scale.
    visible : initial visibility passed to every added trace.

    Returns
    -------
    The same `fig` object.
    """
    for column, fill_mode, label, color_index in zip(columns, fill, names, palette_indices):
        trace = go.Scatter(
            x=df['date'],
            y=df[column],
            fill=fill_mode,
            name=label,
            line_color=palette[color_index],
            visible=visible,
        )
        fig.add_trace(trace)

    if not log:
        return fig
    fig.update_yaxes(type='log')
    return fig

In [292]:
# Area chart of censored / profane / all quotes over time, with buttons that
# switch between daily, weekly, monthly and yearly aggregation.
names = ['Censored quotes', 'Profane quotes', 'All quotes']
fills = ['tozeroy', 'tonexty', 'tonexty']
columns = ['censored', 'profanity', 'n_quotes']

# One (button label, aggregated frame) pair per time resolution, in trace order.
views = [
    ("Daily", agg_per_day),
    ("Weekly", agg_per_week),
    ("Monthly", agg_per_month),
    ("Yearly", agg_per_year),
]
traces_per_view = len(columns)
total_traces = traces_per_view * len(views)

# Every view contributes `traces_per_view` consecutive traces; only the first
# view (daily) starts out visible.
fig = go.Figure()
for position, (_, frame) in enumerate(views):
    if position == 0:
        fig = time_area_plot(fig, frame, columns, fills, names, visible=True)
    else:
        fig = time_area_plot(fig, frame, columns, fills, names)

# Each button shows exactly the traces belonging to its own view.
view_buttons = [
    dict(
        label=label,
        method="update",
        args=[
            {"visible": [trace // traces_per_view == position for trace in range(total_traces)]},
        ],
    )
    for position, (label, _) in enumerate(views)
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            x=0.205,
            y=1.21,
            showactive=True,
            buttons=view_buttons,
        )
    ]
)

fig.update_layout(
    title='Profanity through time',
    xaxis_title='Year',
    yaxis_title='Quote frequency',
    font=dict(family='Helvetica'),
)
py.iplot(fig, filename='profanity_through_time')

In [252]:
# Totals per day of the week, with the share of profane / censored quotes and
# 99% confidence intervals for those shares.
ordered_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday']  # because pandas sorts weekdays alphabetically

# Sum only the count columns. Summing the whole frame would also try to sum the
# datetime `date` column, which recent pandas versions reject instead of dropping.
count_columns = ['profanity', 'censored', 'n_quotes']
weekday_agg = agg_per_day.groupby(agg_per_day['date'].dt.day_name())[count_columns]\
                    .sum()\
                    .reindex(ordered_weekdays)\
                    .reset_index()

weekday_agg['profanity_perc'] = weekday_agg['profanity'] / weekday_agg['n_quotes']
weekday_agg['censorship_perc'] = weekday_agg['censored'] / weekday_agg['n_quotes']
# Confidence intervals (alpha=0.01, i.e. 99%)
profanity_CI_lower_week, profanity_CI_upper_week = proportion_confint(weekday_agg['profanity'],
                                                                      weekday_agg['n_quotes'], alpha=0.01)
censorship_CI_lower_week, censorship_CI_upper_week = proportion_confint(weekday_agg['censored'],
                                                                      weekday_agg['n_quotes'], alpha=0.01)

In [293]:
# Bar chart of the profanity and censorship shares per day of the week, with
# error bars showing the confidence interval of each share.
palette = px.colors.diverging.Portland
fig = go.Figure()

# Error bars are given as distances from the bar height to each CI bound.
# The previous `lower - upper` was negative, and using the full CI width as a
# symmetric error would have drawn bars twice as wide as the interval.
fig.add_trace(go.Bar(
    name='Profanity',
    marker_color=palette[4],
    x=weekday_agg.date, y=weekday_agg.profanity_perc,
    error_y=dict(type='data', symmetric=False,
                 array=profanity_CI_upper_week - weekday_agg.profanity_perc,
                 arrayminus=weekday_agg.profanity_perc - profanity_CI_lower_week)
))


fig.add_trace(go.Bar(
    name='Censorship',
    marker_color=palette[1],
    x=weekday_agg.date, y=weekday_agg.censorship_perc,
    error_y=dict(type='data', symmetric=False,
                 array=censorship_CI_upper_week - weekday_agg.censorship_perc,
                 arrayminus=weekday_agg.censorship_perc - censorship_CI_lower_week,
                 width=5)
))

fig.update_layout(
    title='Average profanity by day of the week',
    xaxis_title='Day of the week',
    yaxis_title='Average profanity',
    barmode='overlay'
)

py.iplot(fig, filename='profanity_dotw')

In [254]:
# Totals per calendar month (all years pooled), with the share of profane /
# censored quotes and 99% confidence intervals for those shares.
ordered_months = ['January', 'February', 'March', 'April', 'May', 'June',
                  'July', 'August', 'September', 'October', 'November', 'December']  # because pandas sorts month names alphabetically

# Sum only the count columns. Summing the whole frame would also try to sum the
# datetime `date` column, which recent pandas versions reject instead of dropping.
count_columns = ['profanity', 'censored', 'n_quotes']
month_agg = agg_per_day.groupby(agg_per_day['date'].dt.month_name())[count_columns]\
                    .sum()\
                    .reindex(ordered_months)\
                    .reset_index()


month_agg['profanity_perc'] = month_agg['profanity'] / month_agg['n_quotes']
month_agg['censorship_perc'] = month_agg['censored'] / month_agg['n_quotes']

# Confidence intervals (alpha=0.01, i.e. 99%)
profanity_CI_lower_month, profanity_CI_upper_month = proportion_confint(month_agg['profanity'],
                                                                        month_agg['n_quotes'], alpha=0.01)

censorship_CI_lower_month, censorship_CI_upper_month = proportion_confint(month_agg['censored'],
                                                                      month_agg['n_quotes'], alpha=0.01)

In [296]:
# Bar chart of the profanity and censorship shares per calendar month, with
# error bars showing the confidence interval of each share.
# Defined here as well so the cell does not rely on a palette left over from an earlier cell.
palette = px.colors.diverging.Portland
fig = go.Figure()

# Error bars are given as distances from the bar height to each CI bound.
# Using the full CI width (`upper - lower`) as a symmetric error drew bars twice
# as wide as the actual interval.
fig.add_trace(go.Bar(
    name='Profanity',
    marker_color=palette[4],
    x=month_agg.date, y=month_agg.profanity_perc,
    error_y=dict(type='data', symmetric=False,
                 array=profanity_CI_upper_month - month_agg.profanity_perc,
                 arrayminus=month_agg.profanity_perc - profanity_CI_lower_month)
))


fig.add_trace(go.Bar(
    name='Censorship',
    marker_color=palette[1],
    x=month_agg.date, y=month_agg.censorship_perc,
    error_y=dict(type='data', symmetric=False,
                 array=censorship_CI_upper_month - month_agg.censorship_perc,
                 arrayminus=month_agg.censorship_perc - censorship_CI_lower_month)
))
fig.update_layout(
    title='Average profanity by month',
    xaxis_title='Month',
    yaxis_title='Average profanity',
    barmode='overlay'
)

py.iplot(fig, filename='profanity_month')