In [6]:
# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

# PySpark
import pyspark
from pyspark import SparkContext
from pyspark.sql import functions as f
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import IntegerType, BooleanType, ArrayType, StringType

# Spark NLP
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Helpers
import os

In [7]:
# Settings applied to the local Spark session. The values are passed through
# unchanged; note that `spark.local.dir` is an absolute, machine-specific path.
spark_settings = [
    ('spark.driver.memory', '16G'),
    ('spark.driver.maxResultSize', '8G'),
    ('spark.sql.execution.arrow.pyspark.enabled', True),
    ('spark.local.dir', '/media/maculjak/2e9080dc-73f3-426f-a054-a46f620aea95/tmp'),
]
conf = pyspark.SparkConf().setMaster("local[*]").setAll(spark_settings)

# Build (or reuse) the session, adding the Spark NLP package coordinate
# on top of the settings above.
session_builder = SparkSession.builder.config(conf=conf)
session_builder = session_builder.config(
    "spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2"
)
spark = session_builder.getOrCreate()

# Keep a handle on the SparkContext and restrict logging to errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Last expression of the cell: renders the session summary in the notebook.
spark


In [8]:
# Root folder for input data. Paths below are built by plain string
# concatenation, so the trailing slash is significant.
DATA_DIR = 'data/'
# Folder whose entries are read as JSON by the loading cell.
QUOTEBANK_DATA_DIR = ''.join((DATA_DIR, 'quotebank_data/'))

In [None]:
# Read every entry of the Quotebank folder as JSON, one Spark DataFrame per entry.
# NOTE: os.listdir returns entries in arbitrary order and includes every entry
# in the folder, so stray files there will be read as well.
dfs = [
    spark.read.json(QUOTEBANK_DATA_DIR + entry_name)
    for entry_name in os.listdir(QUOTEBANK_DATA_DIR)
]

In [15]:
# Fold the per-file DataFrames into a single DataFrame, starting from the first.
# NOTE: DataFrame.union matches columns by position, not by name, so all parts
# are expected to share the same column layout.
df = dfs[0]
for position in range(1, len(dfs)):
    df = df.union(dfs[position])

In [15]:
# Load the Quotebank data from its Parquet copy. This rebinds `df`, replacing
# whatever an earlier cell assigned to it.
qb_parquet_path = DATA_DIR + 'qb_data.parquet'
df = spark.read.parquet(qb_parquet_path)

In [None]:
# Pretrained English Spark NLP pipeline used below to annotate quotations
# with named entities.
pipeline = PretrainedPipeline(
    'recognize_entities_dl',
    lang='en',
)


In [2]:
def query_entities(x, entity):
    """Return True if any item of ``x`` has an ``entity`` attribute equal to ``entity``.

    Parameters
    ----------
    x : iterable or None
        Items exposing an ``.entity`` attribute. ``None`` is treated like an
        empty collection.
    entity : object
        Value each item's ``.entity`` is compared against with ``==``.

    Returns
    -------
    bool
        True as soon as one matching item is found, otherwise False.

    Notes
    -----
    NOTE(review): elsewhere in this notebook entity labels are read from
    ``metadata['entity']``; confirm that the items passed here really expose
    ``.entity``.
    """
    if x is None:
        return False
    # Generator instead of a list: lets any() stop at the first match rather
    # than evaluating every item up front.
    return any(item.entity == entity for item in x)

In [51]:
# Annotate the `quotation` column of a 100-row sample with the pretrained pipeline.
# NOTE(review): `dff` is not defined anywhere in the visible notebook; it must
# already exist in the kernel for this cell to run. This also rebinds `df`.
quotes_sample = dff.limit(100)
df = pipeline.annotate(quotes_sample, column='quotation')

In [77]:

def entities_to_str_entity_pair(x):
    """Convert entity annotations into ``(text, label)`` pairs.

    Parameters
    ----------
    x : iterable
        Annotation-like items exposing ``.result`` and a ``.metadata`` mapping
        that contains the key ``'entity'``.

    Returns
    -------
    list of tuple
        One ``(item.result, item.metadata['entity'])`` pair per input item, in
        input order. No items are filtered out.
    """
    pairs = []
    for annotation in x:
        label = annotation.metadata['entity']
        pairs.append((annotation.result, label))
    return pairs
# Wrap the pair-extraction helper as a UDF returning array<array<string>>.
entity_pairs_udf = f.udf(entities_to_str_entity_pair, ArrayType(ArrayType(StringType())))

# Keep rows whose `date` matches '^2020', annotate their quotations, and
# retain only the quote id together with the extracted (text, label) pairs.
# NOTE(review): `dff` is not defined in the visible notebook; it must already
# exist in the kernel.
quotes_2020 = dff.where(dff.date.rlike('^2020'))
annotated_2020 = pipeline.annotate(quotes_2020, column='quotation')
df = annotated_2020.select(
    'quoteID',
    entity_pairs_udf(f.col('entities')).alias('entities'),
)

# Persist the result. The writer's default save mode raises if the target
# path already exists, so re-running this cell requires removing it first.
df.write.parquet('2020-quotes-ner.parquet')
# df = df.withColumn('entities', ).show(truncate=False)

                                                                                

In [5]:
# Reload the stored NER results (quote id plus entity pairs) from Parquet.
ner_parquet_path = '2020-quotes-ner.parquet'
df_ner = spark.read.parquet(ner_parquet_path)
# dff.where(f.col('quoteID') == '2020-02-10-003820').first()
# def has_person(x):
#     return any([i[1] == 'PER' for i in x])
# dfff.where(f.udf(has_person, BooleanType())(f.col('entities'))).()
def is_person(x):
    """Tell whether an entity pair is labelled as a person.

    Parameters
    ----------
    x : sequence
        Indexable item whose element at position 1 is the entity label.

    Returns
    -------
    bool
        True when ``x[1]`` equals ``'PER'``.
    """
    label = x[1]
    return label == 'PER'


In [89]:
# Count how often each (text, label) pair labelled 'PER' occurs in the NER
# results and show the 100 most frequent ones.
person_mentions = (
    df_ner
    # One row per entity pair.
    .select(f.explode(f.col('entities')).alias('entity'))
    # Native column expression instead of a Python UDF: element 1 of the pair
    # is the label, and comparing it inside Spark avoids shipping every row
    # to a Python worker.
    .where(f.col('entity')[1] == 'PER')
)
person_mentions.groupby('entity').count().sort('count', ascending=False).show(100, truncate=False)



+-----------+-----+
|entity     |count|
+-----------+-----+
|[Dick, PER]|163  |
+-----------+-----+



                                                                                

In [30]:
# Lower-cased values of the `itemLabel` column of conspiracy.csv, collected
# into a set for membership tests.
conspiracy_theorists = {
    label.lower() for label in pd.read_csv('conspiracy.csv')['itemLabel']
}
def is_conspiracy_theorist(x):
    """Return True if ``x`` names someone in ``conspiracy_theorists``.

    Parameters
    ----------
    x : object
        Name to look up; it is converted with ``str`` before the lookup.

    Returns
    -------
    bool
        True when the lower-cased string form of ``x`` is in the
        module-level ``conspiracy_theorists`` set.
    """
    # The lookup set holds lower-cased labels, so the candidate must be
    # lower-cased too; otherwise any name containing capitals never matches.
    return str(x).lower() in conspiracy_theorists

In [39]:
# Number of quotes per speaker, restricted to speakers whose lower-cased name
# appears in `conspiracy_theorists`, most quoted first.
# Fixes: the empty `.where()` call (a condition is required) and the leading
# `select('quotation')`, which dropped the `speaker` column needed by groupby.
# NOTE(review): `df` must be the Quotebank frame that has a `speaker` column;
# other cells rebind `df`, so check the execution order.
# Grouping uses the raw `speaker` value, so case variants of the same name
# are counted as separate rows.
(
    df
    .where(f.lower(f.col('speaker')).isin(conspiracy_theorists))
    .groupby('speaker')
    .count()
    .sort('count', ascending=False)
    .show()
)



+------------------+------+
|           speaker| count|
+------------------+------+
|      Donald Trump|252214|
|        Alex Jones|  8586|
|        Mike Adams|  3971|
|      John Coleman|  3135|
|    Robert Spencer|  1670|
|      David Berger|  1621|
|     Pamela Geller|  1613|
|    Mike Cernovich|  1493|
|     Bradley Smith|  1464|
|      Jerome Corsi|  1324|
|     Patrick Moore|  1096|
|Paul Joseph Watson|   989|
|        Rick Wiles|   958|
|         Mark Lane|   816|
|      Jack Burkman|   704|
|      Craig Murray|   631|
|      DONALD Trump|   627|
|      DONALD TRUMP|   621|
| Daniel Greenfield|   583|
|     Kevin Barrett|   528|
+------------------+------+
only showing top 20 rows



                                                                                

In [None]:
# Attach the NER results to the quotes via `quoteID` and preview each
# quotation next to its entities.
quotes_with_entities = df.join(df_ner, on='quoteID')
quotes_with_entities.select('quotation', 'entities').show()

