In [1]:
# Data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

# PySpark
import pyspark
from pyspark import SparkContext
from pyspark.sql import functions as f
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import IntegerType, BooleanType, ArrayType, StringType, MapType, FloatType

#Spark NLP
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Helpers
import os

In [2]:
conf = pyspark.SparkConf().setMaster("local[*]").setAll([
    ('spark.driver.memory','16G'),
    ('spark.driver.maxResultSize', '8G'),
    ('spark.sql.execution.arrow.pyspark.enabled', True),
    ('spark.local.dir', '/media/maculjak/2e9080dc-73f3-426f-a054-a46f620aea95/tmp')
])

spark = SparkSession.builder.config(conf=conf).config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
spark


21/11/09 18:44:15 WARN Utils: Your hostname, aventinus resolves to a loopback address: 127.0.1.1; using 192.168.1.118 instead (on interface wlp5s0)
21/11/09 18:44:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/maculjak/anaconda3/envs/ada/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/maculjak/.ivy2/cache
The jars for the packages stored in: /home/maculjak/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6d21f5a4-1e7e-4443-83de-f172435cf9de;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.3.2 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#

In [3]:
DATA_DIR = 'data/'
QUOTEBANK_DATA_DIR = DATA_DIR + 'quotebank_data/'

In [None]:
dfs = []

for i in os.listdir(QUOTEBANK_DATA_DIR):
    dfs.append(spark.read.json(QUOTEBANK_DATA_DIR + i))

In [15]:
df = dfs[0]
for df_part in dfs[1:]:
    df = df.union(df_part)

In [4]:
df = spark.read.parquet(DATA_DIR + 'qb_data.parquet')

In [None]:
pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')

In [2]:
def query_entities(x, entity):
    return any([i.entity == entity for i in x])

In [51]:
df = pipeline.annotate(dff.limit(100), column='quotation')

In [77]:

def entities_to_str_entity_pair(x):
    return [(i.result, i.metadata['entity']) for i in x] #if i.result not in ("I'd", "I'm", "He's", "She's", "I", "He'd", "She'd", "They", "They're", "You", "You're", "You'd", "He", "She", "I've", "W're")]
df = pipeline.annotate(dff.where(dff.date.rlike('^2020')), column='quotation')\
    .select('quoteID', f.udf(entities_to_str_entity_pair, ArrayType(ArrayType(StringType())))(f.col('entities')).alias('entities'))

df.write.parquet('2020-quotes-ner.parquet')
# df = df.withColumn('entities', ).show(truncate=False)

                                                                                

In [5]:
df_ner = spark.read.parquet('2020-quotes-ner.parquet')
# dff.where(f.col('quoteID') == '2020-02-10-003820').first()
# def has_person(x):
#     return any([i[1] == 'PER' for i in x])
# dfff.where(f.udf(has_person, BooleanType())(f.col('entities'))).()
def is_person(x):
    return x[1] == 'PER'


In [89]:
df_ner.select(f.explode(f.col('entities')).alias('entity')).where(f.udf(is_person, BooleanType())(f.col('entity')))\
.groupby('entity').count().sort('count', ascending=False).show(100, truncate=False)



+-----------+-----+
|entity     |count|
+-----------+-----+
|[Dick, PER]|163  |
+-----------+-----+



                                                                                

In [30]:
conspiracy_theorists = set(map(lambda x: x.lower(), pd.read_csv('conspiracy.csv')['itemLabel']))
def is_conspiracy_theorist(x):
    return str(x) in conspiracy_theorists

In [39]:
df.select('quotation').where(f.lower(f.col('speaker')).isin(conspiracy_theorists)).where().groupby('speaker').count().sort('count', ascending=False).show()



+------------------+------+
|           speaker| count|
+------------------+------+
|      Donald Trump|252214|
|        Alex Jones|  8586|
|        Mike Adams|  3971|
|      John Coleman|  3135|
|    Robert Spencer|  1670|
|      David Berger|  1621|
|     Pamela Geller|  1613|
|    Mike Cernovich|  1493|
|     Bradley Smith|  1464|
|      Jerome Corsi|  1324|
|     Patrick Moore|  1096|
|Paul Joseph Watson|   989|
|        Rick Wiles|   958|
|         Mark Lane|   816|
|      Jack Burkman|   704|
|      Craig Murray|   631|
|      DONALD Trump|   627|
|      DONALD TRUMP|   621|
| Daniel Greenfield|   583|
|     Kevin Barrett|   528|
+------------------+------+
only showing top 20 rows



                                                                                

In [None]:
df.join(df_ner, on='quoteID').select('quotation', 'entities', 'probas').show(truncate=False)

In [16]:
test_quotes = [i.quotation for i in df.sample(0.00001).select('quotation').collect()]

                                                                                

In [17]:
len(test_quotes)

1183

In [5]:
from top2vec import Top2Vec

In [19]:
import time
t = time.time()
model = Top2Vec(documents=test_quotes)
time.time() - t

2021-11-07 19:29:12,767 - top2vec - INFO - Pre-processing documents for training
2021-11-07 19:29:12,848 - top2vec - INFO - Creating joint document/word embedding
2021-11-07 19:29:15,147 - top2vec - INFO - Creating lower dimension embedding of documents
2021-11-07 19:29:18,273 - top2vec - INFO - Finding dense areas of documents
2021-11-07 19:29:18,311 - top2vec - INFO - Finding topics


5.553746938705444

list

In [22]:
df.where(f.col('speaker') != 'None').count()

                                                                                

75595368

In [25]:
df.where(f.col('speaker') != 'None').groupby('speaker').count().sort('count', ascending=False).show(100, truncate=False)



+--------------------------+------+
|speaker                   |count |
+--------------------------+------+
|President Donald Trump    |313624|
|Donald Trump              |252214|
|Narendra Modi             |147239|
|President Trump           |145751|
|Pope Francis              |102993|
|Hillary Clinton           |95458 |
|Bernie Sanders            |84018 |
|President Barack Obama    |73149 |
|Jose Mourinho             |65029 |
|Rodrigo Duterte           |60906 |
|Jurgen Klopp              |59651 |
|Benjamin Netanyahu        |58734 |
|Joe Biden                 |57891 |
|Vladimir Putin            |55895 |
|President Obama           |52870 |
|Rahul Gandhi              |52854 |
|Nancy Pelosi              |51999 |
|Boris Johnson             |49713 |
|Jeremy Corbyn             |48512 |
|Elizabeth Warren          |48397 |
|Mike Pence                |46893 |
|Pep Guardiola             |46313 |
|Ted Cruz                  |46301 |
|Lindsey Graham            |45069 |
|Andrew Cuomo              |

                                                                                

In [30]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol("quotation") \
    .setOutputCol("document")

wordSegmenter = WordSegmenterApproach() \
    .setInputCols(["document"]) \
    .setOutputCol("token") \
    .setPosColumn("tags") \
    .setNIterations(5)

pipeline = Pipeline().setStages([
    documentAssembler,
    wordSegmenter
])

pipelineModel = pipeline.fit(df.sample(0.00001))

IllegalArgumentException: requirement failed: Cannot train from DataFrame without POS annotatorType by posCol

In [33]:
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA

lda = LDA(k=20, seed=1, optimizer="em")
model = lda.fit(df.sample(0.00001).with, 'quotation')

ValueError: Params must be either a param map or a list/tuple of param maps, but got <class 'str'>.

In [34]:
df.select('speaker').distinct().count()

                                                                                

818225

In [38]:
df.select('speaker').groupby('speaker').count().where(f.col('count') < 1000).agg(f.sum('count')).show()



+----------+
|sum(count)|
+----------+
|  40800538|
+----------+



                                                                                

In [40]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 5.2 MB/s eta 0:00:01
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [41]:
from unidecode import unidecode

In [44]:
df.select('quotation').where()

"Trump's"

In [5]:
counts = df.select('speaker', 'qids').groupby('speaker', 'qids').count().sort('count', asc=False).toPandas()

                                                                                

In [43]:
d = counts[counts['speaker'] != 'None']
d['qids'] = d['qids'].apply(lambda x: 'Q' + str(min(map(lambda y: int(y[1:]), x))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['qids'] = d['qids'].apply(lambda x: 'Q' + str(min(map(lambda y: int(y[1:]), x))))


In [46]:
d.sort_values('count', ascending=False).groupby('qids').sum('count').sort_values(')

Unnamed: 0_level_0,count
qids,Unnamed: 1_level_1
Q1000053,943
Q1000074,2
Q1000087,8
Q1000204,36
Q1000275,118
...,...
Q999769,1535
Q999795,17
Q999889,120
Q999968,23


In [63]:
d[d.duplicated('qids', keep=False)].sort_values('speaker')

Unnamed: 0,speaker,qids,count
230242,( SANDY ) ALEX G,Q20709155,3
741178,( Sandy ) Alex G,Q20709155,170
55148,... LANGE,Q55088840,1
592590,... Lane,Q55088819,33
276326,... Lange,Q55088840,4
...,...,...,...
135921,Šárka Pančochová,Q2483405,2
32800,ŽELJKO KRAJAN,Q8083655,1
25533,Želimir Žilnik,Q1264297,1
179372,Željko Ivanek,Q382197,2


In [78]:
counts['lens'] = counts['qids'].apply(len)

In [85]:
counts[counts.lens > 1][counts['count'] > 10000]

  counts[counts.lens > 1][counts['count'] > 10000]


Unnamed: 0,speaker,qids,count,lens
817816,John Roberts,"[Q11153, Q14949621, Q16196671, Q19325651, Q208...",10030,22
817819,Doug Ford,"[Q28066064, Q4348031, Q5300478, Q5300480]",10073,4
817830,Mark Hughes,"[Q1494363, Q214513, Q21516094, Q3294110, Q3367...",10312,14
817834,John Williams,"[Q11310708, Q12633687, Q131285, Q1367551, Q149...",10365,51
817836,Winston Peters,"[Q1396178, Q5625319]",10409,2
...,...,...,...,...
818188,Imran Khan,"[Q155164, Q15987686, Q1660487, Q17306146, Q183...",36884,25
818189,Amit Shah,"[Q19946588, Q4746875, Q4746876]",37697,3
818191,Scott Morrison,"[Q1286476, Q21285393, Q7436904, Q7436906, Q743...",38350,5
818192,Theresa May,"[Q264766, Q30161835]",39871,2


In [143]:
speakers = set(counts[(counts['count'] > 10000) & (counts['speaker'] != 'None')].speaker)
len(speakers)

410

In [129]:
import time
t = time.time()
m = 0
for i in speakers:
    for j in speakers:
        m += len(i) + len(j)
print(time.time() - t)

28.05025625228882


In [144]:
counts[counts.speaker.isin(speakers)]['count'].sum()

8513416

In [145]:
import numpy as np

In [155]:
vecs = np.random.random((1500, 4 * 768))
np.linalg.norm(vecs @ vecs.T)


array([[251.41750425, 186.09194996, 188.45788462, ..., 187.97165664,
        191.33677433, 188.98229849],
       [186.09194996, 254.53951633, 187.83945308, ..., 186.46105363,
        189.36162003, 193.76817544],
       [188.45788462, 187.83945308, 256.02979783, ..., 182.79819681,
        192.79039054, 192.10726027],
       ...,
       [187.97165664, 186.46105363, 182.79819681, ..., 246.14677797,
        186.66657753, 189.5945456 ],
       [191.33677433, 189.36162003, 192.79039054, ..., 186.66657753,
        260.40973149, 200.96562751],
       [188.98229849, 193.76817544, 192.10726027, ..., 189.5945456 ,
        200.96562751, 266.66173144]])

In [None]:
conda install pandas

In [None]:
conda install matplotlib

In [None]:
conda install pyspark

In [9]:
from profanity_check import predict_prob
@f.pandas_udf('float')
def get_profanity(x):
    return float(predict_prob([x])[0])

In [27]:
df = df.withColumn('profanity', f.udf(get_profanity, FloatType())(f.col('quotation')))

In [None]:
df.where(f.col('profanity') > 0.5).count()

In [12]:
pip uninstall alt-profanity-check -y

Found existing installation: alt-profanity-check 1.0.1
Uninstalling alt-profanity-check-1.0.1:
  Successfully uninstalled alt-profanity-check-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install alt-profanity-check

Collecting alt-profanity-check
  Using cached alt_profanity_check-1.0.1-py3-none-any.whl
Installing collected packages: alt-profanity-check
Successfully installed alt-profanity-check-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [15]:
with bz2.open('data/quotebank_data/quotes-2020.json.bz2', 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance)
            print(instance)# loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            instance['domains'] = domains # updating the sample with domain name
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

NameError: name 'bz2' is not defined

In [14]:
!pwd

/home/maculjak/ada-2021-project-k-beans


In [56]:
a = ['dalfskj asldk jadlsk jadslk jadskf jasdj fadskjf hadskhf asdljf asldjf dalskfj adkslfj adskflj dafsl kjdaf lkjaf lasjdf daskfl jadsf'] * 1000000

In [51]:
import time
t = time.time()
# for i in a:
#     predict_prob([i])
p = predict_prob(a)
time.time() - t


13.358476638793945

In [6]:
df_2019 = df.select('quoteID', 'quotation').where(f.col('date').rlike('^2019')).where(f.col('speaker') != 'None').toPandas()

                                                                                

In [10]:
import time
t = time.time()
df_2019['profanity'] = predict_prob(df_2019['quotation'])
time.time() - t

231.47203707695007

In [59]:
getsizeof(df_2020)/8/1024/1024

168.27794075012207

In [53]:
from sys import getsizeof

In [71]:
predict_prob(['coward'])

array([0.94435019])

In [88]:
[i for i in df_2019[df_2019['profanity'] > 0.9].sort_values('profanity')['quotation'][:10]]

['Marathon Man for idiots.',
 "And years ago when we made `Clerks,' I remember sitting on set while we were shooting the movie going like, `Man, wouldn't it be cool if Jay and Silent Bob became like Cheech and Chong and they had their own movies? Nah, that's f -- ing stupid.' I live in that stupid f -- ing dream, man, because that dopey kid took a shot.",
 'Words to which objection has been taken by the Speaker over the years include blackguard, coward, git, guttersnipe, hooligan, rat, swine, stoolpigeon and traitor,',
 'You absolute thick, privileged, uncaring twat.',
 'Vanessa Hudgens, you absolute thick, privileged, uncaring twat,']

In [16]:
df_2019[df_2019['profanity'] > 0.5].count()

quoteID      164568
quotation    164568
profanity    164568
dtype: int64

In [14]:
df_2019.count()

quoteID      14183294
quotation    14183294
profanity    14183294
dtype: int64