In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import findspark
findspark.init()

import re
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import DateType

from pyspark.mllib.clustering import LDA, LDAModel

# Reuse the active SparkSession if one already exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()
# SparkContext behind the session; the RDD-based file reads below go through it.
sc = spark.sparkContext

In [2]:
# Load the stored feature table (columns: textID, features).
# Presumably TF-IDF vectors, going by the file name — confirm against the notebook that wrote it.
test = spark.read.parquet('tfidf_all.parquet')

In [3]:
# Preview the loaded rows to check the schema and the sparse-vector layout.
test.show()

+-------+--------------------+
| textID|            features|
+-------+--------------------+
|1787313|(335,[0,2,4,5,7,8...|
|1787819|(335,[1,2,3,4,6,9...|
|1787820|(335,[18,20,22,23...|
+-------+--------------------+



In [5]:
# Sanity check: Euclidean (L2) norm of the feature vector for the first three documents.
feature_norms = test.rdd.map(lambda row: row['features'].norm(2))
feature_norms.take(3)

[1.0, 1.0, 1.0]

In [10]:
# Read voc.txt into a Python list on the driver, one entry per line.
# Presumably the vocabulary behind the feature indices — confirm.
voc = sc.textFile('voc.txt').collect()

In [27]:
# Read every file matching the glob as lines of text and skip the three leading rows.
# zipWithIndex pairs each line with its position in the RDD; positions 0-2 are dropped
# and only the line text is kept.
# NOTE(review): the index runs over the whole RDD, not per file. If the glob matches
# more than one file, only the first file loses its leading rows — confirm this is intended.
numbered_lines = sc.textFile('../*-*-*.txt').zipWithIndex()
wlp_rdd = numbered_lines.filter(lambda pair: pair[1] >= 3).map(lambda pair: pair[0])

In [28]:
def _wlp_row(fields):
    """Build a typed Row from the tab-separated fields of one corpus line.

    The first two fields are converted to integers (textID, idseq); the
    remaining three are kept as strings (word, lemma, pos).
    """
    return Row(
        textID=int(fields[0]),
        idseq=int(fields[1]),
        word=fields[2],
        lemma=fields[3],
        pos=fields[4],
    )


# One record per corpus line, with its attributes separated by tabs.
lines = wlp_rdd.map(lambda raw: raw.split('\t'))

# Name and type the columns, then let Spark infer the DataFrame schema from the Rows.
wlp_schema = lines.map(_wlp_row)
wlp = spark.createDataFrame(wlp_schema)
wlp.show()

+----------+-----------+-------+-------+-----------+
|     idseq|      lemma|    pos| textID|       word|
+----------+-----------+-------+-------+-----------+
|2654351732|   official|    nn2|1787313|  officials|
|2654351733|       have|    vh0|1787313|       have|
|2654351734|     accuse|    vvn|1787313|    accused|
|2654351735|       mine|vvg_nn1|1787313|     mining|
|2654351736|       firm|    nn2|1787313|      firms|
|2654351737|         of|     io|1787313|         of|
|2654351738|      react|    vvg|1787313|   reacting|
|2654351739|       like|     ii|1787313|       like|
|2654351740|    spoiled|    jj@|1787313|    spoiled|
|2654351741|      child|    nn2|1787313|   children|
|2654351742|           |      ,|1787313|          ,|
|2654351743|        but|    ccb|1787313|        but|
|2654351744|        the|     at|1787313|        the|
|2654351745|  tanzanian|     jj|1787313|  Tanzanian|
|2654351746| government|    nn1|1787313| government|
|2654351747|         's|     ge|1787313|      

In [29]:
# Number of rows per textID in wlp (includes punctuation rows; nothing is filtered yet).
wlp.groupBy('textID').count().show()

+-------+-----+
| textID|count|
+-------+-----+
|1787820|  169|
|1787313|  846|
|1787819|  386|
+-------+-----+



In [30]:
# POS values whose rows are discarded outright (punctuation marks, quotes and the literal 'null').
pos_remove = ['.', ',', "'", '"', 'null']

# Keep a row only when its tag is not listed above and does not start with 'm' or 'f'
# (presumably numeral and formula/foreign-word tags — confirm against the tagset).
# After filtering, only textID and lemma are still needed.
pos_tag = wlp['pos']
keep_token = ~pos_tag.isin(pos_remove) & ~pos_tag.startswith('m') & ~pos_tag.startswith('f')
wlp_nopos = wlp.filter(keep_token).drop('idseq', 'pos', 'word')

In [31]:
# Disabled export step, left here for reference only:
#np.save('our_stopwords',stopwords)
# Load the stop-word list into a Python list on the driver, one entry per line of the file.
stopwords = sc.textFile('../our_stopwords.txt').collect()
# Report the size of the list as a quick check that the file was read.
print('Number of stopwords: ', len(stopwords))

Number of stopwords:  5639


In [32]:
# Drop stop-word lemmas, then rank the remaining lemmas by how often they occur.
# The column is taken from wlp_nopos (the frame being filtered) rather than from its
# ancestor wlp, so the expression does not depend on Spark resolving a column
# reference across DataFrame lineage.
wlp_nostop = wlp_nopos.filter(~wlp_nopos['lemma'].isin(stopwords))

# lemma_freq is used by several later actions (show, approxQuantile, count).
# Caching it avoids re-reading and re-aggregating the raw text for each of them.
lemma_freq = (
    wlp_nostop
    .groupBy('lemma')
    .count()
    .sort('count', ascending=False)
    .cache()
)
lemma_freq.show()

+-----------+-----+
|      lemma|count|
+-----------+-----+
|       mine|   10|
|   tanzania|    9|
|     mining|    8|
| government|    6|
|     cookie|    6|
|        fee|    6|
|    royalty|    6|
|     public|    6|
|legislation|    6|
|   industry|    6|
|     device|    6|
| investment|    6|
|      astro|    5|
|     change|    5|
|   investor|    5|
|     sector|    5|
|        use|    5|
|  tanzanian|    5|
|       high|    4|
|   minister|    4|
+-----------+-----+
only showing top 20 rows



In [33]:
# Frequency band: keep lemmas whose count lies strictly between the approximate
# 80th and 99th percentiles of the count distribution (relative error 0.01).
bottom, top = lemma_freq.approxQuantile('count', [0.8, 0.99], 0.01)
in_band = (lemma_freq['count'] > bottom) & (lemma_freq['count'] < top)
lemma_tokeep = lemma_freq.filter(in_band)

# Report how many lemmas survive, in absolute terms and as a share of all lemmas.
c = lemma_tokeep.count()
print('Number of lemmas left: %d'%c)
print('Percentage of lemmas left: %.2f'%(c/lemma_freq.count()*100))

Number of lemmas left: 46
Percentage of lemmas left: 12.81


In [34]:
# Expose both frames to Spark SQL. createOrReplaceTempView replaces registerTempTable,
# which is deprecated since Spark 2.0; it also makes the cell safe to re-run,
# because an existing view of the same name is simply replaced.
wlp_nostop.createOrReplaceTempView('wlp_nostop')
lemma_tokeep.createOrReplaceTempView('lemma_tokeep')

# The inner join keeps only the token rows whose lemma survived the frequency filter.
query = """
SELECT wlp_nostop.lemma, wlp_nostop.textID
FROM wlp_nostop
INNER JOIN lemma_tokeep ON wlp_nostop.lemma = lemma_tokeep.lemma
"""

wlp_kept = spark.sql(query)

# Collect the kept lemmas of each text into one list column.
# The aggregate is named with alias() rather than renamed afterwards via the
# auto-generated name 'collect_list(lemma)': withColumnRenamed silently does nothing
# when that generated name does not match.
wlp_bytext = (
    wlp_kept
    .groupBy('textID')
    .agg(collect_list('lemma').alias('lemma_list'))
    .sort('textID')
)
wlp_bytext.show()

+-------+--------------------+
| textID|          lemma_list|
+-------+--------------------+
|1787313|[subject, subject...|
|1787819|[subject, set, re...|
|1787820|[astro, astro, as...|
+-------+--------------------+

