In [1]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

pd.set_option("max_colwidth", 800)

In [2]:
# Run Spark on localhost
spark = SparkSession\
    .Builder()\
    .config("spark.driver.host", "127.0.0.1")\
    .appName("wiki_bias")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
import os
for dirname, _, filenames in os.walk("/Users/chriswallerstein/Development/python/wikipedia_bias/data"):
    for file in filenames:
        print(os.path.join(dirname, file))

wiki_data = spark.read\
    .option("mode", "dropmalformed")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .option("multiline", "true")\
    .option("charset", "UTF-8")\
    .csv("/Users/chriswallerstein/Development/python/wikipedia_bias/data/*.csv")

/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_16.csv
/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_17.csv
/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_13.csv


In [4]:
# basic data cleansing
import pyspark.sql.functions as F

@F.udf
def ascii_ignore(x):
    return x.encode("ascii", "ignore").decode("ascii")

wiki_data = wiki_data.dropna()
wiki_data = wiki_data.withColumn("sentence_text", ascii_ignore("sentence"))

In [5]:
# tokenize
from pyspark.ml.feature import RegexTokenizer, Tokenizer

regex_tokenizer = RegexTokenizer(inputCol="sentence_text", outputCol="words", gaps=False, pattern="[a-z]+")
wiki_data = regex_tokenizer.transform(wiki_data)
wiki_data.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+--------------------+
|                  id|       source|            sentence|       sentence_text|               words|
+--------------------+-------------+--------------------+--------------------+--------------------+
|20220216130833233044|conservapedia|right|thumb|the t...|right|thumb|the t...|[right, thumb, th...|
|20220216130833233089|conservapedia|a great player bo...|a great player bo...|[a, great, player...|
|20220216130833233103|conservapedia|in his career (18...|in his career (18...|[in, his, career,...|
|20220216130833404323|conservapedia|thumb|left|buildi...|thumb|left|buildi...|[thumb, left, bui...|
|20220216130833588118|conservapedia|robert lloyd dunc...|robert lloyd dunc...|[robert, lloyd, d...|
+--------------------+-------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [6]:
# vectorize words
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="words", outputCol="features")
cv_model = cv.fit(wiki_data)
wiki_data = cv_model.transform(wiki_data)
wiki_data.select("words", "features").show(5)

                                                                                

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[right, thumb, th...|(20579,[0,1,2,5,8...|
|[a, great, player...|(20579,[1,2,5,8,1...|
|[in, his, career,...|(20579,[0,1,2,3,4...|
|[thumb, left, bui...|(20579,[0,2,4,5,6...|
|[robert, lloyd, d...|(20579,[0,1,2,3,4...|
+--------------------+--------------------+
only showing top 5 rows



In [7]:
# transform label column (source)
from pyspark.ml.feature import StringIndexer

si = StringIndexer(inputCol="source", outputCol="label")
si_model = si.fit(wiki_data)
wiki_data = si_model.transform(wiki_data)
wiki_data.sample(True, 0.1).select("source", "label").show(5)

+-------------+-----+
|       source|label|
+-------------+-----+
|conservapedia|  2.0|
|conservapedia|  2.0|
|conservapedia|  2.0|
|conservapedia|  2.0|
|conservapedia|  2.0|
+-------------+-----+
only showing top 5 rows



In [8]:
train, test = wiki_data.select("features", "label").randomSplit([0.9,0.1])
print(train.count())
print(test.count())

4749
564


In [9]:
# Try Logistic regression
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)
lr_predictions.limit(5).toPandas()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",1.0,"[5.112159899323037, 6.398755982340817, -11.510915881663854]","[0.2164295135004896, 0.783570473437609, 1.3061901345478775e-08]",1.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",1.0,"[5.112159899323037, 6.398755982340817, -11.510915881663854]","[0.2164295135004896, 0.783570473437609, 1.3061901345478775e-08]",1.0
2,"(6.0, 3.0, 2.0, 6.0, 1.0, 1.0, 3.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...)",0.0,"[380.3160592113952, -262.01303038796, -118.303028823435]","[1.0, 1.0965307650501619e-279, 2.834532852316016e-217]",0.0
3,"(3.0, 1.0, 5.0, 3.0, 1.0, 2.0, 1.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[275.5727825483716, -212.6299820055412, -62.94280054283031]","[1.0, 9.467456194216645e-213, 9.650509114290981e-148]",0.0
4,"(3.0, 4.0, 3.0, 7.0, 3.0, 4.0, 2.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[196.22676782720345, -106.4359471708314, -89.79082065637205]","[1.0, 3.591310857363413e-132, 6.083305041761479e-125]",0.0


In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
print(f"Linear regression f1 {evaluator.evaluate(lr_predictions):.4f}")

Linear regression f1 0.7625


In [20]:
# create TF-IDF weightings - to be used later
from pyspark.ml.feature import IDF

# we created the term frequency vectors above with CountVectorizer
idf = IDF(inputCol="features", outputCol="features-idf")
idfModel = idf.fit(wiki_data)
wiki_data_idf = idfModel.transform(wiki_data)
wiki_data_idf.sample(True, 0.005)\
    .select("features","features-idf").toPandas()

Unnamed: 0,features,features-idf
0,"(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.0, 0.8339634986917686, 0.909071837730076, 0.0, 1.0444064164711266, 0.0, 2.9911031139289177, 0.0, 0.0, 1.7947749257157988, 0.0, 0.0, 0.0, 1.9620349411869416, 0.0, 2.017069460423186, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.4666327868170805, 0.0, 0.0, 0.0, 0.0, 2.6754667929183924, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.943310523150509, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.521854320971451, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.5944935046114224, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
2,"(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.5194564141041411, 0.0, 0.0, 0.0, 1.0444064164711266, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
3,"(0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.0, 0.0, 0.909071837730076, 0.9887643031491419, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.943466768458073, 0.0, 0.0, 0.0, 0.0, 2.3358768608645937, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.943310523150509, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.407616131281607, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0133431134140298, 2.9911031139289177, 0.0, 0.0, 0.0, 0.0, 0.0, 1.8700160424666894, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
5,"(2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...)","(1.0389128282082822, 0.0, 0.0, 1.9775286062982838, 0.0, 0.0, 1.4955515569644589, 1.6693453470045383, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.943466768458073, 0.0, 0.0, 0.0, 0.0, 0.0, 2.3242713147442857, 0.0, 0.0, 0.0, 0.0, 2.5307279472734807, 2.6119533871960665, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.599970934178483, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.3631643687107733, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.7105656758641765, 0.0, 0.0, 0.0, ...)"
6,"(1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.5194564141041411, 0.8339634986917686, 0.0, 0.0, 0.0, 1.0133431134140298, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.390714320479004, 3.4905037910873746, 0.0, 0.0, 0.0, 0.0, 0.0, 3.5411475239061296, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.4843499255129964, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.774079081586502, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
7,"(1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...)","(0.5194564141041411, 0.0, 0.909071837730076, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.8972454475295437, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.353541697044399, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.7105656758641765, ...)"
8,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.207462098192096, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.5411475239061296, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.4843499255129964, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
9,"(1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)","(0.5194564141041411, 0.0, 0.0, 0.0, 1.0444064164711266, 1.0133431134140298, 0.0, 0.0, 0.0, 0.0, 1.8972454475295437, 0.0, 0.0, 0.0, 0.0, 0.0, 2.2133493694678474, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.822357912732847, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.235765874354948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.5476622049273234, 0.0, 3.5029263110859317, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"


In [12]:
# Remove stop words
from pyspark.ml.feature import StopWordsRemover

english_stop_words = StopWordsRemover.loadDefaultStopWords("english")
sw_remover = StopWordsRemover(
    inputCol="words",
    outputCol="sw_free_words",
    stopWords=english_stop_words)
sw_free_wiki_data = sw_remover.transform(wiki_data)

In [13]:
# flatten -> (source, word)
sw_free_wiki_data_rdd = sw_free_wiki_data\
    .select("source", "sw_free_words").rdd\
    .flatMapValues(lambda x: x)

# convert each pair to (pair, 1) so that the occurrences can be counted
# the key is now (source, word), i.e. (conservapedia, obamacare)
sw_free_wiki_data_rdd = sw_free_wiki_data_rdd\
    .map(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: x + y)

# flip (key, count) to (count, key) and sort
sw_free_wiki_data_rdd\
    .map(lambda x: (x[1], x[0]))\
    .sortByKey(ascending=False)\
    .filter(lambda x : x[1][0] == "conservapedia")\
    .take(10)



[(67, ('conservapedia', 'committee')),
 (58, ('conservapedia', 'american')),
 (27, ('conservapedia', 'also')),
 (23, ('conservapedia', 'peace')),
 (22, ('conservapedia', 'duncan')),
 (22, ('conservapedia', 'new')),
 (22, ('conservapedia', 'league')),
 (19, ('conservapedia', 'council')),
 (18, ('conservapedia', 'school')),
 (17, ('conservapedia', 'people'))]

In [14]:
sw_free_wiki_data_rdd\
    .map(lambda x: (x[1], x[0]))\
    .sortByKey(ascending=False)\
    .filter(lambda x : x[1][0] == "wikipedia")\
    .take(10)

[(162, ('wikipedia', 'style')),
 (132, ('wikipedia', 'new')),
 (120, ('wikipedia', 'episode')),
 (106, ('wikipedia', 'width')),
 (103, ('wikipedia', 'align')),
 (93, ('wikipedia', 'faj')),
 (92, ('wikipedia', 'also')),
 (90, ('wikipedia', 'treaties')),
 (89, ('wikipedia', 'left')),
 (83, ('wikipedia', 'first'))]

In [15]:
sw_free_wiki_data_rdd\
    .map(lambda x: (x[1], x[0]))\
    .sortByKey(ascending=False)\
    .filter(lambda x : x[1][0] == "rational")\
    .take(10)

[(326, ('rational', 'right')),
 (253, ('rational', 'one')),
 (249, ('rational', 'also')),
 (234, ('rational', 'align')),
 (225, ('rational', 'people')),
 (211, ('rational', 'de')),
 (173, ('rational', 'px')),
 (172, ('rational', 'war')),
 (160, ('rational', 'even')),
 (159, ('rational', 'comfort'))]

In [62]:
# Use TD-IDF to find unique words in each source
from sklearn.feature_extraction.text import TfidfVectorizer

wiki_data_by_source = sw_free_wiki_data_rdd.map(lambda x : x[0]).reduceByKey(lambda a,b: " ".join([a,b]))
column_names = [source[0] for source in wiki_data_by_source.toLocalIterator()]
wiki_data_by_source = wiki_data_by_source.map(lambda x: x[1])



In [64]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (1,1), max_df = .6, min_df = .05)
corpus = []
for it in wiki_data_by_source.toLocalIterator():
    corpus.append(it[1:])
X = vectorizer.fit_transform(corpus)

In [65]:
feature_names = vectorizer.get_feature_names()
feature_names



['aak',
 'aaron',
 'aba',
 'abacetus',
 'abahachi',
 'abandoning',
 'abandonment',
 'abbotsford',
 'abbott',
 'abbrev',
 'abdel',
 'abdication',
 'abdomen',
 'abducted',
 'abductions',
 'abdullatif',
 'abdzi',
 'abe',
 'abebooks',
 'abelheira',
 'abide',
 'abies',
 'abiezer',
 'abigail',
 'abill',
 'abingdon',
 'abiogenesis',
 'abject',
 'ablation',
 'ableism',
 'ableismcategory',
 'abnormalities',
 'abnormally',
 'aboard',
 'abolished',
 'abomination',
 'abord',
 'abort',
 'aborted',
 'abortionis',
 'abortions',
 'abound',
 'aboutseekfindnet',
 'abrahamic',
 'abrams',
 'abrilada',
 'abruptly',
 'absent',
 'absentee',
 'absentiaduped',
 'absoltne',
 'absolument',
 'absolute',
 'absolutely',
 'absolve',
 'absorb',
 'absorbing',
 'absord',
 'abstentions',
 'abstract',
 'abstracts',
 'abstruse',
 'absurd',
 'absurdum',
 'abu',
 'abundance',
 'aburish',
 'abused',
 'abusegirls',
 'abuser',
 'abusers',
 'abusive',
 'abusively',
 'abyssinia',
 'academia',
 'academies',
 'academy',
 'acar',
 

In [66]:
dense = X.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.head()

Unnamed: 0,aak,aaron,aba,abacetus,abahachi,abandoning,abandonment,abbotsford,abbott,abbrev,...,zoning,zoo,zoological,zoology,zoos,zoroastrian,zoroastrianism,zoroastrians,zozo,zwayne
0,0.0,0.0,0.0,0.0,0.0,0.053074,0.0,0.0,0.053074,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.013833,0.0,0.013833,0.013833,0.013833,0.0,0.013833,0.013833,0.0,0.0,...,0.013833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.009822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009822,...,0.0,0.009822,0.009822,0.009822,0.009822,0.009822,0.009822,0.009822,0.009822,0.009822


In [67]:
data = df.transpose()
data.columns = column_names

In [68]:
# Find 20 most unique words in each source
top_dict = {}
for c in range(3):
    top = data.iloc[:,c].sort_values(ascending=False).head(20)
    top_dict[data.columns[c]]= list(zip(top.index, top.values))

for source, top_words in top_dict.items():
    print(source)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

conservapedia
shortstop, stinging, seliger, localities, dickens, schappes, rarest, milam, trier, wagner, objection, wireless, attorneyscategory, attract
---
wikipedia
aak, krlov, kucha, kuba, kuala, ktv, ktitel, ksqof, ksiyc, ksiniczka, krzysztof, krowa, kroll, krol
---
rational
zwayne, souvent, joking, soviets, jolts, jon, sovereign, souzainterview, souza, jordan, souvenirs, joints, soutient, jornadas
---
