In [141]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Widen pandas' column display so long cell values are not truncated when
# frames are rendered. The fully qualified option name is used because pandas
# resolves abbreviated keys by pattern matching, which can become ambiguous if
# new options with similar names are added.
pd.set_option("display.max_colwidth", 800)

In [142]:
# Run Spark on localhost.
# Use the documented `SparkSession.builder` entry point rather than
# instantiating the nested Builder class by hand. The driver host is pinned to
# the loopback address, and getOrCreate() reuses an already-running session if
# one exists.
spark = (
    SparkSession.builder
    .config("spark.driver.host", "127.0.0.1")
    .appName("wiki_bias")
    .getOrCreate()
)

In [143]:
import os

# Directory holding the corpus CSV files. It is defined once so the listing and
# the reader below cannot drift apart, and it can be overridden through the
# WIKI_BIAS_DATA_DIR environment variable; the default is the original path.
DATA_DIR = os.environ.get(
    "WIKI_BIAS_DATA_DIR",
    "/Users/chriswallerstein/Development/python/wikipedia_bias/data",
)

# Print every file found under the data directory so the inputs are visible
# in the cell output.
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Read all CSV files in the data directory into one DataFrame. The first row
# is used as the header, column types are inferred, quoted fields may span
# several lines, and rows that fail to parse are dropped ("dropmalformed").
wiki_data = (
    spark.read
    .option("mode", "dropmalformed")
    .option("inferSchema", "true")
    .option("header", "true")
    .option("multiline", "true")
    .option("charset", "UTF-8")
    .csv(os.path.join(DATA_DIR, "*.csv"))
)

/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_13.csv


In [144]:
# basic data cleansing
import pyspark.sql.functions as F

@F.udf
def ascii_ignore(x):
    """Return ``x`` with every non-ASCII character removed.

    Registered as a Spark UDF, so it is applied to one column value at a time.

    Args:
        x: The string value of the input column, or ``None`` for a null cell.

    Returns:
        The ASCII-only version of ``x``, or ``None`` when ``x`` is ``None``.
    """
    # A null cell arrives as None; pass it through instead of failing the
    # whole Spark job with an AttributeError on `.encode`.
    if x is None:
        return None
    # errors="ignore" silently drops characters that cannot be encoded.
    return x.encode("ascii", "ignore").decode("ascii")

# Discard rows that contain a null in any column, then derive "sentence_text"
# by running the ascii_ignore UDF over the "sentence" column.
wiki_data = (
    wiki_data
    .dropna()
    .withColumn("sentence_text", ascii_ignore("sentence"))
)

In [145]:
# tokenize
from pyspark.ml.feature import RegexTokenizer

# Turn each cleaned sentence into a list of tokens stored in a new "words"
# column, using the "\W" pattern, then preview the first rows.
regex_tokenizer = RegexTokenizer(
    inputCol="sentence_text",
    outputCol="words",
    pattern="\\W",
)
wiki_data = regex_tokenizer.transform(wiki_data)
wiki_data.show(5)

+--------------------+-------------+--------------------+--------------------+--------------------+
|                  id|       source|            sentence|       sentence_text|               words|
+--------------------+-------------+--------------------+--------------------+--------------------+
|20220213123309464889|conservapedia|"feisal abdul rau...|"feisal abdul rau...|[feisal, abdul, r...|
|20220213123309464934|conservapedia|feisal initially ...|feisal initially ...|[feisal, initiall...|
|20220213123309464950|conservapedia|according to isla...|according to isla...|[according, to, i...|
|20220213123309464963|conservapedia|the mosque was la...|the mosque was la...|[the, mosque, was...|
|20220213123309465060|conservapedia|in fact, in the m...|in fact, in the m...|[in, fact, in, th...|
+--------------------+-------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [146]:
# vectorize words
from pyspark.ml.feature import CountVectorizer

# Fit a CountVectorizer on the "words" column and use the fitted model to add
# a "features" vector column to the same DataFrame, then preview the result.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
)
cv_model = cv.fit(wiki_data)
wiki_data = cv_model.transform(wiki_data)
wiki_data.show(5)

+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+
|                  id|       source|            sentence|       sentence_text|               words|            features|
+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+
|20220213123309464889|conservapedia|"feisal abdul rau...|"feisal abdul rau...|[feisal, abdul, r...|(5369,[0,1,2,3,5,...|
|20220213123309464934|conservapedia|feisal initially ...|feisal initially ...|[feisal, initiall...|(5369,[0,1,4,5,18...|
|20220213123309464950|conservapedia|according to isla...|according to isla...|[according, to, i...|(5369,[0,1,2,3,5,...|
|20220213123309464963|conservapedia|the mosque was la...|the mosque was la...|[the, mosque, was...|(5369,[0,1,4,6,15...|
|20220213123309465060|conservapedia|in fact, in the m...|in fact, in the m...|[in, fact, in, th...|(5369,[0,1,4,6,7,...|
+--------------------+----------

In [147]:
# transform label column (source)
from pyspark.ml.feature import StringIndexer

# Fit a StringIndexer on the "source" column and use the fitted model to add
# a numeric "label" column, then preview the result.
si = StringIndexer(
    inputCol="source",
    outputCol="label",
)
si_model = si.fit(wiki_data)
wiki_data = si_model.transform(wiki_data)
wiki_data.show(5)

+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-----+
|                  id|       source|            sentence|       sentence_text|               words|            features|label|
+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-----+
|20220213123309464889|conservapedia|"feisal abdul rau...|"feisal abdul rau...|[feisal, abdul, r...|(5369,[0,1,2,3,5,...|  2.0|
|20220213123309464934|conservapedia|feisal initially ...|feisal initially ...|[feisal, initiall...|(5369,[0,1,4,5,18...|  2.0|
|20220213123309464950|conservapedia|according to isla...|according to isla...|[according, to, i...|(5369,[0,1,2,3,5,...|  2.0|
|20220213123309464963|conservapedia|the mosque was la...|the mosque was la...|[the, mosque, was...|(5369,[0,1,4,6,15...|  2.0|
|20220213123309465060|conservapedia|in fact, in the m...|in fact, in the m...|[in, fact, in, th...|(5369,[0,1,4

In [148]:
# Fixed seed so the 90/10 split, and therefore the row counts and every
# downstream metric, is the same on each run of the notebook.
SPLIT_SEED = 42

# Keep only the columns the classifier needs before splitting.
train, test = (
    wiki_data
    .select("features", "label")
    .randomSplit([0.9, 0.1], seed=SPLIT_SEED)
)
print(train.count())
print(test.count())

544
51


In [149]:
# Try Logistic regression
from pyspark.ml.classification import LogisticRegression

# Estimator with all-default parameters.
lr = LogisticRegression()

# Fit on the training split, then score the held-out split.
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)

# Show the first five scored rows as a pandas frame.
lr_predictions.limit(5).toPandas()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(6.0, 2.0, 2.0, 5.0, 2.0, 2.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[20.18425048667843, -7.1554656635259395, -13.028784823152492]","[0.999999999998658, 1.3381726185084823e-12, 3.764974487793454e-15]",0.0
1,"(4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...)",1.0,"[-16.074270756387254, 25.710696787746404, -9.636426031359154]","[7.128844950339842e-19, 0.9999999999999996, 4.455942812839158e-16]",1.0
2,"(6.0, 2.0, 1.0, 1.0, 2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",1.0,"[5.17970923625879, 1.6775232903159334, -6.857232526574723]","[0.9707443220595997, 0.02924992979860963, 5.748141790551174e-06]",0.0
3,"(5.0, 1.0, 1.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[31.896279496940615, -16.57714377382989, -15.31913572311073]","[1.0, 8.876867849194026e-22, 3.12324252449076e-21]",0.0
4,"(11.0, 3.0, 6.0, 7.0, 3.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...)",1.0,"[-3.1071025562341865, 20.950973718435, -17.843871162200806]","[3.562133803209278e-11, 0.9999999999643787, 1.41779337291852e-17]",1.0


In [150]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out predictions. The metric is named explicitly so the
# printed label cannot silently disagree with what is being computed.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# The model under evaluation is a LogisticRegression, so label it as such.
print(f"Logistic regression f1 {evaluator.evaluate(lr_predictions):.4f}")

Linear regression f1 0.8921


In [151]:
# Keep a reference to the `summary` attribute of the fitted logistic-regression
# model. NOTE(review): presumably inspected by later cells not visible here.
training_summary = lr_model.summary

