In [130]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

pd.set_option("max_colwidth", 800)

In [131]:
# Run Spark on localhost
spark = SparkSession\
    .Builder()\
    .config("spark.driver.host", "127.0.0.1")\
    .appName("wiki_bias")\
    .getOrCreate()

In [132]:
import os
for dirname, _, filenames in os.walk("/Users/chriswallerstein/Development/python/wikipedia_bias/data"):
    for file in filenames:
        print(os.path.join(dirname, file))

wiki_data = spark.read\
    .option("mode", "dropmalformed")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .option("multiline", "true")\
    .option("charset", "UTF-8")\
    .csv("/Users/chriswallerstein/Development/python/wikipedia_bias/data/*.csv")

/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_13.csv


In [133]:
# basic data cleansing
import pyspark.sql.functions as F

@F.udf
def ascii_ignore(x):
    return x.encode("ascii", "ignore").decode("ascii")

wiki_data = wiki_data.dropna()
wiki_data = wiki_data.withColumn("sentence_text", ascii_ignore("sentence"))

In [134]:
# tokenize
from pyspark.ml.feature import RegexTokenizer

regex_tokenizer = RegexTokenizer(inputCol="sentence_text", outputCol="words", pattern="\\W")
wiki_data = regex_tokenizer.transform(wiki_data)
wiki_data.sample(True, 0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words
0,20220213123311980643,rational,"in particular, hip hop/country/southern rock/heartland rock artist kid rock (who is actually from detroit, like his good friend ted nugent, but recently moved to alabama and has gone on record as a raging libertarian or conservative, depending on the issue) has shouted down protests from his hometown to stop and/or apologize.http://www.rollingstone.com/music/news/kid-rock-tells-protestors-to-kiss-my-ass-over-confederate-flag-20150710=====2015=====in the wake of the charleston shootings, the battle flag came to increasing prominence due to pictures of an idiot with a bowl cut the confessed shooter with the flag.","in particular, hip hop/country/southern rock/heartland rock artist kid rock (who is actually from detroit, like his good friend ted nugent, but recently moved to alabama and has gone on record as a raging libertarian or conservative, depending on the issue) has shouted down protests from his hometown to stop and/or apologize.http://www.rollingstone.com/music/news/kid-rock-tells-protestors-to-kiss-my-ass-over-confederate-flag-20150710=====2015=====in the wake of the charleston shootings, the battle flag came to increasing prominence due to pictures of an idiot with a bowl cut the confessed shooter with the flag.","[in, particular, hip, hop, country, southern, rock, heartland, rock, artist, kid, rock, who, is, actually, from, detroit, like, his, good, friend, ted, nugent, but, recently, moved, to, alabama, and, has, gone, on, record, as, a, raging, libertarian, or, conservative, depending, on, the, issue, has, shouted, down, protests, from, his, hometown, to, stop, and, or, apologize, http, www, rollingstone, com, music, news, kid, rock, tells, protestors, to, kiss, my, ass, over, confederate, flag, 20150710, 2015, in, the, wake, of, the, charleston, shootings, the, battle, flag, came, to, increasing, prominence, due, to, pictures, of, an, idiot, with, a, bowl, cut, the, confessed, ...]"
1,20220213123311980773,rational,"neo-nazis in germany fly the confederate flag because it is illegal to display nazi symbols.the russian invasions of ukraine in 2014 were supported by american white supremacists richard spencer, matthew heimbach, and david duke.","neo-nazis in germany fly the confederate flag because it is illegal to display nazi symbols.the russian invasions of ukraine in 2014 were supported by american white supremacists richard spencer, matthew heimbach, and david duke.","[neo, nazis, in, germany, fly, the, confederate, flag, because, it, is, illegal, to, display, nazi, symbols, the, russian, invasions, of, ukraine, in, 2014, were, supported, by, american, white, supremacists, richard, spencer, matthew, heimbach, and, david, duke]"
2,20220213123311980865,rational,"does it remind you of islam?flag_of_alabama_(1861,_reverse).svg|the alabama flag (reverse side).","does it remind you of islam?flag_of_alabama_(1861,_reverse).svg|the alabama flag (reverse side).","[does, it, remind, you, of, islam, flag_of_alabama_, 1861, _reverse, svg, the, alabama, flag, reverse, side]"
3,20220213123312203753,rational,"la orientación sexual es la dirección que toma la sexualidad del individuo, usualmente clasificada en relación al sexo o género por el que se siente atraído.","la orientacin sexual es la direccin que toma la sexualidad del individuo, usualmente clasificada en relacin al sexo o gnero por el que se siente atrado.","[la, orientacin, sexual, es, la, direccin, que, toma, la, sexualidad, del, individuo, usualmente, clasificada, en, relacin, al, sexo, o, gnero, por, el, que, se, siente, atrado]"
4,20220213123312358984,rational,"a single photon of frequency 5.4×1014 hz, the frequency of visible yellow light, has a paltry 3.6×10-19 joules of energy.","a single photon of frequency 5.41014 hz, the frequency of visible yellow light, has a paltry 3.610-19 joules of energy.","[a, single, photon, of, frequency, 5, 41014, hz, the, frequency, of, visible, yellow, light, has, a, paltry, 3, 610, 19, joules, of, energy]"
5,20220213123313340269,rational,"thumb|right|195px|a strange little monument in little rock that is really important to one strange little man.stanley jason rapert is a christian fundamentalist state senator in arkansas and an advocate for christian theocracy, who is notable for his conflicts with the satanic temple.","thumb|right|195px|a strange little monument in little rock that is really important to one strange little man.stanley jason rapert is a christian fundamentalist state senator in arkansas and an advocate for christian theocracy, who is notable for his conflicts with the satanic temple.","[thumb, right, 195px, a, strange, little, monument, in, little, rock, that, is, really, important, to, one, strange, little, man, stanley, jason, rapert, is, a, christian, fundamentalist, state, senator, in, arkansas, and, an, advocate, for, christian, theocracy, who, is, notable, for, his, conflicts, with, the, satanic, temple]"
6,20220213123313340305,rational,"""he has tried to outlaw abortion in arkansas abill to outlaw abortion filed with hopes of reaching the trump supreme court arkansas times 18 november 2020 couplewhat rapert’s abortion bill doesn’t say means trouble for women arkansas times 23 january 2019 timesarkansas’s abortion ban and one man’s strong will the new york times 11 march 2013 in the hope of overturning roe v. wade and punishing sexually active women with """"vaginal probes.""\r","""he has tried to outlaw abortion in arkansas abill to outlaw abortion filed with hopes of reaching the trump supreme court arkansas times 18 november 2020 couplewhat raperts abortion bill doesnt say means trouble for women arkansas times 23 january 2019 timesarkansass abortion ban and one mans strong will the new york times 11 march 2013 in the hope of overturning roe v. wade and punishing sexually active women with """"vaginal probes.""\r","[he, has, tried, to, outlaw, abortion, in, arkansas, abill, to, outlaw, abortion, filed, with, hopes, of, reaching, the, trump, supreme, court, arkansas, times, 18, november, 2020, couplewhat, raperts, abortion, bill, doesnt, say, means, trouble, for, women, arkansas, times, 23, january, 2019, timesarkansass, abortion, ban, and, one, mans, strong, will, the, new, york, times, 11, march, 2013, in, the, hope, of, overturning, roe, v, wade, and, punishing, sexually, active, women, with, vaginal, probes]"
7,20220213123313340305,rational,"""he has tried to outlaw abortion in arkansas abill to outlaw abortion filed with hopes of reaching the trump supreme court arkansas times 18 november 2020 couplewhat rapert’s abortion bill doesn’t say means trouble for women arkansas times 23 january 2019 timesarkansas’s abortion ban and one man’s strong will the new york times 11 march 2013 in the hope of overturning roe v. wade and punishing sexually active women with """"vaginal probes.""\r","""he has tried to outlaw abortion in arkansas abill to outlaw abortion filed with hopes of reaching the trump supreme court arkansas times 18 november 2020 couplewhat raperts abortion bill doesnt say means trouble for women arkansas times 23 january 2019 timesarkansass abortion ban and one mans strong will the new york times 11 march 2013 in the hope of overturning roe v. wade and punishing sexually active women with """"vaginal probes.""\r","[he, has, tried, to, outlaw, abortion, in, arkansas, abill, to, outlaw, abortion, filed, with, hopes, of, reaching, the, trump, supreme, court, arkansas, times, 18, november, 2020, couplewhat, raperts, abortion, bill, doesnt, say, means, trouble, for, women, arkansas, times, 23, january, 2019, timesarkansass, abortion, ban, and, one, mans, strong, will, the, new, york, times, 11, march, 2013, in, the, hope, of, overturning, roe, v, wade, and, punishing, sexually, active, women, with, vaginal, probes]"
8,20220213123313340490,rational,5 february 2020 he also uses twitter to beg wal-mart not to sell satanic-themed merchandise.,5 february 2020 he also uses twitter to beg wal-mart not to sell satanic-themed merchandise.,"[5, february, 2020, he, also, uses, twitter, to, beg, wal, mart, not, to, sell, satanic, themed, merchandise]"
9,20220213123313340609,rational,"""""""seed"""" !!""\r","""""""seed"""" !!""\r",[seed]


In [135]:
# vectorize words
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="words", outputCol="features")
cv_model = cv.fit(wiki_data)
wiki_data = cv_model.transform(wiki_data)
wiki_data.sample(True, 0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words,features
0,20220213123309464889,conservapedia,"""feisal abdul rauf is a muslim religious leader, activist and author who is spearheading the proposal to construct an islamic cultural center two blocks from the wreckage of the world trade center.imam feisal abdul rauf's goodwill tour comes amid 'ground zero mosque' furor the media have dubbed the project the """"ground zero mosque"""" from its nearness to ground zero and because it will include a mosque.""\r","""feisal abdul rauf is a muslim religious leader, activist and author who is spearheading the proposal to construct an islamic cultural center two blocks from the wreckage of the world trade center.imam feisal abdul rauf's goodwill tour comes amid 'ground zero mosque' furor the media have dubbed the project the """"ground zero mosque"""" from its nearness to ground zero and because it will include a mosque.""\r","[feisal, abdul, rauf, is, a, muslim, religious, leader, activist, and, author, who, is, spearheading, the, proposal, to, construct, an, islamic, cultural, center, two, blocks, from, the, wreckage, of, the, world, trade, center, imam, feisal, abdul, rauf, s, goodwill, tour, comes, amid, ground, zero, mosque, furor, the, media, have, dubbed, the, project, the, ground, zero, mosque, from, its, nearness, to, ground, zero, and, because, it, will, include, a, mosque]","(6.0, 1.0, 2.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
1,20220213123311786617,rational,"david koch was responsible for founding americans for prosperity, a teabagger astroturf group.","david koch was responsible for founding americans for prosperity, a teabagger astroturf group.","[david, koch, was, responsible, for, founding, americans, for, prosperity, a, teabagger, astroturf, group]","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
2,20220213123311786627,rational,koch denied involvement but video later leaked of him at teabagger parties.,koch denied involvement but video later leaked of him at teabagger parties.,"[koch, denied, involvement, but, video, later, leaked, of, him, at, teabagger, parties]","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
3,20220213123311980156,rational,"thumb|right|165px|the battle flag, here shown with the proper aspect ratio.","thumb|right|165px|the battle flag, here shown with the proper aspect ratio.","[thumb, right, 165px, the, battle, flag, here, shown, with, the, proper, aspect, ratio]","(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
4,20220213123311980379,rational,"it made its way to the political scene in 1948, when members of the kappa alpha orderyes, that and yes, that figures.","it made its way to the political scene in 1948, when members of the kappa alpha orderyes, that and yes, that figures.","[it, made, its, way, to, the, political, scene, in, 1948, when, members, of, the, kappa, alpha, orderyes, that, and, yes, that, figures]","(2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
5,20220213123311980690,rational,nontroversies ensued when the flag was removed from various public buildings.,nontroversies ensued when the flag was removed from various public buildings.,"[nontroversies, ensued, when, the, flag, was, removed, from, various, public, buildings]","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
6,20220213123311980884,rational,"""it seems rather ironic now; the motto on the flag, sic semper tyrannis, translates as """"thus always to tyrants.""\r","""it seems rather ironic now; the motto on the flag, sic semper tyrannis, translates as """"thus always to tyrants.""\r","[it, seems, rather, ironic, now, the, motto, on, the, flag, sic, semper, tyrannis, translates, as, thus, always, to, tyrants]","(2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
7,20220213123312203789,rational,"está dividida en cuatro grandes categorías: la heterosexualidad, homosexualidad, bisexualidad, y la asexualidad, y otras orientaciones minoritarias.==opinión médica==la ciencia médica no ha comprobado qué factor determina la orientación sexual, sin embargo la mayoría de especialistas están de acuerdo en que se debe a una combinación compleja de factores, como los genes, el ambiente fetal y social.","est dividida en cuatro grandes categoras: la heterosexualidad, homosexualidad, bisexualidad, y la asexualidad, y otras orientaciones minoritarias.==opinin mdica==la ciencia mdica no ha comprobado qu factor determina la orientacin sexual, sin embargo la mayora de especialistas estn de acuerdo en que se debe a una combinacin compleja de factores, como los genes, el ambiente fetal y social.","[est, dividida, en, cuatro, grandes, categoras, la, heterosexualidad, homosexualidad, bisexualidad, y, la, asexualidad, y, otras, orientaciones, minoritarias, opinin, mdica, la, ciencia, mdica, no, ha, comprobado, qu, factor, determina, la, orientacin, sexual, sin, embargo, la, mayora, de, especialistas, estn, de, acuerdo, en, que, se, debe, a, una, combinacin, compleja, de, factores, como, los, genes, el, ambiente, fetal, y, social]","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
8,20220213123313340437,rational,"senator rapert, unfortunately, is not one of those people.instead, rapert has zealously promoted his teensy little monument ever since, fervently defending it in court and quickly re-erecting it when it was toppled by vandals.","senator rapert, unfortunately, is not one of those people.instead, rapert has zealously promoted his teensy little monument ever since, fervently defending it in court and quickly re-erecting it when it was toppled by vandals.","[senator, rapert, unfortunately, is, not, one, of, those, people, instead, rapert, has, zealously, promoted, his, teensy, little, monument, ever, since, fervently, defending, it, in, court, and, quickly, re, erecting, it, when, it, was, toppled, by, vandals]","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
9,20220213123313519251,rational,"there is variation among hierarchies, with more than 80 documented,hierarchies of evidence chris j. blunt but they follow a basic pattern of attempting to rank evidence based on:* relevance to the target population (usually humans, but potentially also animals in veterinary medicine), e.g.","there is variation among hierarchies, with more than 80 documented,hierarchies of evidence chris j. blunt but they follow a basic pattern of attempting to rank evidence based on:* relevance to the target population (usually humans, but potentially also animals in veterinary medicine), e.g.","[there, is, variation, among, hierarchies, with, more, than, 80, documented, hierarchies, of, evidence, chris, j, blunt, but, they, follow, a, basic, pattern, of, attempting, to, rank, evidence, based, on, relevance, to, the, target, population, usually, humans, but, potentially, also, animals, in, veterinary, medicine, e, g]","(1.0, 2.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"


In [136]:
# transform label column (source)
from pyspark.ml.feature import StringIndexer

si = StringIndexer(inputCol="source", outputCol="label")
si_model = si.fit(wiki_data)
wiki_data = si_model.transform(wiki_data)
wiki_data.sample(True, 0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words,features,label
0,20220213123311273317,conservapedia,"the breadth and variety of his activities and associations, and his ability to describe them vividly in writing, earned him international fame as lawrence of arabia, a title used for the 1962 film based on his wartime activities.","the breadth and variety of his activities and associations, and his ability to describe them vividly in writing, earned him international fame as lawrence of arabia, a title used for the 1962 film based on his wartime activities.","[the, breadth, and, variety, of, his, activities, and, associations, and, his, ability, to, describe, them, vividly, in, writing, earned, him, international, fame, as, lawrence, of, arabia, a, title, used, for, the, 1962, film, based, on, his, wartime, activities]","(2.0, 2.0, 3.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0
1,20220213123311786627,rational,koch denied involvement but video later leaked of him at teabagger parties.,koch denied involvement but video later leaked of him at teabagger parties.,"[koch, denied, involvement, but, video, later, leaked, of, him, at, teabagger, parties]","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
2,20220213123311786731,rational,snyder signed it into law despite referendums telling him not to.,snyder signed it into law despite referendums telling him not to.,"[snyder, signed, it, into, law, despite, referendums, telling, him, not, to]","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
3,20220213123311980336,rational,"unfortunately, it was also adopted as a symbol emblazoned on the white sheets of a bunch of idiots shortly thereafter,one theory ties this to a desire to appear as ghosts of the war dead.","unfortunately, it was also adopted as a symbol emblazoned on the white sheets of a bunch of idiots shortly thereafter,one theory ties this to a desire to appear as ghosts of the war dead.","[unfortunately, it, was, also, adopted, as, a, symbol, emblazoned, on, the, white, sheets, of, a, bunch, of, idiots, shortly, thereafter, one, theory, ties, this, to, a, desire, to, appear, as, ghosts, of, the, war, dead]","(2.0, 3.0, 0.0, 2.0, 0.0, 3.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
4,20220213123312203852,rational,"doi:10.1007/s10508-008-9386-1 de todas formas, las hipótesis sobre el origen o la influencia del ambiente post-natal a la orientación sexual son deficientes, sobretodo para los hombres.bailey jm, vasey pl, diamond lm, breedlove sm, vilain e, epprecht m (2016).","doi:10.1007/s10508-008-9386-1 de todas formas, las hiptesis sobre el origen o la influencia del ambiente post-natal a la orientacin sexual son deficientes, sobretodo para los hombres.bailey jm, vasey pl, diamond lm, breedlove sm, vilain e, epprecht m (2016).","[doi, 10, 1007, s10508, 008, 9386, 1, de, todas, formas, las, hiptesis, sobre, el, origen, o, la, influencia, del, ambiente, post, natal, a, la, orientacin, sexual, son, deficientes, sobretodo, para, los, hombres, bailey, jm, vasey, pl, diamond, lm, breedlove, sm, vilain, e, epprecht, m, 2016]","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
5,20220213123312204006,rational,"de ser cierta la hipótesis epigenética, es posible que en el futuro la terapia epigenética sea capaz de alterar la orientación sexual.tenga en cuenta el uso extensivo de calificadores.","de ser cierta la hiptesis epigentica, es posible que en el futuro la terapia epigentica sea capaz de alterar la orientacin sexual.tenga en cuenta el uso extensivo de calificadores.","[de, ser, cierta, la, hiptesis, epigentica, es, posible, que, en, el, futuro, la, terapia, epigentica, sea, capaz, de, alterar, la, orientacin, sexual, tenga, en, cuenta, el, uso, extensivo, de, calificadores]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
6,20220213123312204033,rational,"junto con la naturaleza todavía especulativa de la causa epigenética de la homosexualidad, hay mucha ciencia que hacer entre ahora y este tratamiento hipotético==opinión evangélica==un número relevante de evangélicos te dirá que la orientación sexual es una decisión y que cualquiera que escoja ser gay será horneado en el escupitajo sodomita especial de satanás.","junto con la naturaleza todava especulativa de la causa epigentica de la homosexualidad, hay mucha ciencia que hacer entre ahora y este tratamiento hipottico==opinin evanglica==un nmero relevante de evanglicos te dir que la orientacin sexual es una decisin y que cualquiera que escoja ser gay ser horneado en el escupitajo sodomita especial de satans.","[junto, con, la, naturaleza, todava, especulativa, de, la, causa, epigentica, de, la, homosexualidad, hay, mucha, ciencia, que, hacer, entre, ahora, y, este, tratamiento, hipottico, opinin, evanglica, un, nmero, relevante, de, evanglicos, te, dir, que, la, orientacin, sexual, es, una, decisin, y, que, cualquiera, que, escoja, ser, gay, ser, horneado, en, el, escupitajo, sodomita, especial, de, satans]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
7,20220213123312358894,rational,==electromagnetism==electromagnetic rays or electromagnetic waves that scientists study range from radio waves with very long wavelength through to very short wave gamma rays or x-rays.,==electromagnetism==electromagnetic rays or electromagnetic waves that scientists study range from radio waves with very long wavelength through to very short wave gamma rays or x-rays.,"[electromagnetism, electromagnetic, rays, or, electromagnetic, waves, that, scientists, study, range, from, radio, waves, with, very, long, wavelength, through, to, very, short, wave, gamma, rays, or, x, rays]","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
8,20220213123312358997,rational,"you'd have to have 2.8×1018 such photons hitting you every second just to receive 1 watt of light.==speed of light==the speed of a photon, in a vacuum, is constant (generally denoted c) and is the limiting speed of information transfer in the universe.","you'd have to have 2.81018 such photons hitting you every second just to receive 1 watt of light.==speed of light==the speed of a photon, in a vacuum, is constant (generally denoted c) and is the limiting speed of information transfer in the universe.","[you, d, have, to, have, 2, 81018, such, photons, hitting, you, every, second, just, to, receive, 1, watt, of, light, speed, of, light, the, speed, of, a, photon, in, a, vacuum, is, constant, generally, denoted, c, and, is, the, limiting, speed, of, information, transfer, in, the, universe]","(3.0, 4.0, 1.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0
9,20220213123313340698,rational,"which is outrageous, because it is well known that republicans are an oppressed minority.arkansas lawmaker opposes hate-crime bill because it doesn’t protect republicans friendly atheist 22 august 2020==see also==* satanic panic* american taliban* kim davis==references==category:united states politicianscategory:satanismcategory:shysterscategory:living peoplecategory:authoritarian wingnuttery","which is outrageous, because it is well known that republicans are an oppressed minority.arkansas lawmaker opposes hate-crime bill because it doesnt protect republicans friendly atheist 22 august 2020==see also==* satanic panic* american taliban* kim davis==references==category:united states politicianscategory:satanismcategory:shysterscategory:living peoplecategory:authoritarian wingnuttery","[which, is, outrageous, because, it, is, well, known, that, republicans, are, an, oppressed, minority, arkansas, lawmaker, opposes, hate, crime, bill, because, it, doesnt, protect, republicans, friendly, atheist, 22, august, 2020, see, also, satanic, panic, american, taliban, kim, davis, references, category, united, states, politicianscategory, satanismcategory, shysterscategory, living, peoplecategory, authoritarian, wingnuttery]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0


In [137]:
train, test = wiki_data.select("features", "label").randomSplit([0.9,0.1])
print(train.count())
print(test.count())

539
56


In [138]:
# Try Logistic regression
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)
lr_predictions.limit(5).toPandas()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(3.0, 1.0, 2.0, 2.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...)",0.0,"[12.110304106810734, -3.206377919342339, -8.903926187468395]","[0.9999997763835844, 2.2286887347190357e-07, 7.475420800285711e-10]",0.0
1,"(5.0, 1.0, 1.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[31.617782787282025, -16.639744971806103, -14.978037815475918]","[1.0, 1.1015950707415723e-21, 5.8035299826442485e-21]",0.0
2,"(2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[13.72602006475191, -2.2450515512500564, -11.480968513501853]","[0.9999998841505419, 1.1583816701534736e-07, 1.1291298917678416e-11]",0.0
3,"(9.0, 3.0, 4.0, 3.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[9.896213821826361, -2.0296672210330216, -7.866546600793347]","[0.9999933638195524, 6.616872848106036e-06, 1.930759956252672e-08]",0.0
4,"(3.0, 4.0, 1.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[14.756575644500764, -5.962374679345094, -8.794200965155671]","[0.9999999989365151, 1.0043248372686986e-09, 5.915993299341227e-11]",0.0


In [139]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
print(f"Linear regression f1 {evaluator.evaluate(lr_predictions):.4f}")

Linear regression f1 0.9031


In [140]:
training_summary = lr_model.summary

