In [43]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Let long sentence columns render (up to 800 characters) instead of being
# truncated. The fully-qualified option key is used because abbreviated keys
# are resolved by pattern matching and can become ambiguous.
pd.set_option("display.max_colwidth", 800)

In [5]:
# Run Spark on localhost: the driver is bound to the loopback address and
# getOrCreate() hands back an existing session if one is already running.
# `SparkSession.builder` is the documented entry point for building a session.
spark = (
    SparkSession.builder
    .config("spark.driver.host", "127.0.0.1")
    .appName("wiki_bias")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [93]:
import os

# Directory holding the corpus CSV files. Defaults to the original location;
# set WIKI_BIAS_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.environ.get(
    "WIKI_BIAS_DATA_DIR",
    "/Users/chriswallerstein/Development/python/wikipedia_bias/data",
)

# List every file under the data directory so the inputs are visible in the output.
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Read all CSV files in the data directory into one DataFrame.
# "DROPMALFORMED" is the parse mode that discards rows that fail to parse;
# the previous value "dropmalforms" is not a recognised mode name.
wiki_data = (
    spark.read
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", "true")
    .option("header", "true")
    .option("multiLine", "true")  # quoted fields may span several lines
    .option("encoding", "UTF-8")
    .csv(os.path.join(DATA_DIR, "*.csv"))
)

/Users/chriswallerstein/Development/python/wikipedia_bias/data/wiki_corpus_2022_2_13.csv


In [94]:
# basic data cleansing
import pyspark.sql.functions as F

@F.udf
def ascii_ignore(x):
    """Return ``x`` with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII are dropped (the "ignore"
    error handler), not replaced. A null input is passed through as null
    rather than raising ``AttributeError`` on ``None.encode``.
    """
    if x is None:
        return None
    return x.encode("ascii", "ignore").decode("ascii")

# Drop incomplete rows (dropna with its default arguments), then add an
# ASCII-only copy of the "sentence" column named "sentence_text".
wiki_data = wiki_data.dropna().withColumn(
    "sentence_text",
    ascii_ignore(F.col("sentence")),
)

In [95]:
# tokenize
from pyspark.ml.feature import RegexTokenizer

# Turn each cleaned sentence into a list of tokens in a new "words" column,
# using the non-word-character regex as the tokenizer pattern.
regex_tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="sentence_text",
    outputCol="words",
)
wiki_data = regex_tokenizer.transform(wiki_data)

# Eyeball a small random sample (about 5% of rows, drawn with replacement).
wiki_data.sample(withReplacement=True, fraction=0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words
0,20220213090531263591,rational,"he also helped give the nazis their reputation for creepy occultism.==early life and views==himmler first grew interested in nazi politics when meeting ernst röhm, who convinced him to join his antisemitic paramilitary group, the bund reichskriegsflagge (imperial war flag society).weale, adrian (2010).","he also helped give the nazis their reputation for creepy occultism.==early life and views==himmler first grew interested in nazi politics when meeting ernst rhm, who convinced him to join his antisemitic paramilitary group, the bund reichskriegsflagge (imperial war flag society).weale, adrian (2010).","[he, also, helped, give, the, nazis, their, reputation, for, creepy, occultism, early, life, and, views, himmler, first, grew, interested, in, nazi, politics, when, meeting, ernst, rhm, who, convinced, him, to, join, his, antisemitic, paramilitary, group, the, bund, reichskriegsflagge, imperial, war, flag, society, weale, adrian, 2010]"
1,20220213090531263640,rational,"himmler joined the nazi party in august 1923. in november, he participated in adolf hitler's beer hall putsch, a failed coup d'état.himmler became obsessed with racial purity.","himmler joined the nazi party in august 1923. in november, he participated in adolf hitler's beer hall putsch, a failed coup d'tat.himmler became obsessed with racial purity.","[himmler, joined, the, nazi, party, in, august, 1923, in, november, he, participated, in, adolf, hitler, s, beer, hall, putsch, a, failed, coup, d, tat, himmler, became, obsessed, with, racial, purity]"
2,20220213090531450833,rational,"of course, pseudolinguistics has ties with other fields such as pseudohistory and pseudoarcheology.== basics of linguistics ==in linguistics, the comparative method is the accepted method to show that languages are related.","of course, pseudolinguistics has ties with other fields such as pseudohistory and pseudoarcheology.== basics of linguistics ==in linguistics, the comparative method is the accepted method to show that languages are related.","[of, course, pseudolinguistics, has, ties, with, other, fields, such, as, pseudohistory, and, pseudoarcheology, basics, of, linguistics, in, linguistics, the, comparative, method, is, the, accepted, method, to, show, that, languages, are, related]"
3,20220213090531451435,rational,a typological evaluation of celtic/hamito-semitic parallels.,a typological evaluation of celtic/hamito-semitic parallels.,"[a, typological, evaluation, of, celtic, hamito, semitic, parallels]"
4,20220213090531451451,rational,"""""""https://linguistics.stackexchange.com/questions/335/are-there-other-pairs-of-languages-that-are-as-close-grammatically-despite-not-b/346#346 answer to are there other pairs of languages that are as close grammatically despite not being in the same language family as korean and japanese?]"""".""\r","""""""https://linguistics.stackexchange.com/questions/335/are-there-other-pairs-of-languages-that-are-as-close-grammatically-despite-not-b/346#346 answer to are there other pairs of languages that are as close grammatically despite not being in the same language family as korean and japanese?]"""".""\r","[https, linguistics, stackexchange, com, questions, 335, are, there, other, pairs, of, languages, that, are, as, close, grammatically, despite, not, b, 346, 346, answer, to, are, there, other, pairs, of, languages, that, are, as, close, grammatically, despite, not, being, in, the, same, language, family, as, korean, and, japanese]"
5,20220213090532317183,wikipedia,"in january 2020, she was named in new zealand's squad for the 2020 icc women's t20 world cup in australia.","in january 2020, she was named in new zealand's squad for the 2020 icc women's t20 world cup in australia.","[in, january, 2020, she, was, named, in, new, zealand, s, squad, for, the, 2020, icc, women, s, t20, world, cup, in, australia]"


In [96]:
# vectorize words
from pyspark.ml.feature import CountVectorizer

# Fit a vocabulary over the token lists and encode every row as a
# term-count vector in a new "features" column.
cv = CountVectorizer(
    outputCol="features",
    inputCol="words",
)
cv_model = cv.fit(wiki_data)
wiki_data = cv_model.transform(wiki_data)

# Eyeball a small random sample (about 5% of rows, drawn with replacement).
wiki_data.sample(withReplacement=True, fraction=0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words,features
0,20220213090531263640,rational,"himmler joined the nazi party in august 1923. in november, he participated in adolf hitler's beer hall putsch, a failed coup d'état.himmler became obsessed with racial purity.","himmler joined the nazi party in august 1923. in november, he participated in adolf hitler's beer hall putsch, a failed coup d'tat.himmler became obsessed with racial purity.","[himmler, joined, the, nazi, party, in, august, 1923, in, november, he, participated, in, adolf, hitler, s, beer, hall, putsch, a, failed, coup, d, tat, himmler, became, obsessed, with, racial, purity]","(1.0, 0.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
1,20220213090531263721,rational,he also gained control of the gestapo from hermann goering.,he also gained control of the gestapo from hermann goering.,"[he, also, gained, control, of, the, gestapo, from, hermann, goering]","(1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
2,20220213090531451143,rational,")** any form of ; theories that sanskrit, hebrew, romanian, or some other historical attested language is the original language of mankind and all others are derived from them.",")** any form of ; theories that sanskrit, hebrew, romanian, or some other historical attested language is the original language of mankind and all others are derived from them.","[any, form, of, theories, that, sanskrit, hebrew, romanian, or, some, other, historical, attested, language, is, the, original, language, of, mankind, and, all, others, are, derived, from, them]","(1.0, 2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
3,20220213090531451154,rational,"*** chief among these is the sun language theory, claiming that language was invented by the turks as a way to convert ritual blathering into a means of meaningful communication.","*** chief among these is the sun language theory, claiming that language was invented by the turks as a way to convert ritual blathering into a means of meaningful communication.","[chief, among, these, is, the, sun, language, theory, claiming, that, language, was, invented, by, the, turks, as, a, way, to, convert, ritual, blathering, into, a, means, of, meaningful, communication]","(2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 2.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)"
4,20220213090531451869,rational,"""the proposed ural-altaic language family (""""turanian"""") is a typical example.""\r","""the proposed ural-altaic language family (""""turanian"""") is a typical example.""\r","[the, proposed, ural, altaic, language, family, turanian, is, a, typical, example]","(1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...)"


In [100]:
# transform label column (source)
from pyspark.ml.feature import StringIndexer

# Map each distinct "source" string to a numeric class index stored in a new
# "label" column, which is the target the classifier is trained on.
si = StringIndexer(
    outputCol="label",
    inputCol="source",
)
si_model = si.fit(wiki_data)
wiki_data = si_model.transform(wiki_data)

# Eyeball a small random sample (about 5% of rows, drawn with replacement).
wiki_data.sample(withReplacement=True, fraction=0.05).toPandas()

Unnamed: 0,id,source,sentence,sentence_text,words,features,label
0,20220213090530841786,conservapedia,"blake david moore (born june 22, 1980) is an american politician and former diplomat from the state of utah.","blake david moore (born june 22, 1980) is an american politician and former diplomat from the state of utah.","[blake, david, moore, born, june, 22, 1980, is, an, american, politician, and, former, diplomat, from, the, state, of, utah]","(1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0
1,20220213090530841821,conservapedia,he is the representative for utah's 1st congressional district in the united states house of representatives.==tenure==he was one of thirty-five republicans to vote for the partisan hack resolution calling for an investigation on capitol hill.https://clerk.house.gov/evs/2021/roll154.xml==references==category:utahcategory:moderate republicanscategory:rinoscategory:united states representativescategory:117th united states congresscategory:republican establishmentcategory:anti-trumpcategory:republican main street partnershipcategory:biden putschcategory:neoconservatives,he is the representative for utah's 1st congressional district in the united states house of representatives.==tenure==he was one of thirty-five republicans to vote for the partisan hack resolution calling for an investigation on capitol hill.https://clerk.house.gov/evs/2021/roll154.xml==references==category:utahcategory:moderate republicanscategory:rinoscategory:united states representativescategory:117th united states congresscategory:republican establishmentcategory:anti-trumpcategory:republican main street partnershipcategory:biden putschcategory:neoconservatives,"[he, is, the, representative, for, utah, s, 1st, congressional, district, in, the, united, states, house, of, representatives, tenure, he, was, one, of, thirty, five, republicans, to, vote, for, the, partisan, hack, resolution, calling, for, an, investigation, on, capitol, hill, https, clerk, house, gov, evs, 2021, roll154, xml, references, category, utahcategory, moderate, republicanscategory, rinoscategory, united, states, representativescategory, 117th, united, states, congresscategory, republican, establishmentcategory, anti, trumpcategory, republican, main, street, partnershipcategory, biden, putschcategory, neoconservatives]","(3.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0
2,20220213090531026557,conservapedia,"westminster choir college is a small college of 330 undergraduates located in princeton, new jersey, close to princeton university.","westminster choir college is a small college of 330 undergraduates located in princeton, new jersey, close to princeton university.","[westminster, choir, college, is, a, small, college, of, 330, undergraduates, located, in, princeton, new, jersey, close, to, princeton, university]","(0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0
3,20220213090531026603,conservapedia,westminster choir college is part of rider university.category:colleges,westminster choir college is part of rider university.category:colleges,"[westminster, choir, college, is, part, of, rider, university, category, colleges]","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0


In [102]:
# Keep only the model inputs and hold out roughly 10% of the rows for testing.
# A fixed seed makes the split, and every metric computed from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = wiki_data.select("features", "label").randomSplit(
    [0.9, 0.1], seed=SPLIT_SEED
)
print(train.count())
print(test.count())

140
14


In [116]:
# Try Logistic regression
from pyspark.ml.classification import LogisticRegression

# Train a logistic regression classifier on the training split; the column
# names below are the estimator's defaults, spelled out for the reader.
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train)

# Score the held-out rows and preview the first five predictions.
lr_predictions = lr_model.transform(test)
lr_predictions.limit(5).toPandas()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(3.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",2.0,"[6.0180550513825715, -2.1288893500554638, -3.8891657013271077]","[0.9996606825191889, 0.00028952069548830955, 4.979678532262728e-05]",0.0
1,"(3.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[19.229031174473352, -9.130993920159074, -10.098037254314276]","[0.9999999999993343, 4.823892208271229e-13, 1.834070438506885e-13]",0.0
2,"(6.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",1.0,"[0.9636461588983751, 6.219928043979082, -7.183574202877459]","[0.005187598035133194, 0.9948108999526292, 1.5020122377173194e-06]",1.0
3,"(5.0, 3.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[14.177052647213666, -5.9923531018555884, -8.184699545358077]","[0.9999999980657697, 1.739956453071621e-09, 1.9427388104280267e-10]",0.0
4,"(1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0,"[5.307075217857534, -2.0466873750703427, -3.2603878427871917]","[0.9991703153081712, 0.0006396479430185538, 0.00019003674881021794]",0.0


In [117]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out predictions with F1. The metric is named explicitly so
# the printed label cannot drift from what is computed, and the message now
# names the model that was actually trained (logistic, not linear, regression).
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print(f"Logistic regression f1 {evaluator.evaluate(lr_predictions):.4f}")

Linear regression f1 0.8947


In [118]:
# Report the model's metrics on the data it was fitted on. These are
# training-set figures, so they say nothing about generalisation; compare
# them with the held-out F1 computed from `lr_predictions`.
training_summary = lr_model.summary

# Per-label metrics, printed in label-index order.
per_label_metrics = [
    ("False positive rate", training_summary.falsePositiveRateByLabel),
    ("True positive rate", training_summary.truePositiveRateByLabel),
    ("Precision", training_summary.precisionByLabel),
    ("Recall", training_summary.recallByLabel),
    ("F-measure", training_summary.fMeasureByLabel()),
]
for metric_name, values in per_label_metrics:
    print(f"{metric_name} by label:")
    for label_index, value in enumerate(values):
        print(f"label {label_index}: {value}")

# Overall accuracy plus the weighted aggregate metrics across labels.
print(f"Accuracy: {training_summary.accuracy}")
print(f"FPR: {training_summary.weightedFalsePositiveRate}")
print(f"TPR: {training_summary.weightedTruePositiveRate}")
print(f"F-measure: {training_summary.weightedFMeasure()}")
print(f"Precision: {training_summary.weightedPrecision}")
print(f"Recall: {training_summary.weightedRecall}")



False positive rate by label:
label 0: 0.0
label 1: 0.0
label 2: 0.0
True positive rate by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
F-measure by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
Accuracy: 1.0
FPR: 0.0
TPR: 1.0
F-measure: 1.0
Precision: 1.0
Recall: 1.0
