In [56]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import Normalizer
from pyspark.ml import Pipeline

spark = SparkSession.builder \
.appName('code_6_of_10_data_mine_giuseppe_schintu') \
.master('local[*]') \
.config('spark.sql.execution.arrow.pyspark.enabled', True) \
.config('spark.sql.session.timeZone', 'UTC') \
.config('spark.driver.memory','8G') \
.config('spark.ui.showConsoleProgress', True) \
.config('spark.sql.repl.eagerEval.enabled', True) \
.getOrCreate()

#may use in other places
rnd_seed = 42

#=>DataFrame from .csv
train_df = spark.read.csv("train.csv", header=True, inferSchema=True, escape='"', sep=",")

train_df.cache()

train_df.show(10, truncate=50)

train_df.select('author').distinct().show()


# Step 1: Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
# Step 2: Stop word removal
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
'''
Step.1 replace "text" in Tokenizer(inputCol="text", outputCol="tokens")
with the actual column name from your CSV file that contains the text data.
'''


23/07/13 11:56:25 WARN CacheManager: Asked to cache already cached data.


+-------+--------------------------------------------------+------+
|     id|                                              text|author|
+-------+--------------------------------------------------+------+
|id26305|This process, however, afforded me no means of ...|   EAP|
|id17569|It never once occurred to me that the fumbling ...|   HPL|
|id11008|In his left hand was a gold snuff box, from whi...|   EAP|
|id27763|How lovely is spring As we looked from Windsor ...|   MWS|
|id12958|Finding nothing else, not even gold, the Superi...|   HPL|
|id22965|A youth passed in solitude, my best years spent...|   MWS|
|id09674|The astronomer, perhaps, at this point, took re...|   EAP|
|id13515|       The surcingle hung in ribands from my body.|   EAP|
|id19322|I knew that you could not say to yourself 'ster...|   EAP|
|id00912|I confess that neither the structure of languag...|   MWS|
+-------+--------------------------------------------------+------+
only showing top 10 rows

+------+
|author|
+---

'\nStep.1 replace "text" in Tokenizer(inputCol="text", outputCol="tokens")\nwith the actual column name from your CSV file that contains the text data.\n'

### => `FEATURE EXTRACTION`

In [57]:
# Step 3: TF-IDF calculation
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorized_tokens", vocabSize=500)
idf = IDF(inputCol="vectorized_tokens", outputCol="tfidf")
# Step 4: Normalization
normalizer = Normalizer(inputCol="tfidf", outputCol="normalized_features")
# Step 5: Create pipeline for chaining the text mining transformers
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, vectorizer, idf, normalizer])
# Step 6: Apply the pipeline to DataFrame
processed_data = pipeline.fit(train_df).transform(train_df)

processed_data.show(5, truncate=45)


                                                                                

+-------+---------------------------------------------+------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+
|     id|                                         text|author|                                       tokens|                              filtered_tokens|                            vectorized_tokens|                                        tfidf|                          normalized_features|
+-------+---------------------------------------------+------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+
|id26305|This process, however, afforded me no mean...|   EAP|[this, process,, however,, afforded, me, n...|[process,, ho

'\nstep.4 The processed_data object will contain the final processed features in the\n"normalized_features" column. use for machine learning tasks.\nStep.6 replace your_dataframe with the name of your DataFrame that holds the CSV data.\n'

### => user-defined lemmatize function

In [58]:
# Tokenize sentence (lemmatize words, remove stop words and punctuations, strip off html and digits)
# Returns a struct of tokens, tokens count, punctuation count

import string
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, ArrayType, IntegerType, StringType

wnl = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))
list_punct = set(string.punctuation)
url_pattern = re.compile(r'https?.+|[^(a-zA-Z)(0-9)\s]')
number_pattern = re.compile(r'\d+')

def lemmatize(text):
    """
    param: sentence
    return: tokens, tokens count, punctuation count
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    
    punct_count = text.translate(punctuation_table).count('')
    
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    
    # remove urls
    text = url_pattern.sub(' ', text)
    # remove numbers
    text = number_pattern.sub(' ', text)
    
    words = text.split()
    # remove stopwords and strings of length <= 2
    words = [wnl.lemmatize(word) for word in words if word not in stopwords_set and len(word) > 2]
    word_count = len(words)
    
    return words, word_count, punct_count


# Register lemmatizer as an UDF
lemma_schema = StructType([
    StructField("words", ArrayType(StringType()), False),
    StructField("word_count", IntegerType(), False),
    StructField("punct_count", IntegerType(), False)
])

udf_lemmatize = F.udf(lemmatize, lemma_schema)

In [59]:
# Use the UDF to extract the tokens and count features
lemma_train_df = (train_df
 .withColumn('lemmatize', udf_lemmatize('text'))
)

lemma_train_df.cache()

lemma_train_df = lemma_train_df.select(F.col("text"),
                    F.col("author"), 
                    F.col("lemmatize.words").alias("words"),
                    F.col("lemmatize.word_count").alias("word_count"),
                    F.col("lemmatize.punct_count").alias("punct_count")
                   ).cache()

lemma_train_df.show(5, truncate=30)

+------------------------------+------+------------------------------+----------+-----------+
|                          text|author|                         words|word_count|punct_count|
+------------------------------+------+------------------------------+----------+-----------+
|This process, however, affo...|   EAP|[process, however, afforded...|        21|        225|
|It never once occurred to m...|   HPL|[never, occurred, fumbling,...|         6|         71|
|In his left hand was a gold...|   EAP|[left, hand, gold, snuff, b...|        19|        196|
|How lovely is spring As we ...|   MWS|[lovely, spring, looked, wi...|        21|        203|
|Finding nothing else, not e...|   HPL|[finding, nothing, else, ev...|        16|        171|
+------------------------------+------+------------------------------+----------+-----------+
only showing top 5 rows



23/07/13 11:56:35 WARN CacheManager: Asked to cache already cached data.
23/07/13 11:56:35 WARN CacheManager: Asked to cache already cached data.


### => `Convert BoW to Word2Vec Vector`

BoW is a simpler technique that represents each word as a vector in a high-dimensional space, where each dimension corresponds to a unique word in the corpus vocabulary. The value at each dimension reflects the count or frequency of the corresponding word in a specific document.

Word2Vec, on the other hand, uses neural networks to learn word associations from a large corpus of text. Word2Vec provides word embeddings that capture a large number of precise syntactic and semantic word relationships

In [60]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=100, 
                    minCount=1, 
                    inputCol="words", 
                    outputCol="w2v_features",
                    maxIter=20,
                    seed=rnd_seed)

w2vec_model = word2Vec.fit(lemma_train_df)

w2v_train_df = w2vec_model.transform(lemma_train_df)

w2v_train_df.cache()

w2v_train_df.show(truncate=25)


                                                                                

+-------------------------+------+-------------------------+----------+-----------+-------------------------+
|                     text|author|                    words|word_count|punct_count|             w2v_features|
+-------------------------+------+-------------------------+----------+-----------+-------------------------+
|This process, however,...|   EAP|[process, however, aff...|        21|        225|[-0.04504347521634329,...|
|It never once occurred...|   HPL|[never, occurred, fumb...|         6|         71|[-0.022045572909216084...|
|In his left hand was a...|   EAP|[left, hand, gold, snu...|        19|        196|[0.12078562192618847,-...|
|How lovely is spring A...|   MWS|[lovely, spring, looke...|        21|        203|[-0.18042237772828057,...|
|Finding nothing else, ...|   HPL|[finding, nothing, els...|        16|        171|[0.15764092560857534,0...|
|A youth passed in soli...|   MWS|[youth, passed, solitu...|        40|        463|[-0.03963794133160264,...|
|The astro

                                                                                

### => `Append count features to Word2Vec features with VectorAssembler`

Transform and combine a list of specified columns from a DataFrame into a single vector column. This is a common preprocessing step required for many machine learning algorithms on Spark, which often require data to be formatted into a single feature vector

In [61]:
from pyspark.ml.feature import VectorAssembler

feature_cols = ["w2v_features", "word_count", "punct_count"]

# put features into a feature vector column
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

assembled_train_df = vector_assembler.transform(w2v_train_df)

assembled_train_df.cache()

assembled_train_df.select('author', 'features').show(5, truncate=75)

# check the original individual feature columns
print(assembled_train_df.select('w2v_features', 'word_count', 'punct_count').take(1))
print()
# make sure the original individual feature columns are assembled in the correct order
print(assembled_train_df.select('features').take(1))
print()


                                                                                

+------+---------------------------------------------------------------------------+
|author|                                                                   features|
+------+---------------------------------------------------------------------------+
|   EAP|[-0.04504347521634329,-0.22177825301020804,0.07188885051956666,-0.034261...|
|   HPL|[-0.022045572909216084,-0.055779277676871665,-0.07163621050616105,0.1384...|
|   EAP|[0.12078562192618847,-0.07928438121943097,-0.012853829680304777,-0.12579...|
|   MWS|[-0.18042237772828057,-0.08317899735023578,0.012952304418875082,0.172434...|
|   HPL|[0.15764092560857534,0.121783307637088,0.07108729949686676,0.08440010504...|
+------+---------------------------------------------------------------------------+
only showing top 5 rows

[Row(w2v_features=DenseVector([-0.045, -0.2218, 0.0719, -0.0343, -0.0214, -0.0365, 0.0321, -0.0047, -0.0503, 0.0149, -0.1625, -0.1104, 0.0544, 0.0497, -0.1745, -0.0659, 0.039, 0.0585, -0.1463, 0.0686, 0.1095, 0

### => `Transform Author attribute into MultiClass Labels with StringIndexer`

StringIndexer encodes a string column of labels to a column of label indices. The indices are in [0, numLabels], ordered by label frequencies, so the most frequent label gets index 0.
In our case, the author column (Category) will be encoded to label indices, from 0 to 2; the most frequent label (EAP) will be indexed as 0.

In [62]:
from pyspark.ml.feature import StringIndexer

label_indexer = StringIndexer(inputCol = "author", outputCol = "label")

label_indexer_model = label_indexer.fit(assembled_train_df)

labeled_train_df = label_indexer_model.transform(assembled_train_df)

labeled_train_df.cache()

labeled_train_df.select('text', 'author', 'features', 'label').show(10, truncate=50)

+--------------------------------------------------+------+--------------------------------------------------+-----+
|                                              text|author|                                          features|label|
+--------------------------------------------------+------+--------------------------------------------------+-----+
|This process, however, afforded me no means of ...|   EAP|[-0.04504347521634329,-0.22177825301020804,0.07...|  0.0|
|It never once occurred to me that the fumbling ...|   HPL|[-0.022045572909216084,-0.055779277676871665,-0...|  2.0|
|In his left hand was a gold snuff box, from whi...|   EAP|[0.12078562192618847,-0.07928438121943097,-0.01...|  0.0|
|How lovely is spring As we looked from Windsor ...|   MWS|[-0.18042237772828057,-0.08317899735023578,0.01...|  1.0|
|Finding nothing else, not even gold, the Superi...|   HPL|[0.15764092560857534,0.121783307637088,0.071087...|  2.0|
|A youth passed in solitude, my best years spent...|   MWS|[-0.0

## => `Train & Validation Split`

### => `Stratified Sampling with randomSplit`
randomSplit may not always give a perfect stratification as using sampleBy, but it is a better fit for larger datasets

In [63]:
#Stratified Sampling

# The fractions in randomSplit represent the proportion of data to go into each dataset.
labeled_train_df, labeled_validation_df = labeled_train_df.randomSplit([0.75, 0.25], seed=rnd_seed)

# recompute and print the distribution in each DataFrame
train_count = labeled_train_df.count()
valid_count = labeled_validation_df.count()
total_count = train_count + valid_count

train_pctg = train_count / total_count
valid_pctg = valid_count / total_count                                 
print(f'Ratio train:valid={train_pctg}:{valid_pctg}')

# recheck distribution
(labeled_train_df
 .groupBy('label')
 .count()
 .withColumn('%age', F.round(F.col('count') / train_count, 2))
 .show())

(labeled_validation_df
 .groupBy('label')
 .count()
 .withColumn('%age', F.round(F.col('count') / valid_count, 2))
 .show())


'''

(labeled_train_df
 .groupBy('label')
 .count()
 .withColumn('%age', F.round(F.col('count') / labeled_train_df.count(), 2))
 .show())

# specify the exact fraction desired from each key as a dictionary
fractions = {0: 0.25, 1: 0.25, 2: 0.25}
# create the validation set with 25% of the entire data and same distribution of author labels
labeled_validation_df = labeled_train_df.stat.sampleBy('label', fractions, seed=rnd_seed).cache()
# subtract the validation set from the original training set to get 75% of the entire data 
# and same distribution of author labels
labeled_train_df = labeled_train_df.subtract(labeled_validation_df).cache()
labeled_train_df.cache()
labeled_validation_df.cache()

total_count = labeled_train_df.count() + labeled_validation_df.count()

train_pctg = labeled_train_df.count() / total_count
valid_pctg = labeled_validation_df.count() / total_count                                 
print(f'Ratio train:valid={train_pctg}:{valid_pctg}')

#recheck distribution
(labeled_train_df
 .groupBy('label')
 .count()
 .withColumn('%age', F.round(F.col('count') / labeled_train_df.count(), 2))
 .show())

'''


Ratio train:valid=0.7517748608202666:0.24822513917973338
+-----+-----+----+
|label|count|%age|
+-----+-----+----+
|  0.0| 5906| 0.4|
|  1.0| 4548|0.31|
|  2.0| 4265|0.29|
+-----+-----+----+

+-----+-----+----+
|label|count|%age|
+-----+-----+----+
|  0.0| 1994|0.41|
|  1.0| 1496|0.31|
|  2.0| 1370|0.28|
+-----+-----+----+



"\n\n(labeled_train_df\n .groupBy('label')\n .count()\n .withColumn('%age', F.round(F.col('count') / labeled_train_df.count(), 2))\n .show())\n\n# specify the exact fraction desired from each key as a dictionary\nfractions = {0: 0.25, 1: 0.25, 2: 0.25}\n# create the validation set with 25% of the entire data and same distribution of author labels\nlabeled_validation_df = labeled_train_df.stat.sampleBy('label', fractions, seed=rnd_seed).cache()\n# subtract the validation set from the original training set to get 75% of the entire data \n# and same distribution of author labels\nlabeled_train_df = labeled_train_df.subtract(labeled_validation_df).cache()\nlabeled_train_df.cache()\nlabeled_validation_df.cache()\n\ntotal_count = labeled_train_df.count() + labeled_validation_df.count()\n\ntrain_pctg = labeled_train_df.count() / total_count\nvalid_pctg = labeled_validation_df.count() / total_count                                 \nprint(f'Ratio train:valid={train_pctg}:{valid_pctg}')\n\n#rech

### => `Train Logistic Regression`

In [64]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', 
                        labelCol='label', 
                        predictionCol='prediction',
                        probabilityCol='probability',
                        maxIter=20, 
                        regParam=0.3, 
                        elasticNetParam=0)

lrModel = lr.fit(labeled_train_df)

train_preds_labels = lrModel.transform(labeled_train_df)
validation_preds_labels = lrModel.transform(labeled_validation_df)

train_preds_labels.cache()
validation_preds_labels.cache()

train_preds_labels.select('text', 'author', 'features', 'label', 'probability', 'prediction').show(50)

+--------------------+------+--------------------+-----+--------------------+----------+
|                text|author|            features|label|         probability|prediction|
+--------------------+------+--------------------+-----+--------------------+----------+
|" Odenheimer, res...|   EAP|[0.01016970165073...|  0.0|[0.55332933137401...|       0.0|
|"'But,' says L'Et...|   EAP|[0.01173311382132...|  0.0|[0.69874174985508...|       0.0|
|"'Excellent man I...|   MWS|[0.43706783900658...|  1.0|[0.53730077941483...|       0.0|
|"'Heaven forbid E...|   MWS|[0.10945655529697...|  1.0|[0.20406489925011...|       1.0|
|"'How can I thank...|   MWS|[0.34089323746350...|  1.0|[0.23486886884413...|       1.0|
|"'They are kind t...|   MWS|[0.35700334608554...|  1.0|[0.35715926232373...|       1.0|
|"A considerable p...|   MWS|[0.02229289178337...|  1.0|[0.40276271338866...|       1.0|
|"A death's head" ...|   EAP|[0.11090118471871...|  0.0|[0.78785829808840...|       0.0|
|"A few days after...

## => `Training & Validation Accuracy`

In [65]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
train_f1_score = evaluator.evaluate(train_preds_labels)
validation_f1_score = evaluator.evaluate(validation_preds_labels)

print(f'Training F1 Score   :{train_f1_score}')
print(f'Validation F1 Score :{validation_f1_score}')

train_summary = lrModel.evaluate(labeled_train_df)
validation_summary = lrModel.evaluate(labeled_validation_df)


print(f'Training Accuracy   :{train_summary.accuracy}')
print(f'Validation Accuracy :{validation_summary.accuracy}')


# F1 score by label
train_summary.fMeasureByLabel(beta=1.0), validation_summary.fMeasureByLabel(beta=1.0)

# Precision by label
train_summary.precisionByLabel, validation_summary.precisionByLabel

# Recall by label
train_summary.recallByLabel, validation_summary.recallByLabel

                                                                                

Training F1 Score   :0.764780618882283
Validation F1 Score :0.7706831769016816
Training Accuracy   :0.7647938039268972
Validation Accuracy :0.7707818930041153


([0.7941076870978666, 0.7537379067722075, 0.7359906213364595],
 [0.8014042126379137, 0.7540106951871658, 0.7445255474452555])

## => `Test Predictions`

In [66]:
from pyspark.ml.functions import vector_to_array


test_df = spark.read.csv('test.csv', sep=',', escape='"', header=True)
test_df.cache()

# Use the UDF to extract the tokens and count features
lemma_test_df = (test_df
 .withColumn('lemmatize', udf_lemmatize('text'))
)

lemma_test_df.cache()

lemma_test_df = lemma_test_df.select(F.col("id"),
                    F.col("text"),
                    F.col("lemmatize.words").alias("words"),
                    F.col("lemmatize.word_count").alias("word_count"),
                    F.col("lemmatize.punct_count").alias("punct_count")
                   ).cache()
w2v_test_df = w2vec_model.transform(lemma_test_df)
w2v_test_df.cache()

w2v_test_df.show(truncate=25)

assembled_test_df = vector_assembler.transform(w2v_test_df)
assembled_test_df.cache()

test_preds = lrModel.transform(assembled_test_df)
test_preds.cache()

test_preds.select('id', 'text', 'features', 'probability', 
                  F.col('prediction').cast('integer').alias('prediction')).show(10)





23/07/13 11:58:56 WARN CacheManager: Asked to cache already cached data.
23/07/13 11:58:56 WARN CacheManager: Asked to cache already cached data.
23/07/13 11:58:56 WARN CacheManager: Asked to cache already cached data.


+-------+-------------------------+-------------------------+----------+-----------+-------------------------+
|     id|                     text|                    words|word_count|punct_count|             w2v_features|
+-------+-------------------------+-------------------------+----------+-----------+-------------------------+
|id02310|Still, as I urged our ...|[still, urged, leaving...|        10|        108|[-0.08468285130802543,...|
|id24541|If a fire wanted fanni...|[fire, wanted, fanning...|        28|        324|[0.032250970984543006,...|
|id00134|And when they had brok...|[broken, frail, door, ...|        17|        187|[0.07540232159526032,-...|
|id27757|While I was thinking h...|[thinking, possibly, m...|        20|        219|[0.00533018559217453,-...|
|id04081|I am not sure to what ...|[sure, limit, knowledg...|         5|         53|[0.08088463377207518,-...|
|id27337|"The thick and peculia...|[thick, peculiar, mist...|        17|        195|[0.13549282774329185,0...|
|

[Stage 1010:>                                                       (0 + 1) / 1]

+-------+--------------------+--------------------+--------------------+----------+
|     id|                text|            features|         probability|prediction|
+-------+--------------------+--------------------+--------------------+----------+
|id02310|Still, as I urged...|[-0.0846828513080...|[0.17227477585331...|         1|
|id24541|If a fire wanted ...|[0.03225097098454...|[0.47584078816432...|         0|
|id00134|And when they had...|[0.07540232159526...|[0.51718163434267...|         0|
|id27757|While I was think...|[0.00533018559217...|[0.39438913371620...|         2|
|id04081|I am not sure to ...|[0.08088463377207...|[0.83145845015370...|         0|
|id27337|"The thick and pe...|[0.13549282774329...|[0.41786481270991...|         2|
|id24265|That which is not...|[0.34294747561216...|[0.48872744201990...|         0|
|id25917|I sought for repo...|[0.02436212593844...|[0.16193972552752...|         1|
|id04951|Upon the fourth d...|[-0.0455556112408...|[0.73126790199237...|    

                                                                                

### => `Show probabilty by Author with Test Data`

In [67]:

test_preds_by_author_df = (test_preds
    .withColumn("probas", vector_to_array("probability"))
    .select(F.col('id'),
            F.col('probas')[0].alias('EAP'), 
            F.col('probas')[1].alias('MWS'), 
            F.col('probas')[2].alias('HPL'))
    ).cache()


test_preds_by_author_df.show(5, truncate=False)




+-------+-------------------+-------------------+-------------------+
|id     |EAP                |MWS                |HPL                |
+-------+-------------------+-------------------+-------------------+
|id02310|0.17227477585331627|0.7416188373929196 |0.08610638675376436|
|id24541|0.4758407881643216 |0.22428933408563767|0.29986987775004076|
|id00134|0.517181634342671  |0.12632207649333665|0.3564962891639923 |
|id27757|0.3943891337162043 |0.06612268227934405|0.5394881840044516 |
|id04081|0.831458450153707  |0.10452386214586175|0.06401768770043126|
+-------+-------------------+-------------------+-------------------+
only showing top 5 rows



### => `Show Good(green) and Bad(red) Predictions on Train and Test data`

In [68]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import pandas as pd

author_map = {0: 'EAP', 1: 'MWS', 2: 'HLP'} 

# Define a UDF to map the label id back to author's name
def map_to_author(label_id):
    return author_map[label_id]

map_to_author_udf = udf(map_to_author, StringType())

# Apply the UDF to the 'prediction' column in train_preds_labels and test_preds
train_preds_labels = train_preds_labels.withColumn('predicted_author', map_to_author_udf('prediction'))
test_preds = test_preds.withColumn('predicted_author', map_to_author_udf('prediction'))

# Show the 'text', 'author', 'predicted_author' etc. to compare the predictions
train_preds_labels.select('text', 'author', 'features', 'label', 'probability', 'prediction', 'predicted_author').show(10)
test_preds.select('id', 'text', 'features', 'probability', 'prediction', 'predicted_author').show(10)


# Convert PySpark DataFrame to Pandas DataFrame
train_preds_labels_pd = train_preds_labels.select('text', 'author', 'label', 'prediction', 'predicted_author').toPandas()

# Function to color the mismatch in 'red', else 'black'
def color_mismatch(data):
    color = 'red' if data['author'] != data['predicted_author'] else 'green'
    return ['color: %s' % color]*len(data.values)

train_preds_labels_pd.head(100).style.apply(color_mismatch, axis=1)


+--------------------+------+--------------------+-----+--------------------+----------+----------------+
|                text|author|            features|label|         probability|prediction|predicted_author|
+--------------------+------+--------------------+-----+--------------------+----------+----------------+
|" Odenheimer, res...|   EAP|[0.01016970165073...|  0.0|[0.55332933137401...|       0.0|             EAP|
|"'But,' says L'Et...|   EAP|[0.01173311382132...|  0.0|[0.69874174985508...|       0.0|             EAP|
|"'Excellent man I...|   MWS|[0.43706783900658...|  1.0|[0.53730077941483...|       0.0|             EAP|
|"'Heaven forbid E...|   MWS|[0.10945655529697...|  1.0|[0.20406489925011...|       1.0|             MWS|
|"'How can I thank...|   MWS|[0.34089323746350...|  1.0|[0.23486886884413...|       1.0|             MWS|
|"'They are kind t...|   MWS|[0.35700334608554...|  1.0|[0.35715926232373...|       1.0|             MWS|
|"A considerable p...|   MWS|[0.02229289178337

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Unnamed: 0,text,author,label,prediction,predicted_author
0,""" Odenheimer, restaurateur.",EAP,0.0,0.0,EAP
1,"""'But,' says L'Etoile, 'if the body had been kept in its mangled state on shore until Tuesday night, some trace would be found on shore of the murderers.'",EAP,0.0,0.0,EAP
2,"""'Excellent man I thank you and accept your generous offer.",MWS,1.0,0.0,EAP
3,"""'Heaven forbid Even if you were really criminal, for that can only drive you to desperation, and not instigate you to virtue.",MWS,1.0,1.0,MWS
4,"""'How can I thank you, my best and only benefactor?",MWS,1.0,1.0,MWS
5,"""'They are kind they are the most excellent creatures in the world; but, unfortunately, they are prejudiced against me.",MWS,1.0,1.0,MWS
6,"""A considerable period elapsed before I discovered one of the causes of the uneasiness of this amiable family: it was poverty, and they suffered that evil in a very distressing degree.",MWS,1.0,1.0,MWS
7,"""A death's head"" echoed Legrand ""Oh yes well, it has something of that appearance upon paper, no doubt.",EAP,0.0,0.0,EAP
8,"""A few days after, the Turk entered his daughter's apartment and told her hastily that he had reason to believe that his residence at Leghorn had been divulged and that he should speedily be delivered up to the French government; he had consequently hired a vessel to convey him to Constantinople, for which city he should sail in a few hours.",MWS,1.0,1.0,MWS
9,"""A friend,"" replied Raymond in the same dialect.",MWS,1.0,1.0,MWS


In [None]:
#spark.stop()