# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [4]:
%pip install -q -U boto3 hvac mlflow numpy "pyspark==3.5.0" python-dotenv "pymongo[srv]"

Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import hvac
import mlflow
import numpy as np

from dotenv import load_dotenv
from datetime import datetime, timedelta

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Bucketizer, StringIndexer, VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

from mlflow.models import infer_signature

load_dotenv()

True

### Connect to Vault for Mongo connection values

In [6]:
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

print(client.is_authenticated())

if client.is_authenticated():
    try:
        secret_resp = client.secrets.kv.v2.read_secret_version(
            mount_point='kv', 
            path='f5.news', 
            raise_on_deleted_version=False
        )
        
        if secret_resp['data'] is not None:
            secret_values = secret_resp['data']['data']
            for secret, value in secret_values.items():
                os.environ[str(secret)] = str(value)
        else:
            print("The secret does not exist.")
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        print(f"Vault error occurred: {e}")
else:
    print("Failed to connect to HashiVault")

True


### Configs

In [7]:
# General
DEBUG = False
MODE = "local" # Supported -- local OR cluster
REG_PARAM_VALUE = 0.1 # Experimenting with this value can improve final accuracy
MAX_ITER = 20
DATASET_SPLIT = [0.9, 0.1] # Portion of data to split between training and test datasets
SAMPLE_TITLE = "Trump Says Some Migrants Are ‘Not People’ and Predicts a ‘Blood Bath’ if He Loses"

# Spark
SPARK_MASTER = "spark://localhost:7077"
SPARK_MEMORY = "4g"
os.environ["PYSPARK_PIN_THREAD"] = "false" # TODO: Move to .env

# Mongo
URI = os.environ['mongo_uri']
DATABASE = os.environ['database']
COLLECTION = os.environ['collection']

# MLflow
MLFLOW_API = "http://localhost:5000"
MODEL_NAME = "f5news_upvote_bucket_prediction"
EXPERIMENT_NAME = "f5news_upvote_bucket_prediction"

# Minio S3
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://localhost:9000"
os.environ['AWS_ACCESS_KEY_ID'] = "minio"
os.environ['AWS_SECRET_ACCESS_KEY'] = "minio123" # TODO: Move all of these to .env

### Pull F5 records using pymongo client

In [8]:
if DEBUG:
    # Create a new client and connect to the MongoDB server
    client = MongoClient(URI, server_api=ServerApi('1'))

    # Send a ping to confirm a successful connection
    try:
        client.admin.command('ping')
        print("Successfully connected to MongoDB...")
    except Exception as e:
        print(e)

    try:
        database = client[DATABASE]
        collection = database[COLLECTION]

        # Query all documents in the collection
        documents = collection.find({"sub": "politics"}).sort({"upvoteCount": -1, "fetchedAt": -1})

        if(DEBUG == True):
            # Iterate over the cursor to access the documents
            for doc in documents:
                print(doc["title"])
                print(doc["fetchedAt"])
                print(doc["upvoteCount"], "upvotes")
                print()
        else:
            print("Mongo documents loaded successfully!")
    except Exception as e:
        print(e)

Successfully connected to MongoDB...
Joe Biden suddenly leads Donald Trump in multiple polls
2024-03-12 13:44:05.327000
42643 upvotes

Former Vice President Mike Pence will not endorse Trump for president
2024-03-16 19:49:05.432000
41067 upvotes

Biden cancels $6 billion in student loan debt for 78,000 public service workers
2024-03-22 12:34:05.723000
35657 upvotes

Nikki Haley Donors Switch to Joe Biden Over Donald Trump
2024-03-21 13:34:05.421000
33239 upvotes

Republicans reject motion to impeach Joe Biden
2024-03-21 16:54:05.358000
32646 upvotes

Trump crowd goes silent as he confuses Biden and Obama again
2024-03-04 16:44:05.262000
31825 upvotes

Trump has been unable to get bond for $464 million judgment, his lawyers say
2024-03-19 13:19:05.533000
29436 upvotes

Biden Just Delivered a Top Career Performance. He Needed It.
2024-03-09 14:59:05.814000
29299 upvotes

Republican Rep. Mike Gallagher will resign early, leaving House majority hanging by a thread
2024-03-23 18:24:05.03800

### Setup MLflow runner

In [9]:
global_run_name = None
global_run_id = None
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Set MLflow configs
mlflow.set_tracking_uri(MLFLOW_API)
mlflow.set_experiment(EXPERIMENT_NAME)

def start_mlflow_run(run_name: str = None):
    global global_run_name, global_run_id, start_time
    if run_name is None:
        run_name = start_time
    else:
        run_name = run_name + start_time
    global_run_name = run_name
    run = mlflow.start_run(run_name=run_name, description=EXPERIMENT_NAME)
    global_run_id = run.info.run_id

2024/03/24 19:12:34 INFO mlflow.tracking.fluent: Experiment with name 'f5news_upvote_bucket_prediction' does not exist. Creating a new experiment.


### Connect to Spark

In [10]:
# Create MLflow Run Instance
try:
    mlflow.end_run()
except:
    pass

start_mlflow_run()

# Log parameters
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
mlflow.log_param("start_time", start_time)

try:
    if MODE == "local":
        print("Starting Spark in local mode")
        spark = SparkSession.builder \
            .appName("F5news") \
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.mlflow:mlflow-spark:2.8.1") \
            .getOrCreate()
    elif MODE == "cluster":
        print("Starting Spark in cluster mode")
        spark = SparkSession.builder \
            .appName("F5news") \
            .master(SPARK_MASTER) \
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.mlflow:mlflow-spark:2.8.1") \
            .getOrCreate()
    
    # Setup Spark AutoLog
    mlflow.autolog()

    # Get Spark version
    spark_version = spark.version
    print("Spark Version:", spark_version)

    # Check if the master URL indicates local mode or a specific cluster mode
    sc = spark.sparkContext
    master_url = sc.master
    
    if "local" and not "localhost" in master_url:
        print("PySpark is running in local mode.")
    else:
        print("PySpark is running in cluster mode with master URL:", master_url)

except Exception as e:
    print("Error:", str(e))

    # Stop SparkSession
    spark.stop()

    # End MLflow run
    mlflow.end_run()

Starting Spark in local mode
:: loading settings :: url = jar:file:/home/mgmtadmin/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/mgmtadmin/.ivy2/cache
The jars for the packages stored in: /home/mgmtadmin/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
org.mlflow#mlflow-spark added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e867f2f7-096e-45be-9c7d-295501ccfcbd;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
	found org.mlflow#mlflow-spark;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.25 in central
:: resolution report :: resolve 83ms :: artifacts dl 3ms
	:: modules in use:
	org.mlflow#mlflow-spark;2.8.1 from central in [default]
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.sp

Spark Version: 3.5.0
PySpark is running in local mode.


### Load MongoDB as Dataset

In [11]:
# Declare Schema
schema = StructType([
    StructField("title", StringType(), nullable=True), # model input
    StructField("upvoteCount", DoubleType(), nullable=True), # used to bucketize for training
    StructField("fetchedAt", TimestampType(), nullable=True) # used to filter recent events
])

# Load data from MongoDB into a DataFrame
df = spark.read \
    .format("mongo") \
    .option("uri", URI) \
    .option("database", DATABASE) \
    .option("collection", COLLECTION) \
    .schema(schema) \
    .load()

print("Data loaded successfully from MongoDB!")

Data loaded successfully from MongoDB!


### Show Loaded Data

In [12]:
if DEBUG:
    df.show(5,truncate=False)

+------------------------------------------------------------------------------------------------+-----------+-----------------------+
|title                                                                                           |upvoteCount|fetchedAt              |
+------------------------------------------------------------------------------------------------+-----------+-----------------------+
|Donald Trump found to have fraudulently boosted value of Scots homes by up to £200m             |21842.0    |2024-03-01 23:44:19.914|
|LGBTQ group sues to block Texas AG Paxton's request for records about transgender children      |4610.0     |2024-03-01 23:39:19.997|
|Michigan communities will share a record $87M in marijuana tax revenue                          |6265.0     |2024-03-02 00:24:22.398|
|IRS launches crackdown on 125,000 wealthy ‘non-filers’                                          |20920.0    |2024-03-02 14:34:20.029|
|Blockbuster California storm to deliver crushing blow 

                                                                                

### Filter Out Recent Posts

In [13]:
# Get document initial count
print('Documents Loaded:', df.count())
mlflow.log_param("loaded_documents", df.count())

# Filter out new posts
oneDayAgo = d = datetime.today() - timedelta(days=1)
df = df.filter(df.fetchedAt < oneDayAgo)
print('Total Filtered Documents:', df.count())

mlflow.log_param("filtered_documents", df.count())

Documents Loaded: 8208
Total Filtered Documents: 7903


7903

### Preview a Random Sample of Bucketized Dataset

In [14]:
if DEBUG:
    sample_count = 10
    pandas_random_sample = df.toPandas().sample(n=sample_count) # Convert to pandas dataframe to take sample
    pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # Convert back to pyspark dataframe
    pyspark_random_sample.show()
    df.printSchema()

+--------------------+-----------+--------------------+
|               title|upvoteCount|           fetchedAt|
+--------------------+-----------+--------------------+
|Anti-Kremlin Russ...|    12472.0|2024-03-13 12:04:...|
|Trump says there ...|     1328.0|2024-03-17 17:39:...|
|Armenia moves to ...|     2445.0|2024-03-09 16:14:...|
|Massive $1.2 tril...|       38.0|2024-03-21 20:39:...|
|Gaza hostage deal...|       58.0|2024-03-12 22:34:...|
|A Bolder American...|       16.0|2024-03-12 13:14:...|
|Russia systematic...|      637.0|2024-03-15 20:19:...|
|Hamas delegation ...|     1011.0|2024-03-04 03:44:...|
|Super Tuesday’s W...|        0.0|2024-03-06 14:24:...|
|Maddening New Pol...|    23939.0|2024-03-06 11:44:...|
+--------------------+-----------+--------------------+

root
 |-- title: string (nullable = true)
 |-- upvoteCount: double (nullable = true)
 |-- fetchedAt: timestamp (nullable = true)



In [15]:
# Bucketizer
bucketizer = Bucketizer(splits=[0, 1000, 5000, 10000, 25000, 50000, float('inf')], inputCol="upvoteCount", outputCol="bucket")
df = bucketizer.transform(df)

### Drop unnecessary columns before going into model

In [16]:
df.drop('upvoteCount')
df.drop('fetchedAt')

DataFrame[title: string, upvoteCount: double, bucket: double]

### Split dataset into training and test

In [17]:

(trainingData, testData) = df.randomSplit(DATASET_SPLIT, seed = 123456)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

mlflow.log_metric("trainingData", trainingData.count())
mlflow.log_metric("testData", testData.count())

Training Dataset Count: 7127
Test Dataset Count: 776


### Define Data Prep Pipeline Steps

- **Regular Expression Tokenizer**: Breaks title into array of words via regex
- **Stop Words Remover**: Removes undesireable words from Regex Tokenizer output
- **Bag of Words Counter**: Creates vector representation of the array of words extracted from original title string
- **Create Label**: Maps all possible values in bucket columns to numeric values (their index position in an array of unique bucket values)

In [18]:


# Tokenizer
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\\W")

# StopWordsRemover
add_stopwords = ["http","https","amp","reddit","subreddit"] # TODO: Update stopwords to match dataset
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# CountVectorizer
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=30000, minDF=5)

# Init linear regression model with column names
lr = LogisticRegression(featuresCol="features", labelCol="bucket")

# Create ParamGrid for Cross Validation
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
    .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
    .addGrid(lr.maxIter, [10, 20, 50]) #Number of iterations
    .build()
)

# define evaluator for cross validator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

# Create 5-fold CrossValidator
#cv = CrossValidator(
#    estimator=lr, \
#    estimatorParamMaps=paramGrid, \
#    evaluator=evaluator, \
#    numFolds=5
#)


### Assemble Data Prep Pipeline

Creates the `features` columns. We split titles to words, remove the words we don't want, vectorize the resulting array of words, then label based on bucket column.

In [19]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, lr])

### Run the Data Prep Pipeline

In [20]:
model = pipeline.fit(trainingData)
dataset = model.transform(trainingData)
dataset.show(10)

24/03/24 19:12:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/03/24 19:12:49 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$Pipeline

+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               title|upvoteCount|           fetchedAt|bucket|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|\nBiden’s 2023 St...|        5.0|2024-03-07 09:39:...|   0.0|[biden, s, 2023, ...|[biden, s, 2023, ...|(2863,[2,3,4,9,10...|[98.4011285853024...|[1.0,2.1372171424...|       0.0|
|\nItalian police ...|       47.0|2024-03-11 14:59:...|   0.0|[italian, police,...|[italian, police,...|(2863,[8,92,243,3...|[26.5572914011512...|[0.66665857118770...|       0.0|
|\nMichigan lawyer...|       21.0|2024-03-19 09:14:...|   0.0|[michigan, lawyer...|[michigan, lawyer...|(

### Evaluate the Model Using Test Data

- **Bucket 1**: 0 - 999 upvotes
- **Bucket 2**: 1,000 - 4,999 upvotes
- **Bucket 3**: 5,000 - 9,999 upvotes
- **Bucket 4**: 10,000 - 24,999 upvotes
- **Bucket 5**: 25,000 - 49,999 upvotes
- **Bucket 6**: > 50,000 upvotes

In [21]:
# Make Predictions for entire test data set
predictions = model.transform(testData)

# Show a few predictions
# - change filter params such as prediction == 1 # TODO: Document what this does
if DEBUG:
    display(predictions.select("title","bucket","probability","prediction","features").orderBy("probability", ascending=False).toPandas().sample(n=10))

# Calculate & Log RMSE
rmse = predictions.selectExpr("sqrt(avg(pow(bucket - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on Test Data:", rmse) # TODO: Determine output label
mlflow.log_metric("rmse", rmse)

# Calculate & Log Accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="bucket")
lr_accuracy = evaluator.evaluate(predictions)
print("Logistical Regression Accuracy:", lr_accuracy)
mlflow.log_metric("lr_accuracy", lr_accuracy)

Unnamed: 0,title,bucket,probability,prediction,features
575,Fulton County DA Fani Willis must step aside o...,0.0,"[0.49858724027193374, 0.5014127597280662, 8.03...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
196,Biden campaign raises over $53 million in Febr...,0.0,"[1.0, 5.641803938631679e-58, 4.404985104542817...",0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
655,EU fines Apple €1.8bn for breaking streaming r...,0.0,"[2.0852314677174735e-13, 0.9999999999997915, 1...",1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
525,Diane Abbott calls Tory donor's comments frigh...,0.0,"[0.9999713967703387, 1.0040484037365618e-60, 3...",0.0,"(2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
226,Russia is ready to use nuclear weapons if thre...,1.0,"[1.0, 1.016853694440211e-63, 1.032832252345287...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
519,/r/WorldNews Live Thread: Russian Invasion of ...,0.0,"[0.9999892216048913, 5.00860564812878e-33, 5.2...",0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
730,Bloomberg: Ukraine's recent drone strikes hit ...,1.0,"[1.7371538863053626e-39, 1.857623813256569e-64...",3.0,"(0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
330,Judge rejects bid by Donald Trump to throw out...,0.0,"[1.0, 1.8754865677958137e-91, 2.19608415992991...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
528,What Has Biden Accomplished? Look at These 10 ...,1.0,"[0.9999408071890573, 5.9192810942512264e-05, 2...",0.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
462,"With the election behind him, Putin says Russi...",0.0,"[0.9999999999963132, 2.0855701356410555e-22, 3...",0.0,"(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


Root Mean Squared Error (RMSE) on Test Data: 1.0502822399265612




Logistical Regression Accuracy: 0.6754909308940124


### Log Final Model to MLflow

In [22]:
mlflow.spark.log_model(
    spark_model = model, 
    artifact_path = "model",
#    signature = signature,
    registered_model_name = "f5news_upvote_bucket_prediction",
)

24/03/24 19:16:30 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(even

<mlflow.models.model.ModelInfo at 0x7f5271bc9de0>

### Save the Final Model to Disk

In [23]:
try: 
    top_level_dir = "models"
    os.makedirs(top_level_dir, exist_ok=True)

    model_dir = os.path.join(top_level_dir, EXPERIMENT_NAME)
    os.makedirs(model_dir, exist_ok=True)

    model.save(os.path.join(model_dir, start_time))
except Exception as e:
    print(f"Error saving the model to disk: {e}")

### Use the Model

In [26]:
model = mlflow.spark.load_model(f'runs:/{global_run_id}/model') # global_run_id is set by when an MLflow run is initiated

d2 = [{'title': 'this is a second test'}]
#test data
test = spark.createDataFrame(d2)
test.show()
predictions = model.transform(test)
predictions.show(5)


2024/03/24 19:22:13 INFO mlflow.spark: 'runs:/3215a562e85f477baad2c34eca08963a/model' resolved as 's3://mlflow/1/3215a562e85f477baad2c34eca08963a/artifacts/model'


2024/03/24 19:22:13 INFO mlflow.spark: URI 'runs:/3215a562e85f477baad2c34eca08963a/model/sparkml' does not point to the current DFS.
2024/03/24 19:22:13 INFO mlflow.spark: File 'runs:/3215a562e85f477baad2c34eca08963a/model/sparkml' not found on DFS. Will attempt to upload the file.


+--------------------+
|               title|
+--------------------+
|this is a second ...|
+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               title|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|this is a second ...|[this, is, a, sec...|[this, is, a, sec...|(2863,[7,11,99,32...|[38.0593977203652...|[1.0,1.3585814972...|       0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+



### Close Out Sessions

In [None]:
# Stop SparkSession
try:
    spark.stop()
except:
    pass

# End MLflow run
mlflow.end_run()