# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [43]:
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac python-dotenv boto3

Note: you may need to restart the kernel to use updated packages.


In [44]:
import os
import hvac
import mlflow

from dotenv import load_dotenv
from datetime import datetime, timedelta

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

load_dotenv()

False

### Connect to Vault for Mongo connection values

In [45]:
# Build a Vault client from the address and token found in the environment.
client = hvac.Client(
    token=os.environ.get('VAULT_TOKEN'),
    url=os.environ.get('VAULT_ADDR'),
)

print(client.is_authenticated())

if not client.is_authenticated():
    print("Failed to connect to HashiVault")
else:
    try:
        # Read the current version of the KV v2 secret at kv/f5.news.
        secret_resp = client.secrets.kv.v2.read_secret_version(
            path='f5.news',
            mount_point='kv',
            raise_on_deleted_version=False,
        )

        payload = secret_resp['data']
        if payload is None:
            print("The secret does not exist.")
        else:
            # Export every key/value pair of the secret as a string
            # environment variable so later cells can read them via os.environ.
            secret_values = payload['data']
            for env_name, env_value in secret_values.items():
                os.environ[str(env_name)] = str(env_value)
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        print(f"Vault error occurred: {e}")

True


### Configs

In [46]:
# General
DEBUG = False
REG_PARAM_VALUE = 0.1 # Experimenting with this value can improve final accuracy
MAX_ITER = 20
DATASET_SPLIT = [0.85, 0.15] # Portion of data to split between training and test datasets
os.environ["PYSPARK_PIN_THREAD"] = "false"

# Mongo
# These keys are expected to be present in the environment by now (the Vault
# cell copies secret entries into os.environ); a missing key raises KeyError.
URI = os.environ['mongo_uri']
DATABASE = os.environ['database']
COLLECTION = os.environ['collection']

# MLflow
MLFLOW_API = "http://localhost:5000"
EXPERIMENT_NAME = "f5news_upvote_bucket_prediction"

# Minio S3
# setdefault() keeps any value already supplied through the environment
# (.env file or the Vault secret) and only falls back to an empty string when
# nothing is set, instead of unconditionally blanking the credentials.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "")
os.environ.setdefault('AWS_ACCESS_KEY_ID', "")
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', "")

### Pull F5 records using pymongo client

In [47]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB...")
except Exception as e:
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query documents whose `sub` field is "politics", highest upvote count
    # first and, for ties, the most recent fetch first.
    # The sort spec uses the list-of-(key, direction) form, which is the
    # documented multi-key form; a plain dict is presumably rejected by older
    # PyMongo releases - verify against the installed version.
    documents = collection.find({"sub": "politics"}).sort(
        [("upvoteCount", -1), ("fetchedAt", -1)]
    )

    if DEBUG:
        # Iterate over the cursor to access the documents
        for doc in documents:
            print(doc["title"])
            print(doc["fetchedAt"])
            print(doc["upvoteCount"], "upvotes")
            print()
    else:
        # NOTE: the cursor is lazy; nothing is fetched here unless DEBUG is on.
        print("Mongo documents loaded successfully!")
except Exception as e:
    print(e)

Successfully connected to MongoDB...
Mongo documents loaded successfully!


### Setup MLflow runner

In [48]:
# Module-level bookkeeping read and written by start_mlflow_run():
#   global_run_name - name of the most recently started run (None until then)
#   start_time      - timestamp string captured when this cell executes
global_run_name = None
start_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# Point the MLflow client at the tracking server and select the experiment
mlflow.set_tracking_uri(MLFLOW_API)
mlflow.set_experiment(EXPERIMENT_NAME)

def start_mlflow_run(run_name: str = None):
    """Start an MLflow run named after the module-level ``start_time``.

    Args:
        run_name: Optional prefix. When given, the run is named
            ``run_name + start_time`` (no separator is inserted); when
            omitted, the run is named ``start_time`` alone.

    Side effects:
        Stores the final name in the module-level ``global_run_name`` and
        starts the run via ``mlflow.start_run``. If a run is already active
        (for example because the calling cell was re-executed), it is ended
        first so that starting the new run does not fail.
    """
    global global_run_name
    if run_name is None:
        run_name = start_time
    else:
        run_name = run_name + start_time
    global_run_name = run_name

    # Re-running a notebook cell would otherwise hit an "already active" error.
    if mlflow.active_run() is not None:
        mlflow.end_run()
    mlflow.start_run(run_name=run_name)

### Connect to Spark and load dataset

In [49]:
# Create MLflow Run Instance
start_mlflow_run()
mlflow.autolog()

# Log parameters
# NOTE: this refreshes the module-level start_time, so the logged value (and
# the model directory name used when saving) is taken after the run was named.
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
mlflow.log_param("start_time", start_time)

# Lets the error handler tell whether a session was ever created; without it,
# a failure inside getOrCreate() would surface as a NameError on spark.stop().
spark = None
try:
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("F5news") \
        .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
        .getOrCreate()

    # Get Spark version
    spark_version = spark.version
    print("Spark Version:", spark_version)

    # Load data from MongoDB into a DataFrame
    df = spark.read.format("mongo").option("uri", URI).option("database", DATABASE).option("collection", COLLECTION).load()
    print("Data loaded successfully from MongoDB!")
except Exception as e:
    # Error occurred while creating the session or loading the data
    print("Error:", str(e))

    # Stop SparkSession (only if one was actually created)
    if spark is not None:
        spark.stop()

    # End MLflow run, marking it as failed rather than finished
    mlflow.end_run(status="FAILED")

    # Later cells need `df`; re-raise instead of continuing in a broken state
    raise

2024/03/15 04:42:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2024/03/15 04:42:40 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.
2024/03/15 04:42:40 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.
'JavaPackage' object is not callable


Spark Version: 3.5.1


[Stage 0:>                                                          (0 + 1) / 1]

Data loaded successfully from MongoDB!


                                                                                

### Show Loaded Data

In [50]:
# Preview the first 5 rows with full (untruncated) column values
df.show(5,truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+---+--------------------------+--------------+------------+----------------------------------------------------------------------------+-----------+----------------+-----------------------+-------+--------+-----+---------+--------+-------------+----+---------+------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------+
|__v|_id                       |author        |commentCount|commentLink                                                                 |created_utc|domain          |fetchedAt              |is_self|is_video|media|post_hint|selftext|selftext_html|sub |thumbnail|title                                                                                           |upvoteCount|upvote_ratio|url                                                                                             

                                                                                

### Filter Out Recent Posts

In [51]:
# Get document initial count.
# Each df.count() launches a Spark job, so count once and reuse the value;
# this also guarantees the printed and logged numbers agree.
loaded_count = df.count()
print('Documents Loaded:', loaded_count)
mlflow.log_param("loaded_documents", loaded_count)

# Convert to SQL for familiar data query ability
df.createOrReplaceTempView("temp")
df = spark.sql("SELECT title, upvoteCount, fetchedAt from temp") 

# Filter out new posts (fetched within the last day).
# NOTE(review): datetime.today() is a naive local timestamp; confirm that
# fetchedAt is stored in a comparable timezone.
oneDayAgo = datetime.today() - timedelta(days=1)
df = df.filter(df.fetchedAt < oneDayAgo)
filtered_count = df.count()
print('Total Filtered Documents:', filtered_count)

mlflow.log_param("filtered_documents", filtered_count)

                                                                                

Documents Loaded: 4894


'JavaPackage' object is not callable
'JavaPackage' object is not callable
                                                                                

Total Filtered Documents: 4372


'JavaPackage' object is not callable
'JavaPackage' object is not callable


4372

### Bucketize by Upvote Count

In [52]:
def upvoteCategorizer(upvotes):
    """Map an upvote count to the label of the bucket that contains it.

    Args:
        upvotes: Numeric upvote count.

    Returns:
        One of "0-999", "1000-4999", "5000-9999", "10000-24999",
        "25000-49999" or "50000+". Each bucket's upper bound is exclusive,
        so e.g. 1000 falls in "1000-4999" and 50000 falls in "50000+".
    """
    # (exclusive upper bound, label), in ascending order
    buckets = (
        (1000, "0-999"),
        (5000, "1000-4999"),
        (10000, "5000-9999"),
        (25000, "10000-24999"),
        (50000, "25000-49999"),
    )
    for upper_bound, label in buckets:
        if upvotes < upper_bound:
            return label
    return "50000+"
    
# Wrap the Python categorizer as a Spark UDF that returns a string label
bucket_udf = udf(upvoteCategorizer, StringType())

# Add the bucket label for every row, derived from its upvoteCount
df = df.withColumn("bucket", bucket_udf("upvoteCount"))

# Show how many rows fall into each bucket, largest bucket first
(
    df.groupBy("bucket")
    .count()
    .orderBy(col("count").desc())
    .show()
)

'JavaPackage' object is not callable
[Stage 14:>                                                         (0 + 1) / 1]

+-----------+-----+
|     bucket|count|
+-----------+-----+
|      0-999| 3479|
|  1000-4999|  573|
|  5000-9999|  189|
|10000-24999|  115|
|25000-49000|   16|
+-----------+-----+



                                                                                

### Preview random sample of bucketized dataset

In [53]:
sample_count = 20 # TODO: Determine sample size based on loaded data?

# Convert to a pandas dataframe to take the sample. pandas raises a ValueError
# when asked for more rows than exist (without replacement), so the request is
# capped at the number of available rows.
pandas_full = df.toPandas()
pandas_random_sample = pandas_full.sample(n=min(sample_count, len(pandas_full)))

pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # Convert back to pyspark dataframe
pyspark_random_sample.show()

                                                                                

+--------------------+-----------+--------------------+-----------+
|               title|upvoteCount|           fetchedAt|     bucket|
+--------------------+-----------+--------------------+-----------+
|Mossad: Hamas tou...|       1606|2024-03-10 18:09:...|  1000-4999|
|US dentist may lo...|         39|2024-03-08 19:19:...|      0-999|
|(Iran) Islamic Re...|         24|2024-03-12 22:14:...|      0-999|
|‘No legal basis t...|        155|2024-03-08 20:44:...|      0-999|
|Donald Trump Edge...|       3795|2024-03-03 18:44:...|  1000-4999|
|Hong Kong Country...|          0|2024-03-13 13:24:...|      0-999|
|The Supreme Court...|       1480|2024-03-13 22:54:...|  1000-4999|
|Hikers To Be Char...|         28|2024-03-05 08:44:...|      0-999|
|Israel may have j...|      11422|2024-03-03 16:29:...|10000-24999|
|India says it sei...|         19|2024-03-12 15:39:...|      0-999|
|Democratic Presid...|       5689|2024-03-05 14:34:...|  5000-9999|
|Putin allies tell...|      12002|2024-03-02 15:

### Define Data Prep Pipeline Steps

- **Regular Expression Tokenizer**: Breaks title into array of words via regex
- **Stop Words Remover**: Removes undesirable words from the Regex Tokenizer output
- **Bag of Words Counter**: Creates vector representation of the array of words extracted from original title string
- **Create Label**: Maps each distinct value in the `bucket` column to a numeric value (its index position in an array of unique bucket values)

In [54]:
# Regular Expression Tokenizer: title -> words, using the \W (non-word) pattern
regexTokenizer = RegexTokenizer(
    inputCol="title",
    outputCol="words",
    pattern="\\W",
)

# Stop Words Remover: words -> filtered
# NOTE(review): supplying stopWords replaces the remover's default list, so
# only these seven tokens are dropped - confirm that is intended.
add_stopwords = ["http","https","amp","rt","t","c","the"] # TODO: Update stopwords to match dataset
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag of Words Counter: filtered -> features (term-count vector)
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=30000,
    minDF=5,
)

# Create Label: bucket -> label (numeric index)
label_stringIdx = StringIndexer(inputCol="bucket", outputCol="label")

### Assemble Data Prep Pipeline

Creates the `features` and `label` columns. We split titles into words, remove the words we don't want, vectorize the resulting array of words, then create the label from the `bucket` column.

In [55]:
# Chain the prep stages in order: tokenize -> drop stop words -> vectorize -> index label
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

### Run the Data Prep Pipeline

In [56]:
# Fit the prep stages on df, then apply them to add the words, filtered,
# features and label columns.
# NOTE(review): the fit happens before the train/test split, so the fitted
# stages see rows that later end up in the test set - confirm this is acceptable.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

'JavaPackage' object is not callable


### Preview Dataset Before Training

In [57]:
# Preview the first 5 prepared rows (long values are truncated by default)
dataset.show(5)

+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|               title|upvoteCount|           fetchedAt|bucket|               words|            filtered|            features|label|
+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|Ultra-conservativ...|         35|2024-03-01 22:09:...| 0-999|[ultra, conservat...|[ultra, conservat...|(1966,[4,39,555,5...|  0.0|
|Joe Biden has rai...|         49|2024-03-01 22:14:...| 0-999|[joe, biden, has,...|[joe, biden, has,...|(1966,[2,5,8,10,1...|  0.0|
|Oregon takes mass...|          4|2024-03-01 22:14:...| 0-999|[oregon, takes, m...|[oregon, takes, m...|(1966,[408,452,56...|  0.0|
|DC Circuit tosses...|         21|2024-03-01 22:19:...| 0-999|[dc, circuit, tos...|[dc, circuit, tos...|(1966,[0,1,3,4,6,...|  0.0|
|Shervin Hajipour:...|         34|2024-03-01 22:19:...| 0-999|[shervin, haji

### Split Data into Training and Test datasets

In [58]:
# Randomly partition the prepared rows using the DATASET_SPLIT weights;
# the fixed seed is passed so the split is requested reproducibly.
trainingData, testData = dataset.randomSplit(DATASET_SPLIT, seed=123456)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 3722
Test Dataset Count: 650


### Train a Logistic Regression Model

In [59]:
# Configure the classifier from the values in the Configs cell.
# elasticNetParam=0 selects a pure L2 penalty.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=MAX_ITER,
    regParam=REG_PARAM_VALUE,
    elasticNetParam=0,
)

# Fit on the training partition only
lr_model = lr.fit(trainingData)

'JavaPackage' object is not callable
24/03/15 04:43:24 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "s3"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.m

### Evaluate the Model Using Test Data

- **Bucket 1**: 0 - 999 upvotes
- **Bucket 2**: 1,000 - 4,999 upvotes
- **Bucket 3**: 5,000 - 9,999 upvotes
- **Bucket 4**: 10,000 - 24,999 upvotes
- **Bucket 5**: 25,000 - 49,999 upvotes
- **Bucket 6**: 50,000 or more upvotes

In [60]:
# Make Predictions for entire test data set
predictions = lr_model.transform(testData)

# Show a few predictions
# - change filter params such as prediction == 1 # TODO: Document what this does
predictions.filter(predictions['prediction'] == 1).select("title","bucket","probability","label","prediction") \
.orderBy("probability", ascending=False).show(n = 20, truncate = 50)

# Calculate & Log RMSE
rmse = predictions.selectExpr("sqrt(avg(pow(label - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on Test Data:", rmse) # TODO: Determine output label
mlflow.log_metric("rmse", rmse)

# Calculate & Log Accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_accuracy = evaluator.evaluate(predictions)
print("Logistical Regression Accuracy:", lr_accuracy)
mlflow.log_metric("lr_accuracy", lr_accuracy)

'JavaPackage' object is not callable
'JavaPackage' object is not callable


+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|                                             title|     bucket|                                       probability|label|prediction|
+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|China's housing minister: Property developers m...|      0-999|[0.46728809876080246,0.47739416121155454,0.0297...|  0.0|       1.0|
|Emmanuel Macron Announces Plans for Permanent D...|      0-999|[0.4613024204557441,0.5071161491550793,0.016280...|  0.0|       1.0|
|Biden says he regrets using the word ‘illegal’ ...|      0-999|[0.4603984421633158,0.4914260323344008,0.029623...|  0.0|       1.0|
|"Soldiers from NATO countries are already in Uk...|  1000-4999|[0.4489302219786712,0.4930614654787345,0.036028...|  1.0|       1.0|
|Docs reveal new details of Trump lawyer’s fring...|      0-999|[0.44

'JavaPackage' object is not callable
'JavaPackage' object is not callable


Logistical Regression Accuracy: 0.6899458672966583


### Log Final Model to MLflow

In [61]:
# Log trained model
# mlflow.spark.log_model(lr_model, "model") # TODO: Needs further S3 setup

### Save the Final Model to Disk

In [62]:
# Models are stored under models/<experiment name>/<start_time>
top_level_dir = "models"
model_dir = os.path.join(top_level_dir, EXPERIMENT_NAME)

# makedirs creates the intermediate "models" directory as well
os.makedirs(model_dir, exist_ok=True)

# Use the writer's overwrite mode so re-running this cell (same start_time)
# replaces the previous save instead of failing because the path exists.
lr_model.write().overwrite().save(os.path.join(model_dir, start_time))

### Close Out Sessions

In [63]:
# Stop SparkSession
try:
    spark.stop()
except:
    pass

# End MLflow run
mlflow.end_run()