# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [1]:
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac python-dotenv boto3

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import hvac
import mlflow

from dotenv import load_dotenv
from datetime import datetime, timedelta

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

load_dotenv()

True

### Connect to Vault for Mongo connection values

In [19]:
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

print(client.is_authenticated())

if client.is_authenticated():
    try:
        secret_resp = client.secrets.kv.v2.read_secret_version(
            mount_point='kv', 
            path='f5.news', 
            raise_on_deleted_version=False
        )
        
        if secret_resp['data'] is not None:
            secret_values = secret_resp['data']['data']
            for secret, value in secret_values.items():
                os.environ[str(secret)] = str(value)
        else:
            print("The secret does not exist.")
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        print(f"Vault error occurred: {e}")
else:
    print("Failed to connect to HashiVault")

True


### Configs

In [2]:
DEBUG = False
URI = os.environ['mongo_uri']
DATABASE = os.environ['database']
COLLECTION = os.environ['collection']
MLFLOW_API = "http://localhost:5000"

REG_PARAM_VALUE = 0.1 # Experimenting with this value can improve final accuracy
MAX_ITER = 20
DATASET_SPLIT = [0.85, 0.15] # Portion of data to split between training and test datasets

### Pull F5 records using pymongo client

In [3]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB...")
except Exception as e:
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query all documents in the collection
    documents = collection.find({"sub": "politics"}).sort({"upvoteCount": -1, "fetchedAt": -1})

    if(DEBUG == True):
        # Iterate over the cursor to access the documents
        for doc in documents:
            print(doc["title"])
            print(doc["fetchedAt"])
            print(doc["upvoteCount"], "upvotes")
            print()
    else:
        print("Mongo documents loaded successfully!")
except Exception as e:
    print(e)

Successfully connected to MongoDB...
Mongo documents loaded successfully!


### Setup MLflow runner

In [5]:
global_run_name = None
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Set MLflow host API
mlflow.set_tracking_uri(MLFLOW_API)
mlflow.set_experiment("f5news_upvote_bucket_prediction")

def start_mlflow_run(run_name: str = None):
    global global_run_name, start_time
    if run_name is None:
        run_name = start_time
    else:
        run_name = run_name + start_time
    global_run_name = run_name
    mlflow.start_run(run_name=run_name)

2024/03/14 17:56:37 INFO mlflow.tracking.fluent: Experiment with name 'f5news_upvote_bucket_prediction' does not exist. Creating a new experiment.


### Connect to Spark and load dataset

In [6]:
# Create MLflow Run Instance
start_mlflow_run()

# Log parameters
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
mlflow.log_param("start_time", start_time)

try:
        # Create a SparkSession
    spark = SparkSession.builder \
        .appName("F5news") \
        .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
        .getOrCreate()

    # Load data from MongoDB into a DataFrame
    df = spark.read.format("mongo").option("uri", URI).option("database", DATABASE).option("collection", COLLECTION).load()
    print("Data loaded successfully from MongoDB!")
    
    # Show loaded data
    df.show(5,truncate=False)
except Exception as e:
    # Error occurred during data loading or model training
    print("Error:", str(e))

    # Stop SparkSession
    spark.stop()

    # End MLflow run
    mlflow.end_run()

:: loading settings :: url = jar:file:/home/mgmtadmin/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/mgmtadmin/.ivy2/cache
The jars for the packages stored in: /home/mgmtadmin/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-27db8e99-1c90-4bf4-8257-cc340851174c;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 64ms :: artifacts dl 2ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artif

Data loaded successfully from MongoDB!
+---+--------------------------+--------------+------------+----------------------------------------------------------------------------+-----------+----------------+-----------------------+-------+--------+-----+---------+--------+-------------+----+---------+------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------+
|__v|_id                       |author        |commentCount|commentLink                                                                 |created_utc|domain          |fetchedAt              |is_self|is_video|media|post_hint|selftext|selftext_html|sub |thumbnail|title                                                                                           |upvoteCount|upvote_ratio|url                                                      

                                                                                

### Filter Out Recent Posts

In [7]:
# Get document initial count
print('Documents Loaded:', df.count())
mlflow.log_param("loaded_documents", df.count())

# Convert to SQL for familiar data query ability
df.createOrReplaceTempView("temp")
df = spark.sql("SELECT title, upvoteCount, fetchedAt from temp") 

# Filter out new posts
oneDayAgo = d = datetime.today() - timedelta(days=1)
df = df.filter(df.fetchedAt < oneDayAgo)
print('Total Filtered Documents:', df.count())
mlflow.log_param("filtered_documents", df.count())

                                                                                

Documents Loaded: 4676
Total Filtered Documents: 4185


4185

### Bucketize by Upvote Count

In [27]:
def upvoteCategorizer(upvotes):
    if upvotes < 1000:
        return "0-999"
    if upvotes < 5000:
        return "1000-4999"
    if upvotes < 10000:
        return "5000-9999"
    elif upvotes < 25000:
        return "10000-24999"
    elif upvotes < 50000:
        return "25000-49000"
    else: 
        return "50000+"
    
bucket_udf = udf(upvoteCategorizer, StringType() )
df = df.withColumn("bucket", bucket_udf("upvoteCount"))
df.groupBy("bucket").count().orderBy(col("count").desc()).show()

[Stage 14:>                                                         (0 + 1) / 1]

+-----------+-----+
|     bucket|count|
+-----------+-----+
|      0-999| 3147|
|  1000-4999|  526|
|  5000-9999|  173|
|10000-24999|  104|
|25000-49000|   15|
+-----------+-----+



                                                                                

### Preview random sample of bucketized dataset

In [28]:
sample_count = 20 # TODO: Determine sample size based on loaded data?
pandas_random_sample = df.toPandas().sample(n=sample_count) # Convert to pandas dataframe to take sample
pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # Convert back to pyspark dataframe
pyspark_random_sample.show()

+--------------------+-----------+--------------------+-----------+
|               title|upvoteCount|           fetchedAt|     bucket|
+--------------------+-----------+--------------------+-----------+
|Argentina: Tens o...|         28|2024-03-10 00:39:...|      0-999|
|Russia using Serb...|        722|2024-03-05 20:19:...|      0-999|
|How the anti-tax ...|          6|2024-03-08 00:29:...|      0-999|
|Could one of Penn...|          0|2024-03-04 13:19:...|      0-999|
|Russia: media, Re...|         13|2024-03-12 10:24:...|      0-999|
|France's Assemblé...|         28|2024-03-07 15:04:...|      0-999|
|France becomes wo...|        473|2024-03-04 23:14:...|      0-999|
|Just over half of...|         67|2024-03-09 04:44:...|      0-999|
|White House weigh...|         34|2024-03-05 17:44:...|      0-999|
|'No need to apolo...|         21|2024-03-06 14:09:...|      0-999|
|FDF: Finnish peac...|          8|2024-03-12 14:14:...|      0-999|
|Russian weatherma...|          0|2024-03-11 02:

### Define Data Prep Pipeline Steps

- **Regular Expression Tokenizer**: Breaks title into array of words via regex
- **Stop Words Remover**: Removes undesireable words from Regex Tokenizer output
- **Bag of Words Counter**: Creates vector representation of the array of words extracted from original title string
- **Create Label**: Maps all possible values in bucket columns to numeric values (their index position in an array of unique bucket values)

In [29]:
# Regular Expression Tokenizer
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\\W")

# Stop Words Remover
add_stopwords = ["http","https","amp","rt","t","c","the"] # TODO: Update stopwords to match dataset
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# Bag of Words Counter
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=30000, minDF=5)

# Create Label
label_stringIdx = StringIndexer(inputCol = "bucket", outputCol = "label")

### Assemble Data Prep Pipeline

Creates the `features` columns. We split titles to words, remove the words we don't want, vectorize the resulting array of words, then label based on bucket column.

In [30]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

### Run the Data Prep Pipeline

In [31]:
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

                                                                                

### Preview Dataset Before Training

In [32]:
dataset.show(5)

+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|               title|upvoteCount|           fetchedAt|bucket|               words|            filtered|            features|label|
+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|Ultra-conservativ...|         35|2024-03-01 22:09:...| 0-999|[ultra, conservat...|[ultra, conservat...|(1803,[4,34,504,5...|  0.0|
|Joe Biden has rai...|         49|2024-03-01 22:14:...| 0-999|[joe, biden, has,...|[joe, biden, has,...|(1803,[2,5,8,12,1...|  0.0|
|Oregon takes mass...|          4|2024-03-01 22:14:...| 0-999|[oregon, takes, m...|[oregon, takes, m...|(1803,[380,406,55...|  0.0|
|DC Circuit tosses...|         21|2024-03-01 22:19:...| 0-999|[dc, circuit, tos...|[dc, circuit, tos...|(1803,[0,1,3,4,6,...|  0.0|
|Shervin Hajipour:...|         34|2024-03-01 22:19:...| 0-999|[shervin, haji

                                                                                

### Split Data into Training and Test datasets

In [33]:
(trainingData, testData) = dataset.randomSplit(DATASET_SPLIT, seed = 123456)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3383
Test Dataset Count: 582


### Train a Logistic Regression Model

In [34]:
mlflow.log_param("max_iter", MAX_ITER)
mlflow.log_param("reg_param_value", REG_PARAM_VALUE)

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=MAX_ITER, regParam=REG_PARAM_VALUE, elasticNetParam=0)
lr_model = lr.fit(trainingData)

                                                                                

### Evaluate the Model Using Test Data

- **Bucket 1**: 0 - 999 upvotes
- **Bucket 2**: 1,000 - 4,999 upvotes
- **Bucket 3**: 5,000 - 9,999 upvotes
- **Bucket 4**: 10,000 - 24,999 upvotes
- **Bucket 5**: 25,000 - 49,000 upvotes
- **Bucket 6**: > 50,000 upvotes

In [35]:
# Make Predictions for entire test data set
predictions = lr_model.transform(testData)

# Show a few predictions
# - change filter params such as prediction == 1 # TODO: Document what this does
predictions.filter(predictions['prediction'] == 1).select("title","bucket","probability","label","prediction") \
.orderBy("probability", ascending=False).show(n = 20, truncate = 50)

# Calculate & Log RMSE
rmse = predictions.selectExpr("sqrt(avg(pow(label - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on Test Data:", rmse) # TODO: Determine output label
mlflow.log_metric("rmse", rmse)

# Calculate & Log Accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_accuracy = evaluator.evaluate(predictions)
print("Logistical Regression Accuracy:", lr_accuracy)
mlflow.log_metric("lr_accuracy", lr_accuracy)

+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|                                             title|     bucket|                                       probability|label|prediction|
+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|               NATO to fly Sweden’s flag on Monday|  1000-4999|[0.4415258510306355,0.46309236858189445,0.05613...|  1.0|       1.0|
|Trump posts $91M bond while appealing E. Jean C...|      0-999|[0.40400859597143796,0.4069466100875459,0.12734...|  0.0|       1.0|
|Black Sea no longer safe for Putin's Navy - UK ...|  1000-4999|[0.4008291136842319,0.5003146777865993,0.071793...|  1.0|       1.0|
|Burkina Faso prosecutor says some 170 people 'e...|      0-999|[0.389552225321024,0.5785639750572916,0.0202640...|  0.0|       1.0|
|February marked Earth's 9th consecutive warmest...|      0-999|[0.38

### Log Final Model to MLflow

In [36]:
# Log trained model
# mlflow.spark.log_model(lr_model, "model") # TODO: Needs further S3 setup

### Close Out Sessions

In [37]:
# Stop SparkSession
try:
    spark.stop()
except:
    pass

# End MLflow run
mlflow.end_run()