# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [11]:
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import hvac
import mlflow

from dotenv import load_dotenv
from datetime import datetime, timedelta

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

load_dotenv()

True

### Connect to Vault for Mongo connection values

In [13]:
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

print(client.is_authenticated())

if client.is_authenticated():
    try:
        secret_resp = client.secrets.kv.v2.read_secret_version(
            mount_point='kv', 
            path='f5.news', 
            raise_on_deleted_version=False
        )
        
        if secret_resp['data'] is not None:
            secret_values = secret_resp['data']['data']
            for secret, value in secret_values.items():
                os.environ[str(secret)] = str(value)
        else:
            print("The secret does not exist.")
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        print(f"Vault error occurred: {e}")
else:
    print("Failed to connect to HashiVault")

False
Failed to connect to HashiVault


### Configs

In [14]:
DEBUG = False
URI = os.environ['mongo_uri']
DATABASE = os.environ['database']
COLLECTION = os.environ['collection']

### Pull F5 records using pymongo client

In [7]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB...")
except Exception as e:
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query all documents in the collection
    documents = collection.find({"sub": "politics"}).sort({"upvoteCount": -1, "fetchedAt": -1})

    if(DEBUG == True):
        # Iterate over the cursor to access the documents
        for doc in documents:
            print(doc["title"])
            print(doc["fetchedAt"])
            print(doc["upvoteCount"], "upvotes")
            print()
    else:
        print("Mongo documents loaded successfully!")
except Exception as e:
    print(e)

Successfully connected to MongoDB...
Mongo documents loaded successfully!


### Connect to Spark and load dataset

In [15]:
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("param1", "value1")
    mlflow.log_param("param2", "value2")
    
    try:
         # Create a SparkSession
        spark = SparkSession.builder \
            .appName("F5news") \
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
            .getOrCreate()

        # Load data from MongoDB into a DataFrame
        df = spark.read.format("mongo").option("uri", URI).option("database", DATABASE).option("collection", COLLECTION).load()
        print("Data loaded successfully from MongoDB!")
        
        # Show loaded data
        df.printSchema()
        df.show(5,truncate=False)
    except Exception as e:
        # Error occurred during data loading or model training
        print("Error:", str(e))

        # Stop SparkSession
        spark.stop()

        # End MLflow run
        mlflow.end_run()

Data loaded successfully from MongoDB!
root
 |-- __v: integer (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- author: string (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- commentLink: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- fetchedAt: timestamp (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- selftext_html: string (nullable = true)
 |-- sub: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upvoteCount: integer (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- url: string (nullable = true)

+---+---------------------

### Filter Out Recent Posts

In [16]:
# Get document initial count
print('Documents Loaded:', df.count())

# Convert to SQL for familiar data query ability
df.createOrReplaceTempView("temp")
df = spark.sql("SELECT title, upvoteCount, fetchedAt from temp") 

# Filter out new posts
oneDayAgo = d = datetime.today() - timedelta(days=1)
df = df.filter(df.fetchedAt < oneDayAgo)
print('Total Filtered Documents:', df.count())

Documents Loaded: 4411
Total Filtered Documents: 3900


### Bucketize by Upvote Count

In [17]:
def upvoteCategorizer(upvotes):
    if upvotes < 1000:
        return "0-999"
    if upvotes < 5000:
        return "1000-4999"
    if upvotes < 10000:
        return "5000-9999"
    elif upvotes < 25000:
        return "10000-24999"
    elif upvotes < 50000:
        return "25000-49000"
    else: 
        return "50000+"
    
bucket_udf = udf(upvoteCategorizer, StringType() )
df = df.withColumn("bucket", bucket_udf("upvoteCount"))
df.groupBy("bucket").count().orderBy(col("count").desc()).show()

+-----------+-----+
|     bucket|count|
+-----------+-----+
|      0-999| 3090|
|  1000-4999|  521|
|  5000-9999|  172|
|10000-24999|  102|
|25000-49000|   15|
+-----------+-----+



### Preview random sample of bucketized dataset

In [19]:
sample_count = 20
pandas_random_sample = df.toPandas().sample(n=sample_count) # Convert to pandas dataframe to take sample
pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # Convert back to pyspark dataframe
pyspark_random_sample.show()

                                                                                

+--------------------+-----------+--------------------+---------+
|               title|upvoteCount|           fetchedAt|   bucket|
+--------------------+-----------+--------------------+---------+
|CCTV footage of p...|          7|2024-03-09 06:49:...|    0-999|
|Trump endorses Ma...|          0|2024-03-03 23:49:...|    0-999|
|The DOJ has opene...|       8905|2024-03-12 11:09:...|5000-9999|
|Ukrainians to pay...|         13|2024-03-06 21:44:...|    0-999|
|I love President ...|          0|2024-03-05 10:49:...|    0-999|
|AOC confronted by...|          0|2024-03-05 15:24:...|    0-999|
|Russian volunteer...|         84|2024-03-12 11:29:...|    0-999|
|Nikki Haley will ...|         23|2024-03-06 12:59:...|    0-999|
|"Like someone pul...|       3913|2024-03-02 23:14:...|1000-4999|
|Federal judiciary...|        130|2024-03-12 21:04:...|    0-999|
|George Santos pla...|         15|2024-03-08 01:44:...|    0-999|
|‘Look me in the e...|         45|2024-03-01 23:39:...|    0-999|
|Biden, Tr

### Define data prep pipeline steps

In [20]:
# Regular Expression Tokenizer
# - Breaks title into array of words via regex
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\\W")

# Stop Words Remover
# - Removes undesireable words from ouput of Regex Tokenizer
add_stopwords = ["http","https","amp","rt","t","c","the"] # TODO: Update stopwords to match dataset
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# Bag of Words Counter
# - Creates vector representation of the array of words extracted from original title string
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=30000, minDF=5)

# Create label
# - Maps all possible values in bucket columns to numeric values (their index position in an array of unique bucket values)
label_stringIdx = StringIndexer(inputCol = "bucket", outputCol = "label")



### Assemble data prep pipeline

In [21]:
# Build pipeline using previously prepared steps
# - this is where we get our 'features' columns from. We split titles to words, remove the words we don't want, then vectorize the resulting array of words, then label based on bucket col
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])


### Run the data prep pipeline.

In [26]:
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
# Preview dataset before training
dataset.show(5)

+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|               title|upvoteCount|           fetchedAt|bucket|               words|            filtered|            features|label|
+--------------------+-----------+--------------------+------+--------------------+--------------------+--------------------+-----+
|Ultra-conservativ...|         35|2024-03-01 22:09:...| 0-999|[ultra, conservat...|[ultra, conservat...|(1782,[4,35,496,5...|  0.0|
|Joe Biden has rai...|         49|2024-03-01 22:14:...| 0-999|[joe, biden, has,...|[joe, biden, has,...|(1782,[2,5,8,12,1...|  0.0|
|Oregon takes mass...|          4|2024-03-01 22:14:...| 0-999|[oregon, takes, m...|[oregon, takes, m...|(1782,[376,396,53...|  0.0|
|DC Circuit tosses...|         21|2024-03-01 22:19:...| 0-999|[dc, circuit, tos...|[dc, circuit, tos...|(1782,[0,1,3,4,6,...|  0.0|
|Shervin Hajipour:...|         34|2024-03-01 22:19:...| 0-999|[shervin, haji

### Split Data into Training and Test datasets

In [38]:
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

                                                                                

Training Dataset Count: 3150


[Stage 162:>                                                        (0 + 1) / 1]

Test Dataset Count: 750


                                                                                

### Train a Logistic Regression Model

In [39]:
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20, regParam=0.3, elasticNetParam=0)
lr_model = lr.fit(trainingData)

                                                                                

### Evaluate the Model Using Test Data

In [42]:
# Make Predictions for entire test data set
predictions = lr_model.transform(testData)

# Show a few predictions
# - change filter params such as prediction == 1
predictions.filter(predictions['prediction'] == 1).select("title","bucket","probability","label","prediction")\
.orderBy("probability", ascending=False).show(n = 20, truncate = 50)

# Calculate RMSE
rmse = predictions.selectExpr("sqrt(avg(pow(upvoteCount - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on Test Data:", rmse) # TODO: Determine output label

# Calculate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lrAccuracy = evaluator.evaluate(predictions)
print(lrAccuracy)

                                                                                

+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|                                             title|     bucket|                                       probability|label|prediction|
+--------------------------------------------------+-----------+--------------------------------------------------+-----+----------+
|Nasrallah's grandson killed in IDF targeted att...|  1000-4999|[0.4645782027380458,0.47104484094916904,0.03935...|  1.0|       1.0|
|Pope criticised for saying Ukraine should ‘rais...|10000-24999|[0.4490831147920505,0.4919561890163278,0.034715...|  3.0|       1.0|
|How clean is the dirt on Hunter Biden? A key Re...|      0-999|[0.4321101142891121,0.4759676440038168,0.068277...|  0.0|       1.0|
|Army intelligence analyst charged with selling ...|  1000-4999|[0.4040768731806385,0.4951014326298967,0.063961...|  1.0|       1.0|
|Taylor Swift can teach us all a lesson in how d...|      0-999|[0.39

[Stage 201:>                                                        (0 + 1) / 1]

### Log Final Model Metrics to MLflow

In [54]:
# Log metrics
mlflow.log_metric("rmse", rmse)

# Log trained model
mlflow.spark.log_model(lr_model, "model")

<mlflow.models.model.ModelInfo at 0x7f29708739d0>

### Close Out Sessions

In [55]:
# Stop SparkSession
spark.stop()

# End MLflow run
mlflow.end_run()