# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [39]:
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [40]:
import os
import hvac
import mlflow

from dotenv import load_dotenv
from datetime import datetime, timedelta

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

load_dotenv()

False

### Connect to Vault for Mongo connection values

In [41]:
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

print(client.is_authenticated())

if client.is_authenticated():
    try:
        secret_resp = client.secrets.kv.v2.read_secret_version(
            mount_point='kv', 
            path='f5.news', 
            raise_on_deleted_version=False
        )
        
        if secret_resp['data'] is not None:
            secret_values = secret_resp['data']['data']
            for secret, value in secret_values.items():
                os.environ[str(secret)] = str(value)
        else:
            print("The secret does not exist.")
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        print(f"Vault error occurred: {e}")
else:
    print("Failed to connect to HashiVault")

True


### Configs

In [42]:
DEBUG = False
URI = os.environ['mongo_uri']
DATABASE = os.environ['database']
COLLECTION = os.environ['collection']

### Pull F5 records using pymongo client

In [43]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB...")
except Exception as e:
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query all documents in the collection
    documents = collection.find({"sub": "politics"}).sort({"upvoteCount": -1, "fetchedAt": -1})

    if(DEBUG == True):
        # Iterate over the cursor to access the documents
        for doc in documents:
            print(doc["title"])
            print(doc["fetchedAt"])
            print(doc["upvoteCount"], "upvotes")
            print()
    else:
        print("Mongo documents loaded successfully!")
except Exception as e:
    print(e)

Successfully connected to MongoDB
Mongo documents loaded successfully


### Connect to Spark

In [44]:
with mlflow.start_run():
    # Log parameters
    mlflow.log_param("param1", "value1")
    mlflow.log_param("param2", "value2")
    
    try:
         # Create a SparkSession
        spark = SparkSession.builder \
            .appName("F5news") \
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
            .getOrCreate()

        # Load data from MongoDB into a DataFrame
        df = spark.read.format("mongo").option("uri", URI).option("database", DATABASE).option("collection", COLLECTION).load()
        print("Data loaded successfully from MongoDB!")
        
        # Show loaded data
        df.printSchema()
        df.show(5,truncate=False)
    except Exception as e:
        # Error occurred during data loading or model training
        print("Error:", str(e))

        # Stop SparkSession
        spark.stop()

        # End MLflow run
        mlflow.end_run()

                                                                                

Data loaded successfully from MongoDB!
root
 |-- __v: integer (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- author: string (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- commentLink: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- fetchedAt: timestamp (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- selftext_html: string (nullable = true)
 |-- sub: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upvoteCount: integer (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- url: string (nullable = true)



[Stage 1:>                                                          (0 + 1) / 1]

+---+--------------------------+--------------+------------+----------------------------------------------------------------------------+-----------+----------------+-----------------------+-------+--------+-----+---------+--------+-------------+----+---------+------------------------------------------------------------------------------------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------+
|__v|_id                       |author        |commentCount|commentLink                                                                 |created_utc|domain          |fetchedAt              |is_self|is_video|media|post_hint|selftext|selftext_html|sub |thumbnail|title                                                                                           |upvoteCount|upvote_ratio|url                                                                                             

                                                                                

### Filter Out Recent Posts

In [45]:
# Get document initial count
print('Documents Loaded:', df.count())

# Convert to SQL to ensure proper typing
df.createOrReplaceTempView("temp")
df = spark.sql("SELECT title, upvoteCount, fetchedAt from temp") 

# Filter out new posts
oneDayAgo = d = datetime.today() - timedelta(days=1)
df = df.filter(df.fetchedAt < oneDayAgo)
print('Total Filtered Documents:', df.count())

                                                                                

Documents Loaded: 4384


[Stage 5:>                                                          (0 + 1) / 1]

Total Filtered Documents: 3878


                                                                                

### Bucketize by Upvote Count

In [46]:
def upvoteCategorizer(upvotes):
    if upvotes < 1000:
        return "0-999"
    if upvotes < 5000:
        return "1000-4999"
    if upvotes < 10000:
        return "5000-9999"
    elif upvotes < 25000:
        return "10000-24999"
    elif upvotes < 50000:
        return "25000-49000"
    else: 
        return "50000+"
    
bucket_udf = udf(upvoteCategorizer, StringType() )
df = df.withColumn("bucket", bucket_udf("upvoteCount"))
df.groupBy("bucket").count().orderBy(col("count").desc()).show()

[Stage 8:>                                                          (0 + 1) / 1]

+-----------+-----+
|     bucket|count|
+-----------+-----+
|      0-999| 3070|
|  1000-4999|  520|
|  5000-9999|  172|
|10000-24999|  101|
|25000-49000|   15|
+-----------+-----+



                                                                                

### Prepare Sample Dataset


In [47]:
sample_count = 20 # TODO: Calculate proper sample size instead of hardcoded value
pandas_random_sample = df.toPandas().sample(n=sample_count) # Convert to pandas dataframe to take sample
pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # Convert back to pyspark dataframe
pyspark_random_sample.show()

                                                                                

+--------------------+-----------+--------------------+-----------+
|               title|upvoteCount|           fetchedAt|     bucket|
+--------------------+-----------+--------------------+-----------+
|Trump Ally and Da...|         82|2024-03-08 19:54:...|      0-999|
|Europe starts war...|        162|2024-03-05 19:04:...|      0-999|
|US destroyer shoo...|        201|2024-03-06 09:29:...|      0-999|
|Trump Says '100 P...|          3|2024-03-10 09:09:...|      0-999|
|Zelenskiy Calls O...|       1978|2024-03-04 12:24:...|  1000-4999|
|Alabama lawmakers...|          9|2024-03-07 04:49:...|      0-999|
|Russian missile s...|       2735|2024-03-07 17:29:...|  1000-4999|
|Across the Board,...|          0|2024-03-04 16:19:...|      0-999|
|Boeing whistleblo...|        435|2024-03-11 22:44:...|      0-999|
|UN Special Repres...|         48|2024-03-04 19:39:...|      0-999|
|Smirking and smil...|          6|2024-03-07 13:59:...|      0-999|
|Pope Encourages U...|          0|2024-03-10 07:

### Tokenize and Remove Stop Words

In [48]:
# Regular Expression Tokenizer
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\\W")

# Stop Words Remover
add_stopwords = ["http","https","amp","rt","t","c","the"] # TODO: Update stopwords to match dataset
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# Bag of Words Counter
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=30000, minDF=5)

### Prepare Data Processing Pipeline

In [49]:
label_stringIdx = StringIndexer(inputCol = "bucket", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

### Fit the pipeline to training documents.

In [50]:
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-----------+--------------------+---------+--------------------+--------------------+--------------------+-----+
|               title|upvoteCount|           fetchedAt|   bucket|               words|            filtered|            features|label|
+--------------------+-----------+--------------------+---------+--------------------+--------------------+--------------------+-----+
|Ultra-conservativ...|         35|2024-03-01 22:09:...|    0-999|[ultra, conservat...|[ultra, conservat...|(1777,[4,35,494,6...|  0.0|
|Joe Biden has rai...|         49|2024-03-01 22:14:...|    0-999|[joe, biden, has,...|[joe, biden, has,...|(1777,[2,5,8,12,1...|  0.0|
|Oregon takes mass...|          4|2024-03-01 22:14:...|    0-999|[oregon, takes, m...|[oregon, takes, m...|(1777,[375,426,53...|  0.0|
|DC Circuit tosses...|         21|2024-03-01 22:19:...|    0-999|[dc, circuit, tos...|[dc, circuit, tos...|(1777,[0,1,3,4,6,...|  0.0|
|Shervin Hajipour:...|         34|2024-03-01 22:19:...|

                                                                                

### Split Data into Training and Test datasets

In [51]:
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

                                                                                

Training Dataset Count: 3132


[Stage 26:>                                                         (0 + 1) / 1]

Test Dataset Count: 746


                                                                                

### Train a Linear Regression Model

In [52]:
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(trainingData)

24/03/13 23:19:57 WARN Instrumentation: [b736b51f] regParam is zero, which might cause numerical instability and overfitting.
24/03/13 23:20:04 WARN Instrumentation: [b736b51f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

### Evaluate the Model Using Test Data

In [53]:
predictions = lr_model.transform(testData)
rmse = predictions.selectExpr("sqrt(avg(pow(upvoteCount - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on Test Data:", rmse) # TODO: Determine output label

[Stage 31:>                                                         (0 + 1) / 1]

Root Mean Squared Error (RMSE) on test data: 4779.478962694921


                                                                                

### Log Final Model Metrics to MLflow

In [54]:
# Log metrics
mlflow.log_metric("rmse", rmse)

# Log trained model
mlflow.spark.log_model(lr_model, "model")

<mlflow.models.model.ModelInfo at 0x7f29708739d0>

### Close Out Sessions

In [55]:
# Stop SparkSession
spark.stop()

# End MLflow run
mlflow.end_run()