# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [2]:
# Install the third-party packages this notebook imports (-q: quiet, -U: upgrade to the latest release).
# NOTE(review): no versions are pinned here, so a re-run may pick up different releases — consider pinning.
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac

Note: you may need to restart the kernel to use updated packages.


In [34]:
import os

import hvac
import mlflow

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

### Connect to Vault for Mongo connection values

In [32]:
# Build a Vault client from the address and token found in the environment.
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

try:
    # Read the current version of the KV v2 secret at kv/f5.news.
    secret_resp = client.secrets.kv.v2.read_secret_version(
        mount_point='kv',
        path='f5.news',
        raise_on_deleted_version=False,
    )

    # A KV v2 response nests the key/value payload under data -> data. Because
    # raise_on_deleted_version is False, a deleted version comes back with the
    # outer "data" present but the inner "data" set to None, so the inner
    # payload is what has to be checked before iterating over it.
    secret_data = secret_resp['data']
    secret_values = secret_data['data'] if secret_data is not None else None

    if secret_values is not None:
        # Export every key/value pair of the secret as an environment variable
        # so that later cells can read them through os.environ.
        for secret, value in secret_values.items():
            os.environ[str(secret)] = str(value)
    else:
        print("The secret does not exist.")
except hvac.exceptions.InvalidPath:
    print("The path is invalid or the permission is denied.")
except hvac.exceptions.Forbidden:
    print("The permission is denied.")
except hvac.exceptions.VaultError as e:
    print(f"Vault error occurred: {e}")

### Configs

In [31]:
# MongoDB connection settings, read from the process environment.
# Each lookup raises KeyError when its variable is not set.
# NOTE(review): presumably these keys are exported by the Vault cell — confirm
# that the secret actually contains them.
URI, DATABASE, COLLECTION = (
    os.environ["mongo_uri"],
    os.environ["database"],
    os.environ["collection"],
)

### Pull F5 records using pymongo client

In [7]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB")
except Exception as e:
    # Best effort: report the connection problem and carry on.
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Select only the documents whose "sub" field is "politics", ordered by
    # upvoteCount (descending) and then by fetchedAt (descending).
    # The sort spec is given as a list of (key, direction) pairs: this form is
    # accepted by every PyMongo release and makes the key priority explicit,
    # whereas a dict is only accepted by newer releases.
    documents = collection.find({"sub": "politics"}).sort(
        [("upvoteCount", -1), ("fetchedAt", -1)]
    )

    # Iterate over the cursor to access the documents
    # NOTE(review): this prints every matching document; consider adding a
    # .limit(...) to keep the cell output short.
    for doc in documents:
        print(doc["title"])
        print(doc["fetchedAt"])
        print(doc["upvoteCount"], "upvotes")
        print()
except Exception as e:
    print(e)

Joe Biden suddenly leads Donald Trump in multiple polls
2024-03-12 13:44:05.327000
42643 upvotes

Trump crowd goes silent as he confuses Biden and Obama again
2024-03-04 16:44:05.262000
31825 upvotes

Biden Just Delivered a Top Career Performance. He Needed It.
2024-03-09 14:59:05.814000
29299 upvotes

Biden: 'You can't love your country only when you win'
2024-03-08 18:39:05.353000
27371 upvotes

Mark Cuban Backs Biden in 2024, Urging More Action on Drug Costs
2024-03-05 16:49:05.281000
27178 upvotes

Biden proposes tax increase on fuel for private jets, casting it as making wealthy pay their share
2024-03-12 16:49:05.395000
26928 upvotes

Dr. John Gartner: The world is watching "a fundamental breakdown in Trump’s ability to use language"
2024-03-08 06:49:05.200000
26550 upvotes

Maddening New Poll: Voters Are Unaware of Trump “Dictator” Threats
2024-03-06 11:44:05.362000
23939 upvotes

Biden said Republicans oppose women's rights — Katie Britt's "tradwife" response proved him right
2

### Train and log a baseline regression model with Spark and MLflow

In [38]:
# Train a small linear-regression demo that predicts upvoteCount from the
# article title, and record parameters, metric and model in an MLflow run.
with mlflow.start_run():
    # Log parameters
    # NOTE(review): these are placeholder values; replace them with the real
    # hyperparameters of the experiment.
    mlflow.log_param("param1", "value1")
    mlflow.log_param("param2", "value2")

    # Defined before the try block so that the finally clause can tell whether
    # a session was actually created (and never touches an undefined or stale name).
    spark = None

    try:
        # Create a SparkSession
        spark = (
            SparkSession.builder
            .appName("F5-MLflow-Spark")
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
            .getOrCreate()
        )

        # Sample DataFrame with "title" and "upvoteCount" columns
        data = spark.createDataFrame([
            ("Title 1", 10.0),
            ("Title 2", 15.0),
            ("Title 3", 20.0)
        ], ["title", "upvoteCount"])

        # VectorAssembler cannot consume a string column, so the title is first
        # split into tokens and then hashed into a term-frequency vector.
        tokenizer = Tokenizer(inputCol="title", outputCol="title_tokens")
        hashing_tf = HashingTF(
            inputCol="title_tokens",
            outputCol="title_features",
            numFeatures=1024,  # small hash space: plenty for this toy data set
        )
        title_vectorized = hashing_tf.transform(tokenizer.transform(data))

        # Assemble the model input from the title features only. The label
        # ("upvoteCount") must not be part of the features, otherwise the model
        # simply learns to copy its own target.
        assembler = VectorAssembler(inputCols=["title_features"], outputCol="features")
        data_with_features = assembler.transform(title_vectorized)

        # Split data into training and testing sets
        train_data, test_data = data_with_features.randomSplit([0.8, 0.2], seed=123)

        # Train a linear regression model
        lr = LinearRegression(featuresCol="features", labelCol="upvoteCount")
        lr_model = lr.fit(train_data)

        # Evaluate the model on test data
        predictions = lr_model.transform(test_data)
        rmse = predictions.selectExpr(
            "sqrt(avg(pow(upvoteCount - prediction, 2))) as RMSE"
        ).collect()[0]["RMSE"]

        if rmse is None:
            # avg() over zero rows yields NULL: with so few samples the random
            # split can leave the test set empty, and a null metric cannot be logged.
            print("Test split is empty; RMSE was not computed.")
        else:
            print("Root Mean Squared Error (RMSE) on test data:", rmse)
            # Log metrics
            mlflow.log_metric("rmse", rmse)

        # Log trained model
        mlflow.spark.log_model(lr_model, "model")

    except Exception as e:
        # Error occurred during data loading or model training
        print("Error:", str(e))
        # Mark the run as failed so it is not reported as a successful run.
        mlflow.end_run(status="FAILED")

    finally:
        # Stop SparkSession (only if one was created). The MLflow run itself is
        # closed by the enclosing "with" block, so no explicit end_run is needed here.
        if spark is not None:
            spark.stop()

[Stage 10:>                                                         (0 + 1) / 1]

Data loaded successfully from MongoDB!
root
 |-- __v: integer (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- author: string (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- commentLink: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- fetchedAt: timestamp (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- selftext_html: string (nullable = true)
 |-- sub: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upvoteCount: integer (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- url: string (nullable = true)

['__v', '_id', 'author', '

                                                                                