# F5.news Trending News - Machine Learning Exploration

- News Article Sentiment
- Predict Trending Topics
- Topic Categorization

### Installs & Imports

In [None]:
%pip install -q -U "pymongo[srv]" mlflow pyspark hvac python-dotenv

In [None]:
import os
import hvac
import mlflow

from dotenv import load_dotenv

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from datetime import datetime, timedelta
from pyspark.sql.types import *

load_dotenv()

### Connect to Vault for Mongo connection values

In [None]:
# Build the Vault client from the address and token found in the environment.
client = hvac.Client(
    url=os.environ.get('VAULT_ADDR'),
    token=os.environ.get('VAULT_TOKEN'),
)

# Check authentication once and reuse the answer for both the printout and the
# branch below, instead of querying the client twice.
vault_authenticated = client.is_authenticated()
print(vault_authenticated)

if vault_authenticated:
    try:
        # Read the secret stored at path 'f5.news' on the 'kv' (KV v2) mount.
        secret_resp = client.secrets.kv.v2.read_secret_version(
            mount_point='kv',
            path='f5.news',
            raise_on_deleted_version=False
        )

        if secret_resp['data'] is not None:
            # Export every key/value pair of the secret as an environment
            # variable so later cells can read them through os.environ.
            secret_values = secret_resp['data']['data']
            for secret, value in secret_values.items():
                os.environ[str(secret)] = str(value)
        else:
            print("The secret does not exist.")
    except hvac.exceptions.InvalidPath:
        print("The path is invalid or the permission is denied.")
    except hvac.exceptions.Forbidden:
        print("The permission is denied.")
    except hvac.exceptions.VaultError as e:
        # Any other Vault-side failure: report it and continue.
        print(f"Vault error occurred: {e}")
else:
    print("Failed to connect to HashiVault")

### Configs

In [None]:
# MongoDB connection settings, read from the process environment.
# Each lookup raises KeyError when its variable is not set.
URI, DATABASE, COLLECTION = (
    os.environ['mongo_uri'],
    os.environ['database'],
    os.environ['collection'],
)

### Pull F5 records using pymongo client

In [None]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB")
except Exception as e:
    # Best effort: report the failure and carry on; the query below has its
    # own handler.
    print(e)

try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query the documents whose "sub" is "politics", highest upvote count
    # first and, for equal counts, most recently fetched first.
    # The sort is passed as a list of (field, direction) pairs because that
    # form is accepted by every PyMongo release; older releases reject a dict.
    sort_order = [("upvoteCount", -1), ("fetchedAt", -1)]
    documents = collection.find({"sub": "politics"}).sort(sort_order)

    # Iterate over the cursor to access the documents
    for doc in documents:
        print(doc["title"])
        print(doc["fetchedAt"])
        print(doc["upvoteCount"], "upvotes")
        print()
except Exception as e:
    print(e)

### Connect to Spark and load the MongoDB collection

In [None]:
# Start the MLflow run explicitly rather than with a `with` block: the cells
# below log the metric and the model, and the last cell calls
# mlflow.end_run(), so the run has to stay active after this cell finishes.
# Calling end_run() first closes a run left open by an earlier execution of
# this cell, so the cell can be re-run.
mlflow.end_run()
mlflow.start_run()

# Log parameters
mlflow.log_param("param1", "value1")
mlflow.log_param("param2", "value2")

# Bound before the try so the error handler can tell whether a session exists.
spark = None
try:
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("F5news") \
        .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
        .getOrCreate()

    # Load data from MongoDB into a DataFrame
    df = spark.read.format("mongo").option("uri", URI).option("database", DATABASE).option("collection", COLLECTION).load()
    print("Data loaded successfully from MongoDB!")

    # Show loaded data
    df.printSchema()
    print('Total Count:', df.count())
    df.show(1,truncate=False)
except Exception as e:
    # Error occurred while creating the session or loading the data
    print("Error:", str(e))

    # Stop SparkSession (only if it was actually created)
    if spark is not None:
        spark.stop()

    # End MLflow run
    mlflow.end_run()

In [None]:
# Project the columns through a temp view and Spark SQL instead of using the
# loaded DataFrame directly; this sidesteps type problems seen with the direct
# route (cause not pinned down - possibly the inferred schema, null values, or
# a column holding ints in some rows and strings in others).
df.createOrReplaceTempView("temp")
df = spark.sql("SELECT title, upvoteCount, fetchedAt from temp")

# Filter out new posts: keep only rows fetched before the cutoff, which is
# one day before now.
timeAgo = datetime.today() - timedelta(days=1)
d = timeAgo  # second name bound to the same cutoff value
df = df.where(df["fetchedAt"] < timeAgo)
print('count after date filter', df.count())


# Bucket by upvote count
def upvoteCategorizer(upvotes):
    """Return the label of the upvote bucket that *upvotes* falls into.

    Args:
        upvotes: Upvote count of a post, or None when the count is missing.

    Returns:
        One of "0-999", "1000-4999", "5000-9999", "10000-24999",
        "25000-49999" or "50000+". Values below 1000 (including negative
        ones) map to "0-999". None is returned for a None input so that a
        missing count yields a missing bucket instead of a TypeError.
    """
    if upvotes is None:
        return None

    # (exclusive upper bound, label), in ascending order; the first bound
    # greater than the value decides the bucket.
    buckets = (
        (1000, "0-999"),
        (5000, "1000-4999"),
        (10000, "5000-9999"),
        (25000, "10000-24999"),
        (50000, "25000-49999"),
    )
    for upper_bound, label in buckets:
        if upvotes < upper_bound:
            return label
    return "50000+"
    
from pyspark.sql.functions import col, rand, udf

# Add a "bucket" column holding the upvote-range label of each row, then show
# how many rows fall into each bucket, largest bucket first.
bucket_udf = udf(upvoteCategorizer, StringType())
df = df.withColumn("bucket", bucket_udf("upvoteCount"))
df.groupBy("bucket").count().orderBy(col("count").desc()).show()

# select sample dataset
sample_count = 20
# Draw the random sample inside Spark (random order, then limit) so that only
# `sample_count` rows are brought to the driver rather than the whole
# DataFrame. When fewer rows exist, all of them are returned.
pandas_random_sample = df.orderBy(rand()).limit(sample_count).toPandas()
pyspark_random_sample = spark.createDataFrame(pandas_random_sample) # convert back to pyspark dataframe
pyspark_random_sample.show()


# tokenize and remove stop words

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer: split the title on non-word characters
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\\W")

# stop words: extra tokens to drop in addition to the default English list
add_stopwords = ["http","https","amp","rt","t","c","the"]

# setStopWords() replaces the remover's list, so the default English stop
# words are merged in explicitly; passing only `add_stopwords` would keep
# every other common word in the features.
all_stopwords = StopWordsRemover.loadDefaultStopWords("english") + add_stopwords
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(all_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=30000, minDF=5)


from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
# Turn the string "bucket" column into a numeric "label" column
label_stringIdx = StringIndexer(inputCol = "bucket", outputCol = "label")

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

# Fit the pipeline on the full DataFrame (the train/test split is done
# afterwards on the transformed dataset) and add the feature/label columns.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(25)

# Split data into training and testing sets (70/30, fixed seed)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# Train a linear regression model on the "label" column, i.e. the numeric
# index that StringIndexer assigned to each upvote bucket.
# NOTE(review): that index identifies a bucket, it is not an upvote count, and
# StringIndexer orders labels by frequency by default - confirm a regression
# on it is what is wanted.
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(trainingData)

# Evaluate the model on test data. The error is measured against "label",
# the column the model was trained to predict; comparing the prediction with
# the raw upvoteCount would mix two different quantities.
predictions = lr_model.transform(testData)
rmse = predictions.selectExpr("sqrt(avg(pow(label - prediction, 2))) as RMSE").collect()[0]["RMSE"]
print("Root Mean Squared Error (RMSE) on test data:", rmse)

# Log metrics
mlflow.log_metric("rmse", rmse)

# Log trained model
mlflow.spark.log_model(lr_model, "model")

In [None]:
# Stop SparkSession; the MLflow run is ended in `finally` so it is closed
# even when stopping Spark raises.
try:
    spark.stop()
finally:
    # End MLflow run
    mlflow.end_run()