In [None]:
%pip install -q -U "pymongo[srv]" mlflow pyspark

In [1]:
import mlflow

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [2]:
URI = "mongodb+srv://spark:7UmGth9QtUpY1tCv@f5-news-dev-cluster.vgifyn6.mongodb.net/?retryWrites=true&w=majority&appName=f5-news-dev-cluster"

DATABASE = "f5-news-db"
COLLECTION = "newposts"

In [None]:
# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Successfully connected to MongoDB")
except Exception as e:
    print(e)

In [None]:
try:
    database = client[DATABASE]
    collection = database[COLLECTION]

    # Query all documents in the collection
    documents = collection.find({"sub": "politics"}).sort({"upvoteCount": -1, "fetchedAt": -1})

    # Iterate over the cursor to access the documents
    for doc in documents:
        print(doc["title"])
        print(doc["fetchedAt"])
        print(doc["upvoteCount"], "upvotes")
        print()
except Exception as e:
    print(e)

In [4]:
# Start MLflow run

print(URI)

with mlflow.start_run():
    # Log parameters
    mlflow.log_param("param1", "value1")
    mlflow.log_param("param2", "value2")
    
    try:
        # Create a SparkSession
        spark = SparkSession.builder \
            .appName("F5-MLflow-Spark") \
            .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1") \
            .config("spark.mongodb.read.connection.uri", URI) \
            .config("spark.mongodb.write.connection.uri", URI) \
            .getOrCreate()
    
        # Load data from MongoDB into a DataFrame
        df = spark.read.format("mongodb").option("database", DATABASE).option("collection", COLLECTION).load()
        #df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

        # Data loading successful
        print("Data loaded successfully from MongoDB!")
        
        # Show loaded data
        df.printSchema()

        # Your Spark code here
    
        # Prepare data, train a model, etc.
        # For example, train a linear regression model
        lr = LinearRegression(featuresCol="features", labelCol="label")
        lr_model = lr.fit(df)
        
        # Log metrics
        mlflow.log_metric("rmse", lr_model.summary.rootMeanSquaredError)
        
        # Log trained model
        mlflow.spark.log_model(lr_model, "model")
        
        # Stop SparkSession
        spark.stop()

        # MLflow end session
        mlflow.end_run()
    except Exception as e:
        # Error occurred during data loading
        print("Error loading data from MongoDB:", str(e))
        mlflow.end_run()


mongodb+srv://spark:7UmGth9QtUpY1tCv@f5-news-dev-cluster.vgifyn6.mongodb.net/?retryWrites=true&w=majority&appName=f5-news-dev-cluster
Data loaded successfully from MongoDB!
root
 |-- __v: integer (nullable = true)
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- commentLink: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- fetchedAt: timestamp (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- selftext_html: string (nullable = true)
 |-- sub: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upvoteCount: integer (nullable = true)
 |-- upvote_