In [None]:
#!/usr/bin/env python3
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, collect_list
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def create_spark_session(workers):
    """
    Build (or fetch) the Spark session used by the lyrics sentiment experiment.

    Args:
        workers: Value assigned to the ``spark.executor.instances`` setting.

    Returns:
        The SparkSession returned by ``getOrCreate()``, with the log level of
        its SparkContext set to "ERROR".

    NOTE(review): per the PySpark documentation ``getOrCreate()`` returns an
    already-active session when one exists, so the executor setting may not
    take effect on repeated calls within the same kernel -- confirm.
    """
    builder = (
        SparkSession.builder
        .appName("Lyrics Sentiment Analysis")
        .config("spark.executor.instances", workers)
    )
    session = builder.getOrCreate()

    # Keep Spark's console output down to errors only.
    session.sparkContext.setLogLevel("ERROR")
    return session

# Module-level VADER sentiment analyzer, built once here and reused by
# analyze_sentiment() on every call instead of being re-created per text.
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """
    Classify a piece of text as "positive", "negative" or "neutral".

    The label is derived from the VADER compound score of ``text``:
    above 0.05 is "positive", below -0.05 is "negative", anything else is
    "neutral".

    Args:
        text: The text to score. ``None`` and the empty string are treated as
            carrying no sentiment.

    Returns:
        One of the strings "positive", "negative" or "neutral".
    """
    # This function is registered as a Spark UDF, and Spark passes null column
    # values to Python UDFs as None; scoring None would raise inside the
    # analyzer, so short-circuit. Empty text scores 0.0 anyway.
    if not text:
        return "neutral"

    score = analyzer.polarity_scores(text)["compound"]

    # NOTE(review): the VADER README describes the usual cut-offs as inclusive
    # (>= 0.05 / <= -0.05); the strict comparisons are kept here so existing
    # results do not shift -- confirm which convention is wanted.
    if score > 0.05:
        return "positive"
    if score < -0.05:
        return "negative"
    return "neutral"

def run_experiment(workers, data_fraction, df, output_path="sentiment_output.txt"):
    """
    Run the lyrics sentiment experiment for a worker count and data fraction.

    Steps: clean a copy of ``df``, sample ``data_fraction`` of its rows, load
    the sample into Spark, rebuild one lyrics string per ``track_id``, label
    each track with ``analyze_sentiment`` and write the per-label counts to
    ``output_path``. Progress, row count, label counts and elapsed time are
    printed along the way.

    Args:
        workers: Executor count forwarded to ``create_spark_session``.
        data_fraction: Fraction of rows to keep, passed to
            ``DataFrame.sample(frac=...)`` with a fixed ``random_state``.
        df: Pandas DataFrame with ``track_id``, ``mxm_tid``, ``word`` and
            ``count`` columns. It is left unmodified.
        output_path: Text file that receives one "label: count" line per
            sentiment label. Defaults to "sentiment_output.txt".

    Returns:
        None. Returns early (after printing an error) when no rows survive
        cleaning or when the Pandas-to-Spark conversion fails.

    Raises:
        KeyError: If ``df`` lacks one of the four expected columns.
    """
    columns = ["track_id", "mxm_tid", "word", "count"]

    # Clean a copy restricted to (and ordered like) the Spark schema below, so
    # the caller's DataFrame is never mutated and extra columns cannot break
    # the conversion.
    cleaned_df = (
        df[columns]
        .assign(
            track_id=df["track_id"].astype(str),
            mxm_tid=df["mxm_tid"].astype(str),
            word=df["word"].astype(str),
            count=pd.to_numeric(df["count"], errors="coerce"),  # bad values -> NaN
        )
        .dropna(subset=["count"])  # Drop rows whose count could not be parsed
    )

    # Nothing left to analyse: bail out before starting a Spark session.
    if cleaned_df.empty:
        print("Error: After cleaning, the Pandas DataFrame is empty.")
        return

    # Coercion turns the column into floats as soon as one value was invalid;
    # the IntegerType field below needs real integers (fractions are truncated).
    cleaned_df = cleaned_df.astype({"count": int})

    # Sample on the Pandas side before handing the data to Spark.
    # NOTE(review): sampling is per row; if each row is one (track, word) pair
    # this thins the words inside tracks rather than selecting whole tracks --
    # confirm that is the intended meaning of data_fraction.
    sampled_df = cleaned_df.sample(frac=data_fraction, random_state=42)

    # Explicit schema so Spark does not have to infer column types.
    schema = StructType([
        StructField("track_id", StringType(), True),
        StructField("mxm_tid", StringType(), True),
        StructField("word", StringType(), True),
        StructField("count", IntegerType(), True)
    ])

    spark = create_spark_session(workers)
    try:
        try:
            full_lyrics_df = spark.createDataFrame(sampled_df, schema=schema)
        except Exception as e:
            print(f"Error converting Pandas DataFrame to Spark DataFrame: {e}")
            return

        print("Reconstructing lyrics...")
        # Spark transformations are lazy: this grouping actually executes when
        # the actions below (count/take) run, i.e. inside the timed section.
        lyrics_df = full_lyrics_df.groupBy("track_id") \
            .agg(concat_ws(" ", collect_list("word")).alias("lyrics"))
        print("Lyrics reconstructed.")

        # Time the sentiment analysis
        start_time = time.time()

        # Register sentiment UDF and label every track
        sentiment_udf = spark.udf.register("sentiment", analyze_sentiment)
        sentiment_df = lyrics_df.withColumn("sentiment", sentiment_udf(col("lyrics")))

        print(f"Total rows: {sentiment_df.count()}")

        # Count sentiment distribution
        sentiment_counts = sentiment_df.groupBy("sentiment").count().take(10)
        print(sentiment_counts)

        elapsed_time = time.time() - start_time
        print(f"Time: {elapsed_time:.2f}s")
    finally:
        # Always release the session. A session left running would be reused
        # by the next getOrCreate(), so a different `workers` value on a later
        # run would not take effect.
        spark.stop()

    # The counts are already collected on the driver, so Spark is not needed here.
    with open(output_path, "w") as output_file:
        for row in sentiment_counts:
            output_file.write(f"{row['sentiment']}: {row['count']}\n")
    print(f"Sentiment counts saved to {output_path}.")

In [None]:
def load_data_from_file(file_path):
    """
    Parse a bag-of-words lyrics text file into a long-format Pandas DataFrame.

    Recognised lines:
      * blank lines and lines starting with "#" are skipped;
      * a line starting with "%" is read as the comma-separated vocabulary;
      * every other line is ``track_id,mxm_tid,word:count,word:count,...``.

    When a vocabulary line has been seen and a ``word`` token is a 1-based
    index into it, the token is replaced by the vocabulary word; otherwise the
    token is kept as written.
    NOTE(review): this follows the published musiXmatch dataset layout
    (``%`` vocabulary header, ``index:count`` pairs) -- confirm the input file
    uses it.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        DataFrame with columns ``track_id``, ``mxm_tid``, ``word`` and
        ``count`` (int), one row per valid word/count pair. Pairs that are not
        exactly ``word:count`` with a decimal count are dropped.
    """
    vocabulary = []
    records = []
    with open(file_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith("#"):  # Skip comments and empty lines
                continue
            if line.startswith("%"):  # Vocabulary header, not track data
                vocabulary = line[1:].split(",")
                continue

            parts = line.split(",")
            if len(parts) < 3:  # Skip rows without word-count data
                continue
            track_id, mxm_tid = parts[:2]  # First two columns

            for token in parts[2:]:
                pieces = token.split(":")
                if len(pieces) != 2:  # Ensure word:count format
                    continue
                word, count = pieces
                # isdecimal() (unlike isdigit()) only accepts characters that
                # int() can actually parse, e.g. it rejects "²".
                if not count.isdecimal():
                    continue
                # Translate a 1-based vocabulary index into the actual word.
                if vocabulary and word.isdecimal() and 1 <= int(word) <= len(vocabulary):
                    word = vocabulary[int(word) - 1]
                records.append([track_id, mxm_tid, word, int(count)])

    # Build the frame once from the collected records.
    return pd.DataFrame(records, columns=["track_id", "mxm_tid", "word", "count"])

In [None]:
# Load the word-count data once; the path is relative to the notebook's
# working directory. The resulting frame is the input to run_experiment().
df = load_data_from_file("mxm_dataset_train.txt")

In [None]:
# Experiment parameters for this run.
# NOTE(review): `X` is not defined anywhere in the visible notebook, so this
# cell raises NameError as written -- replace both placeholders with concrete
# values before running. `workers` ends up in spark.executor.instances and
# `data_fraction` is passed to DataFrame.sample(frac=...).
workers = X
data_fraction = X

run_experiment(workers, data_fraction, df)