In [None]:
# --- CONFIGURATION ---

import os

# Java configuration
# Jar path(s) handed to Spark through the "spark.jars" setting.
# NOTE(review): presumably this has to point at the PostgreSQL JDBC driver jar
# unless the driver is already on Spark's classpath - confirm for your setup.
jarConfigPath = ""

# Spark configuration
# Used for both the driver and the executors when the session is built.
allocated_memory = "6g"
allocated_cores = "6"

# Database configuration
# Connection details can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallback
# values are the ones this notebook has always used.
database_url = os.environ.get("MUSICBRAINZ_DB_URL", "jdbc:postgresql://localhost:5432/musicbrainz")
properties = {
    "user": os.environ.get("MUSICBRAINZ_DB_USER", "musicbrainz"),
    "password": os.environ.get("MUSICBRAINZ_DB_PASSWORD", "musicbrainz"),
    "driver": "org.postgresql.Driver",
}

# --- END OF CONFIGURATION ---

#### Setup

- Initialize Spark session connecting to the Postgres DB

In [None]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session. Tuning choices:
#   * extra memory for driver and executors
#     (https://spark.apache.org/docs/latest/tuning.html#memory-management-overview)
#   * G1 garbage collector on driver and executors
#     (https://spark.apache.org/docs/latest/tuning.html#garbage-collection-tuning)
#   * more cores for driver and executors
#     (https://spark.apache.org/docs/latest/tuning.html#level-of-parallelism)
spark_settings = {
    "spark.jars": jarConfigPath,
    "spark.executor.memory": allocated_memory,
    "spark.driver.memory": allocated_memory,
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC",
    "spark.driver.extraJavaOptions": "-XX:+UseG1GC",
    "spark.executor.cores": allocated_cores,
    "spark.driver.cores": allocated_cores,
}

# Apply the settings in the order listed above, then create the session.
session_builder = SparkSession.builder.appName("MusicBrainz PostgreSQL Connection")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

#### Data Collection

- Get the relevant data from Postgres
- Start cleaning at this stage by selecting only the relevant columns

First, get the general Artist and Area data (the area is the country to predict), plus additional artist information that could hint at the artist's country:

In [None]:
from pyspark.sql.functions import broadcast

# Read only the needed columns from the artist and area tables.
# The artist id is carried through every intermediate frame because it is the
# join key below: artist names are not unique, so joining on "name" would mix
# up the aliases/languages of different artists that share a name.
artist_df = spark.read.jdbc(url=database_url, table="artist", properties=properties).select("id", "name", "area")
area_df = spark.read.jdbc(url=database_url, table="area", properties=properties).select("id", "name")

# area_df is assumed to be small enough to be broadcast to every executor.
# NOTE(review): artist.area may also reference areas that are not countries
# (cities, subdivisions, ...) - confirm whether the label should be restricted.
artist_country_df = artist_df \
    .join(broadcast(area_df), artist_df.area == area_df.id) \
    .select(artist_df.id, artist_df.name, area_df.name.alias("country"))

# Read more data that could be useful for the analysis
language_df = spark.read.jdbc(url=database_url, table="language", properties=properties).select("id", "name")
alias_df = spark.read.jdbc(url=database_url, table="artist_alias", properties=properties).select("artist", "name")

# An artist has no language column of its own, and artist ids and language ids
# are unrelated keys, so the two tables cannot be joined on "id". The language
# is derived from the artist's releases instead:
#   artist <- artist_credit_name.artist,
#   artist_credit_name.artist_credit = release.artist_credit,
#   release.language -> language.id
# NOTE(review): table/column names follow the public MusicBrainz schema -
# verify them against the schema version of your database.
release_df = spark.read.jdbc(url=database_url, table="release", properties=properties).select("artist_credit", "language")
credit_name_df = spark.read.jdbc(url=database_url, table="artist_credit_name", properties=properties).select("artist_credit", "artist")

# Rename columns up front so every join can use plain column names and no
# ambiguous "id"/"name"/"language" columns are produced.
language_lookup_df = language_df \
    .withColumnRenamed("id", "language_id") \
    .withColumnRenamed("name", "language")

# One row per distinct (artist id, release language) pair
artist_language_df = credit_name_df \
    .withColumnRenamed("artist", "id") \
    .join(release_df.withColumnRenamed("language", "language_id"), "artist_credit") \
    .join(broadcast(language_lookup_df), "language_id") \
    .select("id", "language") \
    .distinct()

# One row per (artist id, alias) pair
artist_alias_df = alias_df \
    .withColumnRenamed("artist", "id") \
    .withColumnRenamed("name", "alias")

# Combine everything with left outer joins on the artist id, then expose the
# same four columns as before: name, country, alias, language.
combined_artist_df = artist_country_df \
    .join(artist_alias_df, ["id"], "left_outer") \
    .join(artist_language_df, ["id"], "left_outer") \
    .select("name", "country", "alias", "language")

#### Data preprocessing

##### Data cleaning

Handle missing data, e.g. drop all rows that contain NULLs:

In [None]:
from pyspark.sql.functions import col, when

# Maximum number of rows kept for the experiment
sample_limit = 10000

# na.drop() removes every row that has a NULL in any column. That already
# covers 'country' and 'name', so no separate isNotNull() filters are needed.
#
# limit() without an ordering gives no guarantee about which rows are returned,
# and an uncached DataFrame is recomputed (JDBC reads + joins) by every action.
# Caching the limited frame makes all later steps work on the same rows.
combined_artist_df = combined_artist_df \
    .na.drop() \
    .limit(sample_limit) \
    .cache()

# Materializes the cache and displays the number of remaining rows
combined_artist_df.count()

##### Feature transformation

Transform feature strings into more suitable formats. To do this:

1. Use `StringIndexer` to convert the strings in the columns into indices (like unique IDs)
2. Then use `OneHotEncoder` to convert the categorical indices into a binary vector (e.g. `[0,1,0,...]`)


In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Categorical feature columns; 'country' is the label and is indexed separately.
feature_columns = ["name", "language", "alias"]

# String indexing for all categorical feature columns.
# The indexers are left unfitted on purpose: they are used as Pipeline stages,
# and Pipeline.fit() fits each estimator stage itself. Fitting them here as
# well would only run extra Spark jobs over the data.
# handleInvalid="keep" puts values unseen during fitting into an extra bucket
# instead of raising an error.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in feature_columns
]

# Label: index the country column
label_stringIdx = StringIndexer(inputCol="country", outputCol="country_index")

# One-hot encoding for every indexed feature column ("<col>_index" -> "<col>_index_vec")
encoders = [
    OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=indexer.getOutputCol() + "_vec")
    for indexer in indexers
]

##### Feature normalization

Scale the transformed values to unit standard deviation. To do this:

1. Use `VectorAssembler` to combine multiple columns into a single vector column, which is the input format the machine learning algorithms expect
2. Then apply `StandardScaler`, which helps to make sure that the model is not dominated by features with larger scales

In [None]:
# Gather the output column of every one-hot encoder and merge those columns
# into a single "features" vector column.
assemblerInputs = [stage.getOutputCol() for stage in encoders]
assembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

# Normalize the assembled vector into "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

Finally combine all steps into one transformation / normalization pipeline and run it:

In [None]:
# Preprocessing stages in execution order:
# index features -> one-hot encode -> assemble -> scale -> index the label.
preprocessing_stages = [*indexers, *encoders, assembler, scaler, label_stringIdx]
pipeline = Pipeline(stages=preprocessing_stages)

# Fit the pipeline and apply it to the same data.
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/validation/test split below, so indexer and scaler statistics also
# include validation and test rows - confirm this is intended.
model = pipeline.fit(combined_artist_df)
transformed_df = model.transform(combined_artist_df)

#### Data Splitting

- Generate test, train and validation datasets

In [None]:
# Cache the transformed data before splitting it. Each split is evaluated
# lazily and independently; without caching, every split (and every count
# below) would recompute the whole lineage, and the splits are only guaranteed
# to be disjoint when all of them see exactly the same input rows.
transformed_df.cache()

# Split the data into training (70%), validation (15%) and testing (15%) sets.
# The fixed seed keeps the split reproducible.
train_data, val_data, test_data = transformed_df.randomSplit([0.7, 0.15, 0.15], seed=42)

# Show the count of each dataset
print(f"Training Data Count: {train_data.count()}")
print(f"Validation Data Count: {val_data.count()}")
print(f"Testing Data Count: {test_data.count()}")


#### Training

- Select model
- Train model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the scaled feature vector, predicting the indexed country
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="country_index")

# Train on the training split
lrModel = lr.fit(train_data)

# Learned parameters
print(f"Coefficients: {lrModel.coefficientMatrix!s}")
print(f"Intercept: {lrModel.interceptVector!s}")

# Metrics reported by the training summary
trainingSummary = lrModel.summary
training_metrics = [
    ("Accuracy: ", trainingSummary.accuracy),
    ("False Positive Rate: ", trainingSummary.weightedFalsePositiveRate),
    ("True Positive Rate: ", trainingSummary.weightedTruePositiveRate),
    ("F-Measure: ", trainingSummary.weightedFMeasure()),
    ("Precision: ", trainingSummary.weightedPrecision),
    ("Recall: ", trainingSummary.weightedRecall),
]
for metric_label, metric_value in training_metrics:
    print(metric_label, metric_value)


#### Evaluation

- Validate model performance
- Adjust parameters accordingly (hyperparameter tuning, ...)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Parameter grid for tuning: 3 regularization strengths x 3 iteration limits
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.maxIter, [10, 50, 100]) \
    .build()

# A single evaluator is used both for model selection inside the
# cross-validation (with its default metric) and for the reporting below.
evaluator = MulticlassClassificationEvaluator(labelCol="country_index", predictionCol="prediction")

# 3-fold cross-validation over the grid. The seed fixes the fold assignment so
# that the tuning is reproducible (same value as the data split seed).
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=42)

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(train_data)

# Use the best model to make predictions on the validation data
val_predictions = cvModel.transform(val_data)

# Evaluate the model; the metric is overridden per call, which leaves the
# evaluator itself unchanged.
accuracy = evaluator.evaluate(val_predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(val_predictions, {evaluator.metricName: "f1"})

print(f"Validation Accuracy: {accuracy}")
print(f"Validation F1 Score: {f1}")

#### Testing

- On unseen Data

In [None]:
# Score the held-out test split with the best cross-validated model
test_predictions = cvModel.transform(test_data)

# Compute accuracy first, then F1, by overriding the evaluator's metric per call
test_accuracy, test_f1 = (
    evaluator.evaluate(test_predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "f1")
)

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")