In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=6365f89595112a6e70f3189c94993e32c09ce7a5102ca4b71578af70f78dffb0
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, mean
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [3]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:

# Create a Spark session
spark = SparkSession.builder.appName("ReadingCsvExample").getOrCreate()
spark = SparkSession.builder.appName("SaveModelExample").getOrCreate()

# Load your CSV file into a DataFrame
# Replace 'your_file.csv' with the actual file path
data= spark.read.csv("/content/drive/MyDrive/music_data.csv",header=True, inferSchema=True)


In [5]:
# Display the schema and first few rows of the dataset
print("Dataset Schema:")
data.printSchema()
print("First few rows of the dataset:")
data.show(5)

Dataset Schema:
root
 |-- track_name: string (nullable = true)
 |-- track_uri: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- Mood: string (nullable = true)

First few rows of the dataset:
+--------------------+--------------------+-----------+------------+------------+------+----------------+--------+--------+-----------+-------+-------+----+
|          track_name|           track_uri|artist_name|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|  tempo|valence|Mood|
+--------------------+--------------------+-----------+------------+------------+------+----------

In [6]:
for k in data.columns:
    print(k,data.where(col(k).isNull()).count())



track_name 0
track_uri 0
artist_name 0
acousticness 0
danceability 0
energy 0
instrumentalness 0
liveness 0
loudness 0
speechiness 0
tempo 0
valence 0
Mood 0


In [7]:
selected_features=['danceability','acousticness','energy','instrumentalness','liveness','valence','loudness','speechiness','tempo','mood']


music_data =data.select(selected_features)



In [8]:
#converting the string data type into double

music_data  = music_data.withColumn("energy", col("energy").cast("double"))
music_data  = music_data.withColumn("danceability", col("danceability").cast("double"))

music_data  = music_data.withColumn("instrumentalness", col("instrumentalness").cast("double"))
music_data  = music_data.withColumn("liveness", col("liveness").cast("double"))

music_data  = music_data.withColumn("loudness", col("loudness").cast("double"))
music_data  = music_data.withColumn("speechiness", col("speechiness").cast("double"))


music_data  = music_data.withColumn("tempo", col("tempo").cast("double"))


In [9]:


categorical_column = "mood"
string_indexer = StringIndexer(inputCol=categorical_column, outputCol=categorical_column + "_index")
music_data= string_indexer.fit(music_data).transform(music_data)

music_data.show()


+------------+------------+------+----------------+--------+-------+--------+-----------+-------+----+----------+
|danceability|acousticness|energy|instrumentalness|liveness|valence|loudness|speechiness|  tempo|mood|mood_index|
+------------+------------+------+----------------+--------+-------+--------+-----------+-------+----+----------+
|       0.509|       0.396|  0.57|             0.0|  0.0986|  0.324|    -6.7|     0.0284| 80.134| Sad|       0.0|
|       0.756|       0.949| 0.269|          0.0801|   0.157|  0.229|  -9.164|     0.0323| 86.409| Sad|       0.0|
|       0.276|        0.76| 0.357|             0.0|  0.0796|  0.326|  -9.717|     0.0434|201.512| Sad|       0.0|
|       0.541|       0.143| 0.416|             0.0|  0.0907|  0.286|  -7.745|     0.0271|116.466| Sad|       0.0|
|       0.542|       0.978|0.0766|         3.41E-4|   0.106| 0.0797| -18.349|     0.0401| 76.732| Sad|       0.0|
|       0.572|       0.561| 0.582|             0.0|   0.162|  0.288|  -6.748|     0.0358

In [10]:

for k in music_data.columns:
    print(k,music_data.where(col(k).isNull()).count())



danceability 0
acousticness 0
energy 0
instrumentalness 0
liveness 0
valence 0
loudness 0
speechiness 0
tempo 0
mood 0
mood_index 0


In [11]:
music_data=music_data.dropna()

In [12]:

for k in music_data.columns:
    print(k,music_data.where(col(k).isNull()).count())



danceability 0
acousticness 0
energy 0
instrumentalness 0
liveness 0
valence 0
loudness 0
speechiness 0
tempo 0
mood 0
mood_index 0


In [13]:

#dropping the mood (categorical) column
column_to_drop = "mood"
music_data=music_data.drop(column_to_drop)


In [14]:

#describe
# Prepare features by assembling them into a vector
#converting mood("datatype double") into mood(Integer type)

music_data=music_data.withColumn("mood_index", col("mood_index").cast("integer"))


In [15]:
feature_columns = music_data.columns[:-1]  # Exclude the label column


In [16]:

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
#assembled_data = assembler.transform(music_data)


In [17]:
classifier = RandomForestClassifier(featuresCol="features", labelCol="mood_index", numTrees=10)


In [18]:
# Split the data into training and testing sets
(train_data, test_data) = music_data.randomSplit([0.8, 0.2], seed=123)


In [19]:
pipeline = Pipeline(stages=[assembler, classifier])


In [20]:
# Train the model
model = pipeline.fit(train_data)

In [21]:
# Make predictions on the test set
predictions = model.transform(test_data)



In [22]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="mood_index", predictionCol="prediction", metricName="accuracy")


In [23]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2%}".format(accuracy))

# Stop the Spark session
#spark.stop()

Accuracy: 70.14%


In [24]:
# Save the model to a local directory
model.save("file////content/drive/MyDrive/Datasets1/mlmodel1")


In [27]:

# Load the model
loaded_model = model.load("file////content/drive/MyDrive/Datasets1/mlmodel1")



In [28]:
spark = SparkSession.builder.appName("ReadingCsvExample").getOrCreate()


In [29]:
test_data_new= spark.read.csv("/content/drive/MyDrive/music_data_testing.csv",header=True, inferSchema=True)


In [30]:
test_data_new.printSchema()

root
 |-- acousticness: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)



In [31]:

test_data_new = music_data.withColumn("acousticness", col("acousticness").cast("double"))
test_data_new= music_data.withColumn("danceability", col("danceability").cast("double"))


In [32]:
columns_to_drop=['_c9','_c10','_c11','c12','c13','c14']

In [33]:
test_data_new=test_data.drop("columns_to_drop")

In [34]:
test_data_new.printSchema()

root
 |-- danceability: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- mood_index: integer (nullable = true)



In [35]:

for k in test_data_new.columns:
    print(k,test_data_new.where(col(k).isNull()).count())



danceability 0
acousticness 0
energy 0
instrumentalness 0
liveness 0
valence 0
loudness 0
speechiness 0
tempo 0
mood_index 0


In [36]:
# Make predictions using the loaded model
predictions_new = loaded_model.transform(test_data_new)


In [37]:
predictions_new.show()

+------------+------------+------+----------------+--------+-------+--------+-----------+-------+----------+--------------------+--------------------+--------------------+----------+
|danceability|acousticness|energy|instrumentalness|liveness|valence|loudness|speechiness|  tempo|mood_index|            features|       rawPrediction|         probability|prediction|
+------------+------------+------+----------------+--------+-------+--------+-----------+-------+----------+--------------------+--------------------+--------------------+----------+
|       0.188|       0.174| 0.411|           0.153|  0.0843|  0.159|  -9.733|     0.0484|187.376|         0|[0.188,0.174,0.41...|[6.30791489094628...|[0.63079148909462...|       0.0|
|       0.205|       0.646| 0.687|         0.00501|   0.173|   0.66|  -8.934|     0.0592|203.243|         1|[0.205,0.646,0.68...|[5.09723669414028...|[0.50972366941402...|       0.0|
|       0.223|       0.845| 0.359|             0.0|   0.133|  0.193|  -7.569|     0.0

In [38]:
accuracy = evaluator.evaluate(predictions_new)
print("Accuracy: {:.2%}".format(accuracy))


Accuracy: 70.14%
