DATA INGESTION

In [1]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("CSVtoTXT")
    .getOrCreate()
)

In [None]:
# HDFS locations of the CSV inputs that are read ...
csv_path_music, csv_path_history = (
    "hdfs:///mydata/music_info.csv",
    "hdfs:///mydata/User_Listening_History.csv",
)
# ... and of the semicolon-delimited text outputs that are written
txt_output_path_music, txt_output_path_history = (
    "hdfs:///mydata/music_info_unstructured_semicolon.txt",
    "hdfs:///mydata/user_listening_history_unstructured_semicolon.txt",
)

In [None]:
# Read both CSV files from HDFS, treating the first line of each file as column names
df_music = spark.read.csv(csv_path_music, header=True)
df_history = spark.read.csv(csv_path_history, header=True)

                                                                                

In [None]:
def _row_to_semicolon_line(row):
    """Join a row's fields into one ";"-separated string, rendering None as an empty string."""
    return ";".join("" if field is None else str(field) for field in row)


# Flatten every DataFrame row into a single semicolon-delimited line of text
formatted_rdd_music = df_music.rdd.map(_row_to_semicolon_line)
formatted_rdd_history = df_history.rdd.map(_row_to_semicolon_line)

In [None]:
# toDF needs tuple-like records, so wrap each text line in a 1-tuple to get a single-column DataFrame
formatted_rdd_music_df = formatted_rdd_music.map(lambda line: (line,)).toDF(["unstructured_data"])
formatted_rdd_history_df = formatted_rdd_history.map(lambda line: (line,)).toDF(["unstructured_data"])

# Preview the first 5 untruncated rows of each transformed dataset
for sample_heading, sample_df in (
    ("Sample from music_info_unstructured:", formatted_rdd_music_df),
    ("\nSample from user_listening_history_unstructured:", formatted_rdd_history_df),
):
    print(sample_heading)
    sample_df.show(5, truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

                                                                                

Sample from music_info_unstructured:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|unstructured_data                                                                                                                                                                                                                                                                                                                               |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Report the partition count of each formatted RDD.
# The original cell read an undefined name `rdd`, which raises NameError on a fresh kernel;
# iterate over the RDDs this notebook actually builds instead.
# NOTE(review): the original did not say which RDD was meant — both are reported here.
for rdd_label, rdd in (
    ("music", formatted_rdd_music),
    ("history", formatted_rdd_history),
):
    num_partitions = rdd.getNumPartitions()
    print(f"The {rdd_label} RDD has {num_partitions} partitions.")


In [None]:
from subprocess import call

In [None]:
# Remove output directories left over from a previous run so the save step starts clean.
# "-f" makes `hadoop fs -rm` succeed quietly when the path does not exist (e.g. on a first run),
# so a non-zero exit code here signals a real failure rather than a missing directory.
for stale_output_path in (txt_output_path_music, txt_output_path_history):
    exit_code = call(["hadoop", "fs", "-rm", "-r", "-f", stale_output_path])
    if exit_code != 0:
        # Fail here with a clear message instead of letting the later write fail obscurely
        raise RuntimeError(
            f"Could not remove {stale_output_path} (hadoop exit code {exit_code})"
        )

Deleted hdfs:///mydata/music_info_unstructured_semicolon.txt
Deleted hdfs:///mydata/user_listening_history_unstructured_semicolon.txt


0

In [None]:
# Save the transformed RDDs as TXT files with semicolons in HDFS
formatted_rdd_music.saveAsTextFile(txt_output_path_music)
formatted_rdd_history.saveAsTextFile(txt_output_path_history)

                                                                                

In [None]:
# Stop the Spark session as the final step of the notebook.
# Cells that use `spark` or the RDDs/DataFrames derived from it must run before this one.
spark.stop()