In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.ml.linalg import Vectors, VectorUDT
import re

def parse_vector(line):
    """Parse the text form of a Spark ML vector into a vector object.

    Two textual formats are recognised:
      * dense:  "[v0,v1,...]"
      * sparse: "(size,[i0,i1,...],[v0,v1,...])"

    Args:
        line: The text to parse, or None (e.g. a null cell in the CSV).

    Returns:
        The result of ``Vectors.dense`` / ``Vectors.sparse`` for a recognised
        format, or None when ``line`` is None or matches neither format.

    Raises:
        ValueError: If an element of a recognised vector is not a valid number.
    """
    # Null cells reach a UDF as None; pass them through instead of crashing
    # on the string methods below.
    if line is None:
        return None

    # Tolerate surrounding whitespace so " [1,2] " is still seen as dense.
    text = line.strip()

    if text.startswith('[') and text.endswith(']'):
        body = text[1:-1].strip()
        # "[]" must give an empty vector; ''.split(',') would yield [''] and
        # float('') raises.
        return Vectors.dense([float(x) for x in body.split(',')] if body else [])

    pattern = r'\((\d+),\[(.*?)\],\[(.*?)\]\)'
    match = re.match(pattern, text)
    if match is None:
        # Unrecognised format: keep the best-effort behaviour of yielding None.
        return None

    size = int(match.group(1))
    raw_indices = match.group(2).strip()
    raw_values = match.group(3).strip()
    indices = [int(x) for x in raw_indices.split(',')] if raw_indices else []
    values = [float(x) for x in raw_values.split(',')] if raw_values else []
    return Vectors.sparse(size, indices, values)

# Wrap the parser as a Spark UDF whose declared return type is VectorUDT.
udf_parse_vector = udf(parse_vector, VectorUDT())

spark = SparkSession.builder.appName('DataLoader').getOrCreate()
file_path = '../data/processed/data.csv'
df = spark.read.option('header', 'true').csv(file_path, inferSchema=True)
# Replace the textual vector column with the parsed vector objects.
df = df.withColumn('final_feature_vector', udf_parse_vector(col('final_feature_vector')))

# Run an action to trigger the UDF and materialise the transformed data.
df.show(5, truncate=False)  # Show the first 5 rows without truncation

# NOTE: the session is intentionally NOT stopped in this cell. `df` is
# evaluated lazily, and the later cells (count / select / take) need a live
# SparkSession. Call spark.stop() only after the last cell that uses `df`.


In [None]:
# Spark action: return the number of rows in the loaded DataFrame.
df.count()

In [None]:
# Display the first 5 values of the parsed vector column, untruncated.
df.select("final_feature_vector").show(5, truncate=False)

In [None]:
# Sanity-check the parser on one literal per supported format:
# a dense vector first, then a sparse one.
for sample_text in ("[1.0, 2.0, 3.0]", "(3,[1,2],[2.0,3.0])"):
    print(parse_vector(sample_text))

In [None]:
# Take a few sample vectors: fetch the first 5 rows of the vector column
# to the driver and print each returned Row.
vectors = df.select("final_feature_vector").take(5)
for row in vectors:
    print(row)