In [1]:
import os

# Point Spark at the system OpenJDK 8 installation.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JAVA_HOME"] = JAVA_HOME

# Put the JDK's bin directory first on PATH. Any earlier occurrence is removed
# before prepending so that re-running this cell does not keep growing PATH
# with duplicate entries, and a missing/empty PATH no longer raises or leaves
# a dangling separator.
_java_bin = JAVA_HOME + "/bin"
_current_path = os.environ.get("PATH")
_other_entries = [
    entry
    for entry in (_current_path.split(os.pathsep) if _current_path else [])
    if entry != _java_bin
]
os.environ["PATH"] = os.pathsep.join([_java_bin] + _other_entries)


In [2]:


from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook: a local master,
# with the driver bound to the loopback address.
spark = (
    SparkSession.builder
    .appName("MovieLens EDA")
    .master("local[*]")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

25/11/07 13:18:22 WARN Utils: Your hostname, DESKTOP-8DIE4OP resolves to a loopback address: 127.0.1.1; using 172.20.177.130 instead (on interface eth0)
25/11/07 13:18:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/07 13:18:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schemas for the three .dat files. Every column is declared nullable.

# Schema for ratings.dat
ratings_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("UserID", IntegerType()),
        ("MovieID", IntegerType()),
        ("Rating", FloatType()),
        ("Timestamp", IntegerType()),
    )
])

# Schema for users.dat
users_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("UserID", IntegerType()),
        ("Gender", StringType()),
        ("Age", IntegerType()),
        ("Occupation", IntegerType()),
        ("Zipcode", StringType()),
    )
])

# Schema for movies.dat
movies_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("MovieID", IntegerType()),
        ("Title", StringType()),
        ("Genres", StringType()),
    )
])



In [13]:

# Directory that holds the ml-1m ".dat" files.
DATA_DIR = "../data/ml-1m"


def read_dat(file_name, schema):
    """Load one "::"-delimited, header-less .dat file from DATA_DIR.

    Args:
        file_name: Name of the file inside DATA_DIR, e.g. "ratings.dat".
        schema: StructType applied to the columns (no schema inference).

    Returns:
        The DataFrame returned by ``spark.read.csv`` for that file.
    """
    return spark.read.csv(
        f"{DATA_DIR}/{file_name}",
        sep="::",
        schema=schema,
        header=False,
    )


# Read the three files through the same helper so the separator, header
# setting and base directory are defined in exactly one place.
ratings = read_dat("ratings.dat", ratings_schema)
users = read_dat("users.dat", users_schema)
movies = read_dat("movies.dat", movies_schema)

In [14]:

# Preview the first five ratings rows, then print the column types.
ratings.show(n=5)
ratings.printSchema()

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|   5.0|978300760|
|     1|    661|   3.0|978302109|
|     1|    914|   3.0|978301968|
|     1|   3408|   4.0|978300275|
|     1|   2355|   5.0|978824291|
+------+-------+------+---------+
only showing top 5 rows

root
 |-- UserID: integer (nullable = true)
 |-- MovieID: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: integer (nullable = true)



In [15]:

# Preview the first five users rows, then print the column types.
users.show(n=5)
users.printSchema()



+------+------+---+----------+-------+
|UserID|Gender|Age|Occupation|Zipcode|
+------+------+---+----------+-------+
|     1|     F|  1|        10|  48067|
|     2|     M| 56|        16|  70072|
|     3|     M| 25|        15|  55117|
|     4|     M| 45|         7|  02460|
|     5|     M| 25|        20|  55455|
+------+------+---+----------+-------+
only showing top 5 rows

root
 |-- UserID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- Zipcode: string (nullable = true)



In [16]:

# Preview the first five movies rows, then print the column types.
movies.show(n=5)
movies.printSchema()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

root
 |-- MovieID: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genres: string (nullable = true)



In [17]:
from pyspark.sql.functions import from_unixtime

# Derive a human-readable "Datetime" column from the epoch "Timestamp" column.
# NOTE(review): from_unixtime appears to yield a formatted string rendered in
# the Spark session time zone rather than a timestamp type — confirm that is
# what downstream cells expect.
datetime_col = from_unixtime(ratings["Timestamp"])
ratings = ratings.withColumn("Datetime", datetime_col)


In [19]:
from pyspark.sql.functions import split

# Turn the pipe-delimited "Genres" string into an array column.
# The pipe is escaped in the split pattern.
genres_array_col = split(movies["Genres"], r"\|")
movies = movies.withColumn("GenresArray", genres_array_col)


In [20]:
# Join ratings with users (on UserID), then with movies (on MovieID).
ratings_with_users = ratings.join(users, on="UserID", how="inner")
full_df = ratings_with_users.join(movies, on="MovieID", how="inner")
full_df.show(n=5)


+-------+------+------+---------+-------------------+------+---+----------+-------+--------------------+--------------------+--------------------+
|MovieID|UserID|Rating|Timestamp|           Datetime|Gender|Age|Occupation|Zipcode|               Title|              Genres|         GenresArray|
+-------+------+------+---------+-------------------+------+---+----------+-------+--------------------+--------------------+--------------------+
|   1193|     1|   5.0|978300760|2001-01-01 05:12:40|     F|  1|        10|  48067|One Flew Over the...|               Drama|             [Drama]|
|    661|     1|   3.0|978302109|2001-01-01 05:35:09|     F|  1|        10|  48067|James and the Gia...|Animation|Childre...|[Animation, Child...|
|    914|     1|   3.0|978301968|2001-01-01 05:32:48|     F|  1|        10|  48067| My Fair Lady (1964)|     Musical|Romance|  [Musical, Romance]|
|   3408|     1|   4.0|978300275|2001-01-01 05:04:35|     F|  1|        10|  48067|Erin Brockovich (...|              

In [21]:
# Example: drop ratings outside 1-5, i.e. keep only rows whose Rating lies in
# the inclusive range [1, 5].
full_df = full_df.filter(full_df["Rating"].between(1, 5))
