In [1]:
import findspark
# Locate the local Spark installation and make pyspark importable in this kernel.
findspark.init()
# Last expression of the cell: displays the Spark home directory that was found.
findspark.find()

'C:\\work\\spark-2.4.8-bin-hadoop2.7'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as func
import codecs

In [3]:
# Reuse the active SparkSession if there is one; otherwise start one named 'movie'.
spark = (
    SparkSession.builder
    .appName('movie')
    .getOrCreate()
)

In [4]:
def loadMovieNames(path="ml-100k/u.item"):
    """Build a movie-ID -> title lookup from a pipe-delimited item file.

    Each non-blank line is split on '|'. The first field is parsed as the
    integer movie ID and the second field is stored as the title.

    Args:
        path: Location of the item file. Defaults to "ml-100k/u.item",
            relative to the current working directory.

    Returns:
        dict mapping int movie ID to its title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line has no second field.
    """
    movie = {}
    # ISO-8859-1 assigns a character to every byte value, so decoding never fails.
    with open(path, "r", encoding="ISO-8859-1", errors="ignore") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not line.strip():
                continue
            # Drop the line terminator so it cannot end up inside the last field.
            fields = line.rstrip("\r\n").split("|")
            movie[int(fields[0])] = fields[1]
    return movie

In [5]:
# Load the ID -> title dictionary once and wrap it in a Spark broadcast variable;
# lookUpName reads it back through nameDict.value.
nameDict=spark.sparkContext.broadcast(loadMovieNames())
# Last expression of the cell: displays the Broadcast handle.
nameDict

<pyspark.broadcast.Broadcast at 0x195fef48ac8>

In [6]:
# Column layout applied when reading the ratings file; every column is nullable.
schema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in (
        ("userID", IntegerType()),
        ("movieID", IntegerType()),
        ("rating", IntegerType()),
        ("time", LongType()),
    )
])

In [7]:
# Read the tab-separated ratings file with the explicit schema, then preview 3 rows.
data = spark.read.csv("ml-100k/u.data", schema=schema, sep="\t")
data.show(3)

+------+-------+------+---------+
|userID|movieID|rating|     time|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
+------+-------+------+---------+
only showing top 3 rows



In [28]:
# Sum of all rating values per movie, exposed under the column name "rating".
df = (
    data
    .groupBy("movieID")
    .agg(func.sum(func.col("rating")).alias("rating"))
)
df.show(3)

+-------+------+
|movieID|rating|
+-------+------+
|    496|   952|
|    471|   798|
|    463|   274|
+-------+------+
only showing top 3 rows



In [13]:
# Number of ratings per movie, most-rated first.
# Bound to its own name so it does not overwrite `df` (the per-movie rating
# totals), which the title-lookup cell further down still reads. With both
# results sharing the name `df`, a top-to-bottom run would attach titles to the
# counts instead of the totals.
dfCounts = data.groupBy("movieID").count().orderBy(func.desc("count"))
dfCounts.show(3)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
+-------+-----+
only showing top 3 rows



In [29]:
def lookUpName(movieID):
    """Return the title stored for ``movieID`` in the broadcast ``nameDict``.

    Args:
        movieID: Movie identifier used as the key into ``nameDict.value``.

    Returns:
        The stored title, or None when the ID has no entry.
    """
    # .get() rather than [] so an ID missing from the lookup yields None
    # instead of raising KeyError from inside the UDF.
    return nameDict.value.get(movieID)

In [32]:
# Wrap the Python lookup as a Spark column function that returns strings.
lookUpNameUDF = func.udf(lookUpName, StringType())
lookUpNameUDF

<function __main__.lookUpName(movieID)>

In [34]:
# Add a "Title" column by applying the lookup UDF to each row's movieID.
dfNames = df.withColumn("Title", lookUpNameUDF(func.col("movieID")))
dfNames.show(3)

+-------+------+--------------------+
|movieID|rating|               Title|
+-------+------+--------------------+
|    496|   952|It's a Wonderful ...|
|    471|   798|Courage Under Fir...|
|    463|   274|Secret of Roan In...|
+-------+------+--------------------+
only showing top 3 rows



10

[[Row(userID=308, movieID=1, rating=4, time=887736532),
  Row(userID=246, movieID=201, rating=5, time=884921594),
  Row(userID=234, movieID=1184, rating=2, time=892079237),
  Row(userID=291, movieID=118, rating=2, time=874833878),
  Row(userID=299, movieID=144, rating=4, time=877881320)],
 [Row(userID=10, movieID=16, rating=4, time=877888877),
  Row(userID=99, movieID=4, rating=5, time=886519097),
  Row(userID=22, movieID=377, rating=1, time=878887116),
  Row(userID=287, movieID=327, rating=5, time=875333916),
  Row(userID=97, movieID=194, rating=3, time=884238860)]]

+------+-------+------+---------+
|userID|movieID|rating|     time|
+------+-------+------+---------+
|   166|    346|     1|886397596|
|    22|    377|     1|878887116|
|   276|    796|     1|874791932|
|   181|   1081|     1|878962623|
|   244|     51|     2|880606923|
|   291|    118|     2|874833878|
|    62|    257|     2|879372434|
|   115|    265|     2|881171488|
|   194|    274|     2|879539794|
|    95|    546|     2|879196566|
|   102|    768|     2|883748450|
|   201|    979|     2|884114233|
|   234|   1184|     2|892079237|
|   224|     29|     3|888104457|
|   210|     40|     3|891035994|
|     6|     86|     3|883603013|
|    97|    194|     3|884238860|
|   196|    242|     3|881250949|
|    50|    246|     3|877052329|
|   186|    302|     3|891717742|
+------+-------+------+---------+
only showing top 20 rows



[[Row(userID=225, movieID=193, rating=4, time=879539727),
  Row(userID=234, movieID=1184, rating=2, time=892079237),
  Row(userID=291, movieID=1042, rating=4, time=874834944)],
 [Row(userID=10, movieID=16, rating=4, time=877888877),
  Row(userID=194, movieID=274, rating=2, time=879539794),
  Row(userID=287, movieID=327, rating=5, time=875333916),
  Row(userID=201, movieID=979, rating=2, time=884114233)]]

+------+-------+------+---------+
|userID|movieID|rating|     time|
+------+-------+------+---------+
|    22|    377|     1|878887116|
|   166|    346|     1|886397596|
|   181|   1081|     1|878962623|
|   276|    796|     1|874791932|
|    62|    257|     2|879372434|
|    95|    546|     2|879196566|
|   102|    768|     2|883748450|
|   115|    265|     2|881171488|
|   194|    274|     2|879539794|
|   201|    979|     2|884114233|
|   234|   1184|     2|892079237|
|   244|     51|     2|880606923|
|   291|    118|     2|874833878|
|     6|     86|     3|883603013|
|    50|    246|     3|877052329|
|    97|    194|     3|884238860|
|   178|    332|     3|882823437|
|   186|    302|     3|891717742|
|   196|    242|     3|881250949|
|   210|     40|     3|891035994|
+------+-------+------+---------+
only showing top 20 rows

