In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F

In [2]:
def loadMovieNames(path="path", encoding=None):
    """Build a dictionary mapping movie ID to movie name from a delimited file.

    Each line is split on '|'; the first field is converted to an int and used
    as the key, and the second field is stored as the value.

    Args:
        path: Location of the file to read. Defaults to the placeholder
            "path" used by the original notebook.
        encoding: Text encoding passed to open(). None keeps the platform
            default. NOTE(review): if the data file is not in the platform
            default encoding, pass the right one here — confirm with the data.

    Returns:
        dict mapping int movie ID -> str movie name.

    Raises:
        FileNotFoundError: if `path` does not exist.
        ValueError: if the first field of a line is not an integer.
    """
    movieNames = {}
    with open(path, encoding=encoding) as f:
        for line in f:
            # Drop the line terminator so a name in the last column does not
            # keep a trailing newline.
            fields = line.rstrip('\r\n').split('|')
            # Skip blank or truncated lines instead of raising IndexError.
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [3]:
def parseInput(line):
    """Convert one whitespace-delimited text line into a Row(movieID, rating).

    The line is split on whitespace; the second field becomes an int
    `movieID` and the third field becomes a float `rating`. Other fields are
    ignored (presumably field 0 is a user ID — confirm against the data file).

    Args:
        line: A single line of raw input text.

    Returns:
        A Row with fields `movieID` (int) and `rating` (float).

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if the fields cannot be converted to int / float.
    """
    fields = line.split()
    # The original referenced the undefined name `field` here, which raised
    # NameError on every call.
    return Row(movieID=int(fields[1]), rating=float(fields[2]))

In [4]:
if __name__ == "__main__":
    # Create (or reuse) a SparkSession.
    spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

    # Make sure the session is stopped even if a step below raises
    # (e.g. a missing input file).
    try:
        # Load the movie ID -> name dictionary.
        movieNames = loadMovieNames()

        # Read the raw data as an RDD of text lines.
        lines = spark.sparkContext.textFile('path')

        # Convert into an RDD of Row objects with (movieID, rating).
        movies = lines.map(parseInput)

        # Convert that to a DataFrame.
        movieDataset = spark.createDataFrame(movies)

        # Compute the average rating for each movieID.
        averageRatings = movieDataset.groupBy('movieID').avg('rating')

        # Compute the number of ratings for each movieID.
        counts = movieDataset.groupBy('movieID').count()

        # Join the two together; columns are (movieID, count, avg(rating)).
        averagesAndCounts = counts.join(averageRatings, 'movieID')

        # Take the first 10 rows ordered by average rating.
        # NOTE(review): orderBy sorts ascending by default, so these are the
        # 10 lowest averages — confirm that this is the intended "top ten".
        topTen = averagesAndCounts.orderBy('avg(rating)').take(10)

        # Print movie name, rating count, and average rating.
        for movie in topTen:
            print(movieNames[movie[0]], movie[1], movie[2])
    finally:
        # Stop the session.
        spark.stop()

FileNotFoundError: [Errno 2] No such file or directory: 'path'