In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

def loadMovieNames(path='u.item'):
    """Build a movie-ID -> title lookup table from a pipe-delimited item file.

    Each non-blank line is split on ``|``; the first field is converted to
    ``int`` and used as the key, the second field is stored as the value.
    Any further fields on the line are ignored.

    Args:
        path: Location of the item file. Defaults to ``u.item`` in the
            current working directory, which is what existing callers rely on.

    Returns:
        dict mapping the integer first field to the string second field.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line has fewer than two fields.
    """
    movieNames = {}
    with open(path, encoding="iso-8859-1") as f:
        for line in f:
            # A blank line (e.g. a trailing empty line at end of file) carries
            # no record; without this guard int('') would raise ValueError.
            if not line.strip():
                continue
            # Drop the line terminator first so a two-field line does not
            # leave a trailing newline inside the title.
            fields = line.rstrip('\n').split('|')
            movieNames[int(fields[0])] = fields[1]

    return movieNames


def parseInput(lines):
    """Turn one whitespace-delimited rating record into a ``Row``.

    The record is split on whitespace; the second token becomes the integer
    field ``movieId`` and the third token becomes the float field ``ratings``.
    All other tokens are ignored.
    """
    tokens = lines.split()
    movie_id = int(tokens[1])
    rating = float(tokens[2])
    return Row(movieId=movie_id, ratings=rating)


if __name__ == "__main__":
    # Obtain the Spark session for this job (reuses an existing one if present).
    spark = SparkSession.builder.appName('Data Frame').getOrCreate()

    # Driver-side lookup table: movie ID -> title.
    movieNames = loadMovieNames()

    # Raw rating records, one text line each, read from HDFS.
    lines = spark.sparkContext.textFile('hdfs://localhost:9000/user/deepjyotiroy079/ml-100k/u.data')

    # Each text line becomes Row(movieId=<int>, ratings=<float>).
    movieRatings = lines.map(parseInput)

    movieDataset = spark.createDataFrame(movieRatings)

    # The column is referenced as "movieId", exactly as parseInput names it,
    # so the job does not depend on Spark's case-insensitive column resolution.
    averageRatings = movieDataset.groupBy("movieId").avg("ratings")

    counts = movieDataset.groupBy("movieId").count()

    # Joined column order: movieId, count, avg(ratings).
    averagesAndCounts = counts.join(averageRatings, "movieId")

    # Keep only movies that received more than 20 ratings.
    popularAveragesAndCounts = averagesAndCounts.filter('count > 20')

    # orderBy sorts ascending by default, so these are the ten movies with the
    # LOWEST average rating among the popular ones.
    topTen = popularAveragesAndCounts.orderBy("avg(ratings)").take(10)

    for result in topTen:
        # result[0] is the movie ID; result[1] is the `count` column (number
        # of ratings received), not the average rating.
        print(f'Movie Name : {movieNames[result[0]]}, Ratings : {result[1]}')


Movie Name : Lawnmower Man 2: Beyond Cyberspace (1996), Ratings : 21
Movie Name : Free Willy 3: The Rescue (1997), Ratings : 27
Movie Name : Leave It to Beaver (1997), Ratings : 44
Movie Name : Bio-Dome (1996), Ratings : 31
Movie Name : Barb Wire (1996), Ratings : 30
Movie Name : Crow: City of Angels, The (1996), Ratings : 39
Movie Name : Mortal Kombat: Annihilation (1997), Ratings : 43
Movie Name : Showgirls (1995), Ratings : 23
Movie Name : Grease 2 (1982), Ratings : 24
Movie Name : Tales from the Hood (1995), Ratings : 27
