In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import collections
import os
# Ask PySpark to add the PostgreSQL JDBC driver jar when it launches its JVM.
# The variable is set here, before any SparkSession is created further down.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--jars file:///home/jovyan/jdbc/postgresql-42.2.8.jar pyspark-shell'
)

In [None]:
# Start from an empty Spark configuration object (no settings overridden here).
conf = SparkConf()

# Build the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Lista-Q-03")
    .getOrCreate()
)

# Keep a handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Read the tab-separated ratings file into a DataFrame, declaring the schema
# explicitly instead of letting Spark infer it.
dfReviews = spark.read.csv(
    path='/home/jovyan/work/u.data',
    sep='\t',
    schema='userId int, movieId int, rating int, timestamp bigint',
)

In [3]:
# Print the schema that was declared for the reviews DataFrame.
dfReviews.printSchema()
# Count the rows in the DataFrame; as the last expression, the count is the cell output.
dfReviews.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: long (nullable = true)



100000

In [4]:
# Register the DataFrame as the temporary view "reviews" so it can be queried with spark.sql.
dfReviews.createOrReplaceTempView("reviews")

In [22]:
# Question: what is the average rating of our top-10 movies?
# "Top" is ranked here by the total of all ratings a movie received
# (order by sumRatings desc), and the average is sumRatings / amountRatings.
dfSqlReviews = spark.sql(
    "select movieId, "
    "sum(rating) as sumRatings, "
    "count(movieId) as amountRatings, "
    "sum(rating) / count(movieId) as average "
    "from reviews "
    "group by movieId "
    "order by sumRatings desc "
    "limit 10"
)

In [23]:
# Fetch the (at most) ten rows of the top-10 query and display them.
dfSqlReviews.take(10)

[Row(movieId=50, sumRatings=2541, amountRatings=583, average=4.3584905660377355),
 Row(movieId=100, sumRatings=2111, amountRatings=508, average=4.155511811023622),
 Row(movieId=181, sumRatings=2032, amountRatings=507, average=4.007889546351085),
 Row(movieId=258, sumRatings=1936, amountRatings=509, average=3.8035363457760316),
 Row(movieId=174, sumRatings=1786, amountRatings=420, average=4.252380952380952),
 Row(movieId=127, sumRatings=1769, amountRatings=413, average=4.283292978208232),
 Row(movieId=286, sumRatings=1759, amountRatings=481, average=3.656964656964657),
 Row(movieId=1, sumRatings=1753, amountRatings=452, average=3.8783185840707963),
 Row(movieId=98, sumRatings=1673, amountRatings=390, average=4.28974358974359),
 Row(movieId=288, sumRatings=1645, amountRatings=478, average=3.4414225941422596)]

In [24]:
# Question: what is the average rating given by male reviewers, and by female reviewers?
# First step: load the pipe-separated user file with an explicit schema.
dfUsers = spark.read.csv(
    path='/home/jovyan/work/u.user',
    sep='|',
    schema='userId int, age int, gender string, occupation string, zipCode string',
)

In [25]:
# Print the schema declared for the users DataFrame.
dfUsers.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zipCode: string (nullable = true)



In [26]:
# Register the users DataFrame as the temporary view "users" for SQL queries.
dfUsers.createOrReplaceTempView("users")

In [29]:
# Total, number and average of ratings per reviewer gender.
# count(*) replaces the original count(r.*): Spark 3.2+ rejects count(tbl.*)
# unless a legacy flag is enabled, and older versions count only the rows in
# which every column of r is non-null, which is not the same as the number of
# joined review rows.
dfSqlUsersReviews = spark.sql("""
    select u.gender,
           sum(r.rating) as sumRatings,
           count(*) as amountRatings,
           sum(r.rating) / count(*) as average
    from reviews r
    inner join users u on u.userId = r.userId
    group by u.gender
    order by gender desc
""")

In [31]:
# Display up to two rows of the per-gender aggregate (the query groups by gender).
dfSqlUsersReviews.take(2)

[Row(gender='M', sumRatings=262085, amountRatings=74260, average=3.5292889846485322),
 Row(gender='F', sumRatings=90901, amountRatings=25740, average=3.5315073815073816)]

In [35]:
# Question: which movie can be considered the "best one"? And the worst one?
# First step: load the pipe-separated movie catalogue with an explicit schema
# (five descriptive columns followed by one 0/1 integer flag per genre).
#
# The encoding is given explicitly: the MovieLens 100K u.item file is
# ISO-8859-1, and reading it with Spark's default UTF-8 corrupts titles that
# contain accented characters.
# NOTE(review): assumes this is the unmodified MovieLens file - confirm it was
# not re-encoded to UTF-8 locally.
dfMovies = spark.read.csv(
    path='/home/jovyan/work/u.item',
    sep='|',
    encoding='ISO-8859-1',
    schema=(
        'movieId int, title string, releasedate string, videorelease string, imdbUrl string, '
        'unknown int, action int, adventure int, animation int, children int, comedy int, '
        'crime int, documentary int, drama int, fantasy int, noir int, horror int, musical int, '
        'mystery int, romance int, scifi int, thriller int, war int, western int'
    ),
)

In [36]:
# Register the movies DataFrame as the temporary view "movies" for SQL queries.
dfMovies.createOrReplaceTempView("movies")

In [37]:
# Per-movie rating statistics, restricted by the inner join to movies that
# exist in the catalogue.
# count(*) replaces the original count(r.*): Spark 3.2+ rejects count(tbl.*)
# unless a legacy flag is enabled, and older versions count only the rows in
# which every column of r is non-null.
dfSqlMoviesRatings = spark.sql("""
    select r.movieId,
           sum(r.rating) as sumRatings,
           count(*) as amountRatings,
           sum(r.rating) / count(*) as average
    from reviews r
    inner join movies m on m.movieId = r.movieId
    group by r.movieId
""")

In [39]:
# Expose the per-movie statistics as the temporary view "movies_ratings" for the queries below.
dfSqlMoviesRatings.createOrReplaceTempView("movies_ratings")

In [64]:
# Rank movies by average rating, best first. Only movies with more than 300
# ratings are kept (the having clause); ties on the average are broken by the
# larger number of ratings.
dfBestMovie = spark.sql(
    "select mr.movieId, m.title, "
    "max(mr.average) as maxAverage, "
    "sum(mr.amountRatings) as totalRatings "
    "from movies_ratings mr "
    "inner join movies m on m.movieId = mr.movieId "
    "group by mr.movieId, m.title "
    "having totalRatings > 300 "
    "order by maxAverage desc, totalRatings desc"
)

In [66]:
# The first row of the descending ranking is the "best" movie.
dfBestMovie.take(1)

[Row(movieId=50, title='Star Wars (1977)', maxAverage=4.3584905660377355, totalRatings=583)]

In [69]:
# Rank movies by average rating, worst first. Only movies with more than 300
# ratings are kept (the having clause); ties on the average are broken by the
# smaller number of ratings.
dfWorstMovie = spark.sql(
    "select mr.movieId, m.title, "
    "min(mr.average) as minAverage, "
    "sum(mr.amountRatings) as totalRatings "
    "from movies_ratings mr "
    "inner join movies m on m.movieId = mr.movieId "
    "group by mr.movieId, m.title "
    "having totalRatings > 300 "
    "order by minAverage asc, totalRatings asc"
)

In [70]:
# The first row of the ascending ranking is the "worst" movie.
dfWorstMovie.take(1)

[Row(movieId=748, title='Saint, The (1997)', minAverage=3.1234177215189876, totalRatings=316)]

In [78]:
# Question: which user has submitted the most reviews, and how old are they?
# The query returns the user id and age together with the review count,
# ordered so the most active user comes first.
dfUsersCountReviews = spark.sql(
    "select r.userId, u.age, "
    "count(r.userId) as amount "
    "from reviews r "
    "inner join users u on u.userId = r.userId "
    "group by r.userId, u.age "
    "order by amount desc"
)

In [79]:
# The first row is the user with the highest review count.
dfUsersCountReviews.take(1)

[Row(userId=405, age=22, amount=737)]

In [81]:
# Review count per user, fewest first. The left join keeps users that have no
# reviews at all, and count(r.userId) yields 0 for them because the joined
# review columns are null in that case.
dfUsersZeroReviews = spark.sql(
    "select u.userId, u.age, "
    "count(r.userId) as amount "
    "from users u "
    "left join reviews r on r.userId = u.userId "
    "group by u.userId, u.age "
    "order by amount asc"
)

In [82]:
# The first row is the user with the fewest reviews; an amount of 0 would mean a user with none.
dfUsersZeroReviews.take(1)

[Row(userId=572, age=51, amount=20)]