# Questions for the MovieLens Parquet Dataset

## Setup Spark-SQL

In [None]:
# findspark.init() is called before the pyspark import; it is expected to make
# the local Spark installation importable (assumption - confirm for this setup).
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Speed Test")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Load the sparksql_magic IPython extension; the %%sparksql cells in this
# notebook presumably rely on it being loaded first.
%load_ext sparksql_magic

In [None]:
%%sparksql

-- List the databases visible to the current Spark session.
SHOW DATABASES

In [None]:
%%sparksql

-- Switch the current database to movielens_parquet_compressed.
USE movielens_parquet_compressed

## Playground

### How many movies do we have?

In [None]:
%%time
%%sparksql

-- Number of rows in the movies table.
SELECT COUNT(*)
FROM movies

### How many ratings do we have?

In [None]:
%%time
%%sparksql

-- Number of rows in the ratings table.
SELECT COUNT(*)
FROM ratings

### How many users do we have?

In [None]:
%%time
%%sparksql

-- Number of distinct users that appear in the ratings table.
SELECT COUNT(DISTINCT userid)
FROM ratings

### Which movie(s) has (have) the most genres?

In [None]:
%%time
%%sparksql

-- Movies ordered by the size of their genres array, largest first.
-- Only the first two rows are returned.
SELECT
    title,
    year,
    genres,
    SIZE(genres) AS num_gen
FROM movies
ORDER BY num_gen DESC
LIMIT 2

### Show all movies with terminator in the title

In [None]:
%%time
%%sparksql

-- Case-insensitive substring match: the title is lower-cased before LIKE.
SELECT
    movieid,
    title,
    year
FROM movies
WHERE LOWER(title) LIKE '%terminator%'

### How many movies do we have from 1984?

In [None]:
%%time
%%sparksql

-- Number of movies whose year column equals 1984.
SELECT COUNT(*)
FROM movies
WHERE year = 1984

### Show the distribution of movies per year (where year >= 2000), sorted by year

In [None]:
%%time
%%sparksql

-- Movie count per release year from 2000 onwards, oldest year first.
-- COUNT(title) counts only rows with a non-NULL title.
SELECT
    year,
    COUNT(title)
FROM movies
WHERE year >= 2000
GROUP BY year
ORDER BY year ASC

### Movies with the most ratings

In [None]:
%%time
%%sparksql

-- The ten rows of movie_rating with the highest num_rating.
SELECT
    title,
    year,
    num_rating,
    median_rating
FROM movie_rating
ORDER BY num_rating DESC
LIMIT 10

### Top ten best rated movies (by median) where we have at least 100 ratings for a movie

In [None]:
%%time
%%sparksql

-- Ten best-rated movies by median rating, restricted to movies with at least
-- 100 ratings. The threshold is inclusive (>=) so that a movie with exactly
-- 100 ratings is not dropped.
-- Ties on the median are broken by the number of ratings, most-rated first.
SELECT
    title,
    year,
    num_rating,
    median_rating
FROM movie_rating
WHERE num_rating >= 100
ORDER BY median_rating DESC, num_rating DESC
LIMIT 10

### Top ten worst rated movies (by median) where we have at least 100 ratings for a movie

In [None]:
%%time
%%sparksql

-- Ten worst-rated movies by median rating, restricted to movies with at least
-- 100 ratings. The threshold is inclusive (>=) so that a movie with exactly
-- 100 ratings is not dropped.
-- No separate IS NOT NULL check is needed: a NULL num_rating never satisfies
-- the >= comparison, so such rows are already filtered out.
-- Ties on the median are broken by the number of ratings, most-rated first.
SELECT
    title,
    year,
    num_rating,
    median_rating
FROM movie_rating
WHERE num_rating >= 100
ORDER BY median_rating ASC, num_rating DESC
LIMIT 10

### Which genres were used how often?

In [None]:
%%time
%%sparksql

-- Expand each movie's genres array into one row per genre, then count how
-- often every genre occurs, most frequent first.
SELECT
    genre,
    COUNT(genre) AS cnt
FROM movies
LATERAL VIEW EXPLODE(genres) exploded AS genre
GROUP BY genre
ORDER BY cnt DESC

## Naïve Movie Recommender

### Step 1 - find the `movieid` of two movies you like a lot
 
 --> 4011 == Snatch
 
 --> 1270 == Back to the Future


In [None]:
%%time
%%sparksql

-- Look up movies whose lower-cased title contains "snatch".
SELECT
    movieid,
    title,
    year
FROM movies
WHERE LOWER(title) LIKE '%snatch%'

In [None]:
%%time
%%sparksql

-- Look up movies whose lower-cased title contains "back to the".
SELECT
    movieid,
    title,
    year
FROM movies
WHERE LOWER(title) LIKE '%back to the%'


### Find people who liked these movies as well and save them into a temporary view
```
 * hive://hadoop@localhost:10000/movielens_parquet_compressed
Done.
CPU times: user 7.76 ms, sys: 3.01 ms, total: 10.8 ms
Wall time: 30.8 s
```

In [None]:
%%time
%%sparksql

-- Users who gave 5 stars to at least one of the two seed movies (4011, 1270).
-- OR REPLACE keeps this cell re-runnable: a plain CREATE TEMPORARY VIEW fails
-- when similar_people already exists in the current session.
CREATE OR REPLACE TEMPORARY VIEW similar_people AS
SELECT DISTINCT userid
FROM ratings
WHERE movieid IN (4011, 1270)
  AND rating = 5

### Basic checks for `similar_people`

```
 * hive://hadoop@localhost:10000/movielens_parquet_compressed
Done.
CPU times: user 4.24 ms, sys: 3.32 ms, total: 7.57 ms
Wall time: 126 ms
```

In [None]:
%%time
%%sparksql

-- Sanity check: show two rows of the similar_people view.
SELECT *
FROM similar_people
LIMIT 2

```
 * hive://hadoop@localhost:10000/movielens_parquet_compressed
Done.
CPU times: user 8.16 ms, sys: 110 µs, total: 8.27 ms
Wall time: 102 ms
```

In [None]:
%%time
%%sparksql

-- Sanity check: number of rows in the similar_people view.
SELECT COUNT(*)
FROM similar_people

### Join `similar_people` with `movies` and `ratings` and get movie recommendations

```
 * hive://hadoop@localhost:10000/movielens_parquet_compressed
Done.
CPU times: user 11.3 ms, sys: 5.8 ms, total: 17.1 ms
Wall time: 1min 2s
```

In [None]:
%%time
%%sparksql

-- Recommend the 20 movies that received the most 5-star ratings from the
-- users in similar_people.
--  * Rows are grouped by movieid as well as title, so two different movies
--    that happen to share a title are counted separately.
--  * The seed movies (4011, 1270) are excluded: they were used to select the
--    users, so they are not useful as recommendations.
--  * rating is qualified with the ratings alias to keep the reference
--    unambiguous across the joined tables.
SELECT
    m.title,
    COUNT(*) AS five_star_count
FROM ratings r
INNER JOIN similar_people sp ON r.userid = sp.userid
INNER JOIN movies m ON r.movieid = m.movieid
WHERE r.rating = 5
  AND r.movieid NOT IN (4011, 1270)
GROUP BY m.movieid, m.title
ORDER BY five_star_count DESC
LIMIT 20