# Movies

## Download the dataset

You need `unzip` installed on your computer. On Debian/Ubuntu, install it with:

```bash
sudo apt install unzip
```

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [None]:
!unzip ml-latest-small.zip

## Inspecting the data we need

In [None]:
!head ml-latest-small/movies.csv

In [None]:
!wc -l ml-latest-small/movies.csv

In [None]:
!head ml-latest-small/ratings.csv

In [None]:
!wc -l ml-latest-small/ratings.csv

## Task 1 - Compute the Average Rating for Each Movie

### Implementation

In [None]:
%%writefile average.py

#!/usr/bin/python3
from mrjob.job import MRJob

class MyJob(MRJob):
    """Average the ratings of every movie that has at least five of them."""

    def mapper(self, key, line):
        """Turn one comma-separated input line into a ``(movieId, rating)`` pair.

        ``key`` is ignored. The line must have exactly four comma-separated
        fields; the second is used as the movie id and the third as the
        rating, while the first and last are discarded. The header row
        (whose second field is the literal ``movieId``) yields nothing.
        """
        _first, movieId, rating, _last = line.split(",")

        # Skip the CSV header instead of trying to parse it as a number.
        if movieId == "movieId":
            return

        yield movieId, float(rating)

    def reducer(self, movieId, ratings):
        """Emit ``(int(movieId), mean rating)`` for movies with >= 5 ratings.

        Movies with fewer than five ratings produce no output at all.
        """
        scores = list(ratings)  # the iterator must be read once for both sum and count
        count = len(scores)

        if count < 5:
            return

        yield int(movieId), sum(scores) / count

# Start the job only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    MyJob.run()

### Run Task 1

In [None]:
!python average.py ml-latest-small/ratings.csv > averages.txt

## Task 2 - Preprocessing of Movie Titles

### Implementation

In [None]:
%%writefile movie_cleaning.py

#!/usr/bin/python3
from mrjob.job import MRJob

class MyJob(MRJob):
    """Reduce each input line to a ``(movieId, title)`` pair with quotes removed."""

    def mapper(self, key, line):
        """Emit ``(int(movieId), title)`` for every non-header line.

        ``key`` is ignored. The text before the first comma is the movie id,
        the text after the last comma is dropped (presumably the genres
        column -- confirm against the data file), and everything in between
        is the title, which may itself contain commas. All double-quote
        characters are stripped from the title. The header row, whose first
        field is the literal ``movieId``, yields nothing.
        """
        movieId, _, remainder = line.partition(",")

        # Skip the CSV header instead of trying to parse it as a number.
        if movieId == "movieId":
            return

        # Cut at the *last* comma so commas inside the title are preserved.
        raw_title, _, _dropped = remainder.rpartition(",")
        yield int(movieId), raw_title.replace('"', "")
    
# Start the job only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    MyJob.run()

### Run Task 2

In [None]:
!python movie_cleaning.py ml-latest-small/movies.csv > movies.txt

## Task 3 - Joining `movies.txt` and `averages.txt`

In [None]:
%%writefile join.py

#!/usr/bin/python3
import os

from mrjob.compat import jobconf_from_env
from mrjob.job import MRJob
from mrjob.step import MRStep

class MyJob(MRJob):
    """Join average ratings with movie titles and emit ``(rating, title)`` pairs.

    Step 1 is a reduce-side join keyed on the movie id: every input line is
    tagged as a rating or a title depending on the file it was read from,
    and the reducer pairs them up. Step 2 passes the pairs through a second
    shuffle whose Hadoop key comparator is configured for numeric, reversed
    ordering, so that higher ratings come first when run on Hadoop.
    """

    def steps(self):
        """Return the two-step pipeline: join, then sort by rating."""
        # Only honoured by Hadoop: compare step-2 keys as numbers ("-n") in
        # reverse order ("-r").
        # NOTE(review): the ordering is global only if step 2 runs with a
        # single reducer -- confirm the cluster default.
        jobconf_step2 = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-nr',
        }
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(jobconf=jobconf_step2, mapper=self.mapper_sort, reducer=self.reducer_sort),
        ]

    def mapper(self, key, line):
        """Tag one tab-separated line as a rating or a title record.

        ``key`` is ignored. Lines read from a file whose name ends with
        ``averages.txt`` yield ``(movieId, {"rating": value})``; lines from
        any other file yield ``(movieId, {"title": value})``.

        Raises:
            KeyError: if the runner does not expose the input file name.
            ValueError: if the line contains no tab.
        """
        # jobconf_from_env resolves the variable under every Hadoop naming
        # variant instead of relying on one hard-coded environment name.
        file_name = jobconf_from_env('mapreduce.map.input.file')
        if file_name is None:
            raise KeyError('mapreduce.map.input.file')

        # Split once so a value that itself contains a tab stays in one piece.
        movieId, value = line.split("\t", 1)

        field = "rating" if file_name.endswith("averages.txt") else "title"
        yield movieId, {field: value}

    def reducer(self, movieId, rating_or_title):
        """Emit ``(rating rounded to 2 decimals, title)`` for joined movies.

        A movie is joined only when exactly two records arrive for its id
        and together they provide both a rating and a title. Anything else
        (a movie missing from one input, or two records of the same kind
        coming from an unexpected extra input file) is skipped.
        """
        records = list(rating_or_title)
        if len(records) != 2:
            return

        merged = {**records[0], **records[1]}
        # Two records of the same kind would leave one field missing; skip
        # them rather than fail the whole job with a KeyError.
        if "rating" not in merged or "title" not in merged:
            return

        # Strip the double quotes around the title (presumably the JSON
        # quoting written by the previous job -- confirm).
        title = merged["title"].replace('"', "")
        yield round(float(merged["rating"]), 2), title

    def mapper_sort(self, rating, title):
        """Forward the pair unchanged so the shuffle can order it by rating."""
        yield rating, title

    def reducer_sort(self, rating, titles):
        """Emit one ``(rating, title)`` pair per title sharing this rating."""
        for title in titles:
            yield rating, title
                    
    
# Start the job only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    MyJob.run()

### Testing locally (no sort)

In [None]:
!python join.py ./*.txt

### Testing with Hadoop (Sorted)

In [None]:
!python join.py -r hadoop ./*.txt > movies_rated.txt

## Inspect the Results

Have a look at the results in `movies_rated.txt`. Do you agree with the ranking?