# Differences between Rotten Tomatoes' Tomatometer and Audience score - filtered by genre.
Does the difference between the critic score and the audience score vary by genre? The project's idea is to mine the famous [Rotten Tomatoes](https://www.rottentomatoes.com) for the scores of movies and then look up their genres on IMDb.

Once I have the data, I will store it in a MySQL database and query it from there.

In [14]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from sqlalchemy import *
import json
import requests

In [2]:
# Configure Chrome to run headless so that no browser window is displayed
chrome_options = Options()
chrome_options.add_argument("--headless")

# Start the browser and open the Rotten Tomatoes DVD/streaming browse page.
# `options=` is used because the `chrome_options=` keyword is deprecated.
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.rottentomatoes.com/browse/dvd-streaming-all/')
# Parse the page as it was first loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [3]:
# Load the database connection string and the OMDb API key from local JSON files.
# The context managers make sure both file handles are closed after reading.
with open('dbcon.json') as db_config_file:
    DB_CON = json.load(db_config_file)['db']
with open('apikey.json') as api_key_file:
    OMBDB_KEY = json.load(api_key_file)['key']

## Scraping Rotten Tomatoes

In [4]:
# Scraping Functions
def get_movie_count(webdriver):
    """Read the two movie counters shown on the current page.

    Parses ``webdriver.page_source`` and takes the text of the third
    ``<span>`` inside the element whose id is ``count-link``. The second
    whitespace-separated token and the last token of that text are returned
    as a pair of ints (presumably "movies currently listed" and "total
    movies" - verify against the live page markup).
    """
    page = BeautifulSoup(webdriver.page_source, 'html.parser')
    counter_tokens = page.find(attrs={"id": "count-link"}).find_all('span')[2].text.split()
    shown, total = counter_tokens[1], counter_tokens[-1]

    return int(shown), int(total)

def click_on_show_more(webdriver):
    """Click the element with CSS class ``mb-load-btn`` on the current page.

    Uses ``find_element(By.CSS_SELECTOR, ...)`` because the
    ``find_element_by_*`` helper methods were removed in Selenium 4.3.

    Returns nothing. Any exception Selenium raises when the element cannot
    be found or clicked propagates to the caller.
    """
    # Imported locally so this function does not depend on a file-level import.
    from selenium.webdriver.common.by import By

    webdriver.find_element(By.CSS_SELECTOR, '.mb-load-btn').click()

def scrape_all_movies(webdriver):
    """Expand the movie listing and return the resulting page source.

    Re-reads the page counters and clicks the "show more" button for as long
    as the first counter is below the second one, then returns
    ``webdriver.page_source`` for further work.
    """
    while True:
        shown, total = get_movie_count(webdriver)
        if shown >= total:
            break
        click_on_show_more(webdriver)

    return webdriver.page_source

In [5]:
# Scraping the actual page - can take a lot of time... get a coffee.
# Keeps the page source returned by scrape_all_movies for parsing below.
html = scrape_all_movies(driver)

In [6]:
def get_all_movies(html):
    """Extract title and scores for every movie in *html* that has two scores.

    Each element with class ``movie_info`` is inspected; entries that do not
    contain exactly two ``tMeterScore`` elements are skipped. For the rest a
    dict with the keys ``title``, ``critic_score`` (first score) and
    ``viewer_score`` (second score) is built, with the scores converted to
    ``int`` after stripping ``%`` characters from both ends.

    Returns the list of those dicts in page order.
    """
    document = BeautifulSoup(html, 'html.parser')

    movies = []
    for card in document.find_all(attrs={'class': 'movie_info'}):
        scores = card.find_all(attrs={'class': 'tMeterScore'})
        if len(scores) != 2:
            # keep only movies that have both the critic and the viewer score
            continue
        movies.append({
            'title': card.find(attrs={'class': 'movieTitle'}).text,
            'critic_score': int(scores[0].text.strip('%')),
            'viewer_score': int(scores[1].text.strip('%')),
        })

    return movies

In [7]:
# Parse the scraped page source into a list of movie dicts
rt_movies = get_all_movies(html)

## Scraping IMDB
Here I cheated a bit. Instead of scraping IMDb, I opted to use the excellent unofficial IMDb database [omdbapi.com](http://www.omdbapi.com) and its API to look up the genre of each movie scraped from Rotten Tomatoes.

In [11]:
# Spot-check the title of one scraped movie (the cell output follows)
rt_movies[5]['title']

'A Kid Like Jake'

In [74]:
# Functions for getting the movies
def get_movie(title):
    """Look up *title* on the OMDb API and return the decoded JSON response.

    The API key and the title are passed as query parameters so that
    ``requests`` takes care of URL encoding.

    If the request fails (network error, timeout, or a status code other
    than 200) a placeholder record is returned instead. The placeholder
    keeps the requested title, uses ``'N/A'`` for genre and rating, and
    sets ``'Response'`` to ``'True'`` so that callers still keep the title.

    Raises:
        ValueError: if a 200 response does not contain valid JSON
            (propagated from ``Response.json()``).
    """
    placeholder = {
        'Title': title,
        'Genre': 'N/A',
        'imdbRating': 'N/A',
        'Response': 'True',
    }

    try:
        r = requests.get(
            'http://www.omdbapi.com/',
            params={'apikey': OMBDB_KEY, 't': title},
            # without a timeout a stalled connection would block forever
            timeout=10,
        )
    except requests.RequestException:
        return placeholder

    if r.status_code != 200:
        return placeholder
    return r.json()

def try_convert_score(value):
    """Convert *value* to ``float``, or return ``'N/A'`` if it cannot be converted.

    Args:
        value: the raw rating, typically a string such as ``'7.5'`` or
            ``'N/A'``.

    Returns:
        The rating as a ``float``, or the string ``'N/A'`` when *value* is
        not numeric (including ``None`` and other non-string, non-numeric
        inputs).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or an unsupported type
        return 'N/A'

def get_imdb_movies(movies):
    """Fetch title, genre and rating for each entry of *movies* via get_movie.

    Every entry's ``'title'`` is looked up. Responses whose ``'Response'``
    field is not the string ``'True'`` are dropped. For the others a dict
    with the keys ``title``, ``genre`` and ``imdbRating`` is collected, with
    the rating passed through ``try_convert_score``.

    Returns the collected dicts as a list, in input order.
    """
    found = []
    for entry in movies:
        response = get_movie(entry['title'])
        if response['Response'] != 'True':
            continue
        found.append({
            'title': response['Title'],
            'genre': response['Genre'],
            'imdbRating': try_convert_score(response['imdbRating']),
        })

    return found

In [None]:
# Look up every scraped Rotten Tomatoes title (one get_movie call per movie)
imdb_movies = get_imdb_movies(rt_movies)

## Uploading to MySQL Database

In [41]:
def db_tables():
    """Describe the two destination tables and return them.

    Both tables are registered on one fresh ``MetaData`` object. String
    columns are given an explicit length because MySQL cannot create a
    VARCHAR column without one.

    Returns:
        A ``(rt_movies, imdb_movies)`` tuple of SQLAlchemy ``Table`` objects.
    """
    # container that holds the table definitions (no engine is created here)
    metadata = MetaData()

    rt_table = Table('rt_movies', metadata,
                     Column('id', Integer, primary_key=True),
                     Column('title', String(255)),
                     Column('critic_score', Integer),
                     Column('viewer_score', Integer))

    imdb_table = Table('imdb_movies', metadata,
                       Column('id', Integer, primary_key=True),
                       Column('title', String(255)),
                       Column('genre', String(255)),
                       Column('imdbRating', Float))

    return rt_table, imdb_table

def db_upload(data, engine, table):
    """Insert one movie record into *table*.

    Args:
        data: dict describing the movie. For the ``rt_movies`` table it must
            contain ``title``, ``critic_score`` and ``viewer_score``; for any
            other table it must contain ``title`` and ``genre`` and may
            contain ``imdbRating``.
        engine: SQLAlchemy engine used to run the insert.
        table: SQLAlchemy ``Table`` to insert into.

    The insert runs inside ``engine.begin()``, which commits on success,
    rolls back on error and always releases the connection.
    """
    if table.name == 'rt_movies':
        row = {
            'title': data['title'],
            'critic_score': data['critic_score'],
            'viewer_score': data['viewer_score'],
        }
    else:
        row = {
            'title': data['title'],
            'genre': data['genre'],
        }
        if 'imdbRating' in table.c:
            rating = data.get('imdbRating')
            # a non-numeric placeholder such as 'N/A' is stored as NULL,
            # since the column is numeric
            row['imdbRating'] = rating if isinstance(rating, (int, float)) else None

    with engine.begin() as connection:
        connection.execute(table.insert().values(**row))

In [44]:
# Build the SQLAlchemy Table objects for the two destination tables
rt_movies_table, imdb_movies_table = db_tables()

In [59]:
# Create the db engine from the connection string loaded earlier
dbengine = create_engine(DB_CON)
# UPLOAD the data, one insert per scraped movie
# NOTE(review): only the Rotten Tomatoes rows are uploaded in this cell;
# imdb_movies is not written here.
for rt_movie in rt_movies:
    db_upload(rt_movie, dbengine, rt_movies_table)