# Data Exploration 

In [4]:
import numpy as np
import pandas as pd 
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Connect to Local SQL Database

In [249]:
conn = sqlite3.connect("data/im.db")
imdb_people = pd.read_sql( 
    """
    SELECT ordering, movie_basics.start_year, movie_basics.primary_title, movie_ratings.averagerating, persons.primary_name, genres, characters, category, principals.job 
        FROM principals 
            JOIN persons
                ON principals.person_id == persons.person_id
            JOIN movie_basics 
                ON principals.movie_id == movie_basics.movie_id
            JOIN movie_ratings
                ON principals.movie_id == movie_ratings.movie_id
    """
, conn
)

#standardize column names
imdb_people = imdb_people.rename(columns={"primary_title": "title", "start_year": "year"})
imdb_people.head()



Unnamed: 0,ordering,year,title,averagerating,primary_name,genres,characters,category,job
0,10,2011,The Wicker Tree,3.9,Sean Barton,"Drama,Horror",,editor,
1,1,2011,The Wicker Tree,3.9,Brittania Nicol,"Drama,Horror","[""Beth Boothby""]",actress,
2,2,2011,The Wicker Tree,3.9,Henry Garrett,"Drama,Horror","[""Steve Thomson""]",actor,
3,3,2011,The Wicker Tree,3.9,Graham McTavish,"Drama,Horror","[""Sir Lachlan Morrison""]",actor,
4,4,2011,The Wicker Tree,3.9,Jacqueline Leonard,"Drama,Horror","[""Lady Delia Morrison""]",actress,


In [250]:
imdb_movies = pd.read_sql( 
    """
    SELECT movie_basics.start_year, movie_basics.primary_title, movie_ratings.averagerating, genres 
        FROM movie_basics 
            JOIN movie_ratings
                ON movie_basics.movie_id == movie_ratings.movie_id
    """
, conn
)
imdb_movies = imdb_movies.rename(columns={"primary_title": "title", "start_year": "year", "averagerating": "rating"})
imdb_movies.head()

Unnamed: 0,year,title,rating,genres
0,2013,Sunghursh,7.0,"Action,Crime,Drama"
1,2019,One Day Before the Rainy Season,7.2,"Biography,Drama"
2,2018,The Other Side of the Wind,6.9,Drama
3,2018,Sabse Bada Sukh,6.1,"Comedy,Drama"
4,2017,The Wandering Soap Opera,6.5,"Comedy,Drama,Fantasy"


# CSV and TSV Imports

In [251]:
#https://www.boxofficemojo.com/
bom = pd.read_csv('data/bom.movie_gross.csv')
#https://www.themoviedb.org/
tmdb = pd.read_csv('data/tmdb.movies.csv')
#https://www.the-numbers.com/
tn_movie_budgets = pd.read_csv('data/tn.movie_budgets.csv')
#https://www.rottentomatoes.com/ 
rt_movie_info = pd.read_csv('data/rt.movie_info.tsv', sep='\t')
rt_movie_review = pd.read_csv('data/rt.reviews.tsv', sep='\t', encoding='unicode_escape')

### CSV Head Checks + Basic Cleaning

We making sure each DataFrame is using a datetimes for dates and cleaning any 'unique' systems, like TMDB using a numeric code for genre. We also standardize column names. 

#### Cleaning Functions

In [252]:
def bar_to_comma(list):
    if type(list)!= str:
        return list
    if '|' in list:
        return list.replace('|', ',')
    else:
        return list

def csStringToList(cs_string):
    if type(cs_string) == str:
        return cs_string.split(',')
    else:
        return cs_string

In [253]:
imdb_movies = imdb_movies['genres'].map(csStringToList)
imdb_people = imdb_people['genres'].map(csStringToList)

In [233]:
bom = bom.rename(columns={"foreign_gross": "worldwide_gross"})
bom.head()

Unnamed: 0,title,studio,domestic_gross,worldwide_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [234]:
def genreIDtoGenre(id_list):
    #'comma separated' 
    cs = id_list[1:-1]
    cs.strip()
    ids = cs.split(', ')
    newlist = []
    for id in ids:
        if id == "12":
            newlist.append("Adventure")
        if id == "28":
            newlist.append("Action")
        if id == "16":
            newlist.append("Animation")
        if id == "35":
            newlist.append("Comedy")
        if id == "80":
            newlist.append("Crime")
        if id == "99":
            newlist.append("Documentary")
        if id == "18":
            newlist.append("Drama")
        if id == "10751":
            newlist.append("Family")
        if id == "14":
            newlist.append("Fantasy")
        if id == "36":
            newlist.append("History")
        if id == "27":
            newlist.append("Horror")
        if id == "10402":
            newlist.append("Music")
        if id == "9648":
            newlist.append("Mystery")
        if id == "10749":
            newlist.append("Romance")
        if id == "878":
            newlist.append("Science Fiction")
        if id == "10770":
            newlist.append("TV Movie")
        if id == "53":
            newlist.append("Thriller")
        if id == "10752":
            newlist.append("War")
        if id == "37":
            newlist.append("Western")
    return newlist

tmdb['genres'] = tmdb['genre_ids'].map(genreIDtoGenre)
tmdb['release_date'] = pd.to_datetime(tmdb['release_date'])
tmdb['year'] = tmdb['release_date'].dt.year

tmdb = tmdb.rename(columns={"vote_average": "rating"})

tmdb.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,rating,vote_count,genres,year
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,"[Adventure, Fantasy, Family]",2010
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,"[Fantasy, Adventure, Animation, Family]",2010
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,"[Adventure, Action, Science Fiction]",2010
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,"[Animation, Comedy, Family]",1995
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,"[Action, Science Fiction, Adventure]",2010


In [235]:
def money_to_int(money):
    #remove cash symbol
    money = money[1:]
    money = money.replace(',', '')
    money.strip()
    return money

def checkUSD(money):
    money = money[0]
    if money =='$':
        return True
    else:
        return False

tn_movie_budgets['release_date'] = pd.to_datetime(tn_movie_budgets['release_date'])
tn_movie_budgets['production_budget'] = tn_movie_budgets['production_budget'].map(money_to_int)
tn_movie_budgets['worldwide_gross'] = tn_movie_budgets['worldwide_gross'].map(money_to_int)
tn_movie_budgets['domestic_gross'] = tn_movie_budgets['domestic_gross'].map(money_to_int)

tn_movie_budgets = tn_movie_budgets.rename(columns={"movie": "title"})

tn_movie_budgets.head()


Unnamed: 0,id,release_date,title,production_budget,domestic_gross,worldwide_gross
0,1,2009-12-18,Avatar,425000000,760507625,2776345279
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


## Rotton Tomatos Combined 

In [236]:
rt_movie_info['theater_date'] = pd.to_datetime(rt_movie_info['theater_date'])
rt_movie_info['genre'] = rt_movie_info['genre'].map(bar_to_comma)
rt_movie_info['genre'] = rt_movie_info['genre'].map(csStringToList)
rt_movie_info['writer'] = rt_movie_info['writer'].map(bar_to_comma)

rt_movie_info = rt_movie_info .rename(columns={"rating": "ESRB rating", "genre": "genres", "theater_date": "release_date"})

rt_movie_info.head()

Unnamed: 0,id,synopsis,ESRB rating,genres,director,writer,release_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,"[Action and Adventure, Classics, Drama]",William Friedkin,Ernest Tidyman,1971-10-09,"Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,"[Drama, Science Fiction and Fantasy]",David Cronenberg,"David Cronenberg,Don DeLillo",2012-08-17,"Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,"[Drama, Musical and Performing Arts]",Allison Anders,Allison Anders,1996-09-13,"Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,"[Drama, Mystery and Suspense]",Barry Levinson,"Paul Attanasio,Michael Crichton",1994-12-09,"Aug 27, 1997",,,128 minutes,
4,7,,NR,"[Drama, Romance]",Rodney Bennett,Giles Cooper,NaT,,,,200 minutes,


In [237]:
def standardize_rotten_tomatos(rating):
    if type(rating) == str:
        rating = rating.strip()
        rating = rating.replace(" ", "")
        if '/' in rating:
            numeratior = float(rating.split('/')[0])
            denomenator = float(rating.split('/')[1])
            return numeratior / denomenator
    if type(rating) == float:
        return "float"
    else:
       return rating
rt_movie_review['original_rating'] =  rt_movie_review['rating']
rt_movie_review['rating'] = rt_movie_review['rating'].map(standardize_rotten_tomatos)

In [238]:
rt = pd.concat([rt_movie_review, rt_movie_info])
rt.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date,original_rating,synopsis,ESRB rating,genres,director,writer,release_date,dvd_date,currency,box_office,runtime,studio
0,3,A distinctly gallows take on contemporary fina...,0.6,fresh,PJ Nabarro,0.0,Patrick Nabarro,"November 10, 2018",3/5,,,,,,NaT,,,,,
1,3,It's an allegory in search of a meaning that n...,float,rotten,Annalee Newitz,0.0,io9.com,"May 23, 2018",,,,,,,NaT,,,,,
2,3,... life lived in a bubble in financial dealin...,float,fresh,Sean Axmaker,0.0,Stream on Demand,"January 4, 2018",,,,,,,NaT,,,,,
3,3,Continuing along a line introduced in last yea...,float,fresh,Daniel Kasman,0.0,MUBI,"November 16, 2017",,,,,,,NaT,,,,,
4,3,... a perverse twist on neorealism...,float,fresh,,0.0,Cinema Scope,"October 12, 2017",,,,,,,NaT,,,,,


# Feature Engineering 

The goals are to create two data frames. We want one to center around movie titles, studios, importaint people, production budgets, etc. The other will include genre information and public reception. 

## Movie Data

In [239]:
#Joining BOM and TMDB on Name and release year
tmdb.head()
bom.head()

Unnamed: 0,title,studio,domestic_gross,worldwide_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [242]:
bom_tmdb = pd.merge(bom, tmdb, on=["title", "year"])
bom_tmdb.head()

Unnamed: 0.1,title,studio,domestic_gross,worldwide_gross,year,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,rating,vote_count,genres
0,Toy Story 3,BV,415000000.0,652000000,2010,7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,7.7,8340,"[Animation, Family, Comedy]"
1,Inception,WB,292600000.0,535700000,2010,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,8.3,22186,"[Action, Science Fiction, Adventure]"
2,Shrek Forever After,P/DW,238700000.0,513900000,2010,38,"[35, 12, 14, 16, 10751]",10192,en,Shrek Forever After,15.041,2010-05-16,6.1,3843,"[Comedy, Adventure, Fantasy, Animation, Family]"
3,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000,2010,15,"[12, 14, 18, 10749]",24021,en,The Twilight Saga: Eclipse,20.34,2010-06-23,6.0,4909,"[Adventure, Fantasy, Drama, Romance]"
4,Iron Man 2,Par.,312400000.0,311500000,2010,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,6.8,12368,"[Adventure, Action, Science Fiction]"


In [243]:
#joining with imdb

movie_db = pd.merge(bom_tmdb, imdb_movies, on=["title", "year"])
movie_db.head()

KeyError: 'title'