# Exploratory Data Cleaning - Griff

### In the cells below, IMDB, TMDB, and 'The Numbers' Datasets are imported into Pandas DataFrames for cleaning.  The following datasets are used:
- imdb name basics
- imdb title basics
- imdb title crew
- imdb title principals
- imdb title ratings
- tmdb movies
- tn movie budgets

#### pandas and NumPy are used for cleaning and analysis.  Matplotlib is used for visualization.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### imdb.name.basics is imported and the birth_year, death_year, primary_profession, and known_for_titles columns are dropped because they will not be a part of this analysis.

In [None]:
# Load the IMDb name table and discard the columns this analysis does not use
# (birth/death years, professions, and known-for titles).
imdb_name_basics = pd.read_csv('./data/imdb.name.basics.csv').drop(
    columns=['birth_year', 'death_year', 'primary_profession', 'known_for_titles']
)
imdb_name_basics.head()

#### imdb.title.basics is imported and the columns original_title and start_year are dropped.  The variables we will potentially be examining are runtime_minutes and genres.  Rows with missing values for genres will be dropped because they can't be replaced.  Missing values for runtime_minutes will be replaced with the median value. Median = 87 minutes, Mean = 86 minutes.

In [None]:
# Load the IMDb title table and drop the columns that are not analysed.
imdb_title_basics = pd.read_csv('./data/imdb.title.basics.csv')
imdb_title_basics = imdb_title_basics.drop(columns=['original_title', 'start_year'])

# Keep only rows that have a genres value. The explicit .copy() makes the
# result an independent frame so the column assignment below is unambiguous.
imdb_title_basics = imdb_title_basics.loc[imdb_title_basics['genres'].notna()].copy()

# Fill missing runtimes with the median of the remaining rows. The filled
# column is assigned back instead of calling fillna(inplace=True) on the
# attribute: that chained in-place form triggers chained-assignment warnings
# and does not modify the frame when pandas Copy-on-Write is enabled.
imdb_title_basics['runtime_minutes'] = imdb_title_basics['runtime_minutes'].fillna(
    imdb_title_basics['runtime_minutes'].median()
)

# Left disabled: turn each genres string into a list by splitting on commas.
# imdb_title_basics.genres = imdb_title_basics.genres.apply(lambda x: x.split(','))
imdb_title_basics.head()

#### From the imdb.title.crew file, the writers column is dropped and rows with missing values are removed.  The directors column will be explored.

In [None]:
# Load the IMDb crew table, remove the writers column, then discard every row
# that still contains a missing value.
imdb_title_crew = (
    pd.read_csv('./data/imdb.title.crew.csv')
    .drop(columns=['writers'])
    .dropna()
)
# Left disabled: turn each directors string into a list by splitting on commas.
# imdb_title_crew.directors = imdb_title_crew.directors.apply(lambda x: x.split(','))
imdb_title_crew.head()

#### The imdb.title.principals file has role data for movies.  The job and characters columns are almost entirely incomplete and are dropped.

In [None]:
# Load the IMDb principals table without the job and characters columns.
imdb_title_principals = pd.read_csv('./data/imdb.title.principals.csv').drop(
    columns=['job', 'characters']
)
imdb_title_principals.head()

#### imdb.title.ratings has sitewide rating data.  No missing-value cleaning is needed as all entries are valid; only titles with more than 5 votes are kept.

In [None]:
# Load the IMDb ratings table and keep only titles with more than 5 votes.
imdb_title_ratings = pd.read_csv('./data/imdb.title.ratings.csv').loc[
    lambda ratings: ratings['numvotes'] > 5
]
imdb_title_ratings.head()

In [None]:
# Load the TMDB movies table, use the 'Unnamed: 0' column as the index, and
# drop the language, release-date, and original-title columns.
tmdb_movies = pd.read_csv('./data/tmdb.movies.csv')
tmdb_movies = tmdb_movies.set_index('Unnamed: 0')
tmdb_movies = tmdb_movies.drop(
    columns=['original_language', 'release_date', 'original_title']
)

In [None]:
def clean_columns(series):
    """Convert a Series of currency strings such as '$1,000' to floats.

    Dollar signs and thousands separators are stripped with vectorised
    string operations, then the result is cast to float.  Missing values
    pass through as NaN instead of raising ``AttributeError``.

    Parameters
    ----------
    series : pandas.Series
        String-valued Series; entries may be missing.

    Returns
    -------
    pandas.Series
        Float Series with the same index as ``series``.

    Raises
    ------
    ValueError
        If a non-missing entry is not numeric once '$' and ',' are removed.
    """
    # regex=False: '$' must be treated literally, not as an end-of-string anchor.
    stripped = (
        series.str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    return stripped.astype(float)

In [None]:
# Load The Numbers budget table, drop the release_date and id columns, and
# convert the three money columns from currency strings to floats.
tn_movie_budgets = (
    pd.read_csv('./data/tn.movie_budgets.csv')
    .drop(columns=['release_date', 'id'])
    .assign(
        production_budget=lambda df: clean_columns(df['production_budget']),
        domestic_gross=lambda df: clean_columns(df['domestic_gross']),
        worldwide_gross=lambda df: clean_columns(df['worldwide_gross']),
    )
)
tn_movie_budgets.head()

In [None]:
# budget_gross_ratio is worldwide gross divided by production budget.
tn_movie_budgets['budget_gross_ratio'] = tn_movie_budgets['worldwide_gross'].div(
    tn_movie_budgets['production_budget']
)
# Keep only the movies that report a positive domestic gross.
tn_movie_budgets = tn_movie_budgets[tn_movie_budgets['domestic_gross'] > 0]
tn_movie_budgets.head()

# Exploratory Data Analysis

In [None]:
# NOTE(review): this preview is followed by a function definition in the same
# cell, so it is not the cell's last expression and a notebook will not
# display its result.
imdb_name_basics.head()
def get_name(name):
    """Return the rows of ``imdb_name_basics`` whose ``nconst`` equals ``name``.

    Despite the parameter's name, the value is compared against the
    ``nconst`` column.  The result is a DataFrame (empty if nothing matches).
    """
    matches_id = imdb_name_basics['nconst'] == name
    return imdb_name_basics.loc[matches_id]


In [None]:
# Preview the first rows of the title-basics table.
imdb_title_basics.head()

In [None]:
# Preview the first rows of the ratings table.
imdb_title_ratings.head()

In [None]:
# Preview the first rows of the crew table.
imdb_title_crew.head()

In [None]:
# Preview the first rows of the principals table.
imdb_title_principals.head()

In [None]:
# Preview the first rows of the TMDB movies table.
tmdb_movies.head()

In [None]:
# Join the budget table to the IMDb title table on the title string
# (the budget table's 'movie' column is renamed to match 'primary_title').
# NOTE(review): titles may not be unique on either side, in which case one
# budget row matches several IMDb rows -- confirm this is acceptable.
merge1 = tn_movie_budgets.rename(columns={'movie': 'primary_title'}).merge(
    imdb_title_basics, on='primary_title'
)

# Mean worldwide-gross / production-budget ratio per genres value, highest first.
dat = merge1.groupby('genres').budget_gross_ratio.mean().sort_values(ascending=False)
# Keep only the genres values whose mean ratio exceeds 25.
dat = dat.loc[dat.values > 25]

fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(dat.index, height=dat)
ax.set_xlabel('genres')
ax.set_ylabel('mean budget_gross_ratio')
ax.set_title('Genres with mean budget_gross_ratio above 25')
# The genres labels are long strings; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)

# Display the plotted values. The previous `dat.loc[dat.values > 20]` was a
# no-op because dat is already restricted to values above 25.
dat

In [None]:
# Preview the first rows of the title-basics table again.
imdb_title_basics.head()