# Data Exploration - this is updated 2:14PM

## 1. Importing data into DataFrames

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

In [2]:
# Every raw extract lives in the same folder; the location is kept in a single
# constant so it only has to be edited once if the data moves.
DATA_DIR = 'dsc-phase-1-project/zippedData/'

# Box Office Mojo
df_mojo_gross = pd.read_csv(DATA_DIR + 'bom.movie_gross.csv.gz')

# IMDb
df_imdb_name_basics = pd.read_csv(DATA_DIR + 'imdb.name.basics.csv.gz')
df_imdb_title = pd.read_csv(DATA_DIR + 'imdb.title.akas.csv.gz')
df_imdb_basics = pd.read_csv(DATA_DIR + 'imdb.title.basics.csv.gz')
df_imdb_title_crew = pd.read_csv(DATA_DIR + 'imdb.title.crew.csv.gz')
df_imdb_title_principals = pd.read_csv(DATA_DIR + 'imdb.title.principals.csv.gz')
df_imdb_title_ratings = pd.read_csv(DATA_DIR + 'imdb.title.ratings.csv.gz')

# Rotten Tomatoes: these two files are tab-separated and are decoded with the
# 'unicode_escape' codec.
df_rt_movie_info = pd.read_csv(DATA_DIR + 'rt.movie_info.tsv.gz', sep='\t', encoding='unicode_escape')
df_rt_reviews = pd.read_csv(DATA_DIR + 'rt.reviews.tsv.gz', sep='\t', encoding='unicode_escape')

# TMDB and The Numbers
df_tmdb_movies = pd.read_csv(DATA_DIR + 'tmdb.movies.csv.gz')
df_tn_movie_budgets = pd.read_csv(DATA_DIR + 'tn.movie_budgets.csv.gz')

Previewing the raw data:

In [9]:
# Collect every raw frame so the preview cells can loop over them.
list_of_data = [
    df_mojo_gross, df_imdb_name_basics, df_imdb_title, df_imdb_basics, df_imdb_title_crew,
    df_imdb_title_principals, df_imdb_title_ratings, df_rt_movie_info, df_rt_reviews,
    df_tmdb_movies, df_tn_movie_budgets,
]

# A bare `element.head()` inside a loop is evaluated and thrown away (only the
# last expression of a cell is echoed), so each preview is rendered explicitly.
for element in list_of_data:
    display(element.head())

## 2. Previewing DataFrames

In [None]:
# Render the first rows of every raw frame. display() is required here because
# the value of an expression inside a loop body is not echoed by the notebook.
for df in list_of_data:
    display(df.head())

In [None]:
# Show the (rows, columns) shape of each frame, in the same order as list_of_data.
for df in list_of_data:
    display(df.shape)

## 3. Visualizing DataFrames

In [None]:
# Print the column dtypes and non-null counts of the budgets table.
df_tn_movie_budgets.info()

Converting production_budget, domestic_gross and worldwide_gross from strings into floats:

In [None]:
def _currency_to_float(series):
    """Convert a Series of currency strings such as '$1,234,567' to floats.

    Parameters
    ----------
    series : pandas.Series of str
        Dollar amounts written with a leading '$' and ',' thousands separators.

    Returns
    -------
    pandas.Series of float
        The same amounts as plain numbers (still in dollars).
    """
    # regex=False makes both replacements literal. '$' is also the regex
    # end-of-string anchor, so relying on the default `regex` value behaves
    # differently across pandas versions.
    return (
        series.str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )


# Add a numeric '<name>_edited' twin next to each raw string column.
for column in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    df_tn_movie_budgets[column + '_edited'] = _currency_to_float(df_tn_movie_budgets[column])

df_tn_movie_budgets.head()

Rescaling the dollar amounts to millions of dollars ($M).

In [None]:
# Rescale the three cleaned money columns from dollars to millions of dollars.
# NOTE: the columns are overwritten in place, so running this cell a second
# time divides the values by a million again.
df_tn_movie_budgets[
    ['production_budget_edited', 'domestic_gross_edited', 'worldwide_gross_edited']
] = df_tn_movie_budgets[
    ['production_budget_edited', 'domestic_gross_edited', 'worldwide_gross_edited']
].div(1000000)

df_tn_movie_budgets.head()

In [None]:
# Production budget against worldwide gross for every movie (both in $M).
ax = sns.scatterplot(data=df_tn_movie_budgets, x='production_budget_edited', y='worldwide_gross_edited')

ax.set_title('Production Budget vs. Worldwide Gross ($M)')
ax.set_ylabel('Worldwide Gross ($M)')
ax.set_xlabel('Production Budget ($M)')

# Label the Avatar point. ax.text expects scalar coordinates, so single values
# are taken from the first matching row instead of passing boolean-filtered
# Series; the label is skipped when the title is not in the table.
avatar_rows = df_tn_movie_budgets.loc[df_tn_movie_budgets['movie'] == 'Avatar']
if not avatar_rows.empty:
    ax.text(avatar_rows['production_budget_edited'].iloc[0],
            avatar_rows['worldwide_gross_edited'].iloc[0],
            "Avatar", color='red')

Calculating an ROI figure (worldwide gross divided by production budget).

In [None]:
# 'ROI' here is the ratio of worldwide gross to production budget, i.e. how
# many times over the budget was earned back.
df_tn_movie_budgets['ROI'] = df_tn_movie_budgets['worldwide_gross_edited'].div(
    df_tn_movie_budgets['production_budget_edited']
)

df_tn_movie_budgets.head()

In [None]:
# Keep only the movies whose gross-to-budget ratio is at least 5. An explicit
# copy is taken because a 'Year' column is added to this subset further down;
# writing to a plain boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_tn_movie_budgets_ROIaboveX = df_tn_movie_budgets.loc[df_tn_movie_budgets['ROI'] >= 5].copy()

df_tn_movie_budgets_ROIaboveX.head()

Highlighting the movies with an ROI of at least 5 (ROI >= 5):

In [None]:
fig, ax = plt.subplots()

# First layer: every movie. Second layer, drawn on top: the ROI >= 5 subset.
ax.scatter(
    df_tn_movie_budgets['production_budget_edited'],
    df_tn_movie_budgets['worldwide_gross_edited'],
)
ax.scatter(
    df_tn_movie_budgets_ROIaboveX['production_budget_edited'],
    df_tn_movie_budgets_ROIaboveX['worldwide_gross_edited'],
    label='ROI Above 5',
)

ax.set(
    title='Production Budget vs. Worldwide Gross ($M)',
    ylabel='Worldwide Gross ($M)',
    xlabel='Production Budget ($M)',
)
ax.legend(loc='upper left');

In [None]:
# Rank the high-ROI subset from highest to lowest ROI (result is only displayed, not stored).
df_tn_movie_budgets_ROIaboveX.sort_values('ROI', ascending=False)

We are only interested in recent, relevant movies: those released in 2009 or later.

In [None]:
# Derive the release year as an integer column.
# - .assign returns a new frame, so nothing is written to a slice of
#   df_tn_movie_budgets (no SettingWithCopyWarning) and re-running is safe.
# - Parsing with pd.to_datetime works whether release_date is still text or has
#   already been converted to datetimes by the merge-preparation cell below.
# - The former `* 1` on the year string was a no-op and has been dropped.
df_tn_movie_budgets_ROIaboveX = df_tn_movie_budgets_ROIaboveX.assign(
    Year=lambda frame: pd.to_datetime(frame['release_date']).dt.year.astype(int)
)
df_tn_movie_budgets_ROIaboveX.head()

In [None]:
# Restrict the high-ROI subset to releases from 2009 onward (2009 included).
df_tn_movie_budgets_post2009 = df_tn_movie_budgets_ROIaboveX.loc[
    lambda frame: frame['Year'] >= 2009
]
df_tn_movie_budgets_post2009.head()

In [None]:
# (rows, columns) remaining after the ROI and release-year filters.
df_tn_movie_budgets_post2009.shape

In [None]:
# Preview the TMDB movies table before working on its genre_ids column.
df_tmdb_movies.head()

In order to translate the genre_ids into meaningful genre names, we must retrieve the genre key from TMDB:

In [None]:
# The TMDB key is a credential, so it is read from the environment instead of
# being stored in the notebook.
# NOTE(review): a key was previously committed in this cell; treat it as
# compromised and rotate it.
api_key = os.environ.get('TMDB_API_KEY')
if not api_key:
    raise RuntimeError('Set the TMDB_API_KEY environment variable to your TMDB API key.')

url = 'https://api.themoviedb.org/3/genre/movie/list'

# requests builds the query string from `params`. The timeout bounds the wait,
# and raise_for_status() stops the notebook on an HTTP error instead of
# carrying an error payload into the following cells.
tmdb_genre = requests.get(url, params={'api_key': api_key, 'language': 'en-US'}, timeout=10)
tmdb_genre.raise_for_status()

print(tmdb_genre)
print(type(tmdb_genre.content))
print(tmdb_genre.text)

In [None]:
# Decode the response body as JSON.
# NOTE: this rebinds tmdb_genre from the response object to the decoded data,
# so running this cell twice in a row fails (the decoded data has no .json()).
tmdb_genre = tmdb_genre.json()
tmdb_genre

In [None]:
# Build an id -> name lookup from the 'genres' list of the decoded payload.
tmdb_genre_dict = {genre['id']: genre['name'] for genre in tmdb_genre['genres']}

tmdb_genre_dict

In [None]:
# The ids split out of the genre_ids column are strings, so the lookup is
# re-keyed by str (values are coerced to str as well). Re-running is harmless.
tmdb_genre_dict = {
    str(genre_id): str(genre_name) for genre_id, genre_name in tmdb_genre_dict.items()
}
tmdb_genre_dict

In [None]:
#df_tmdb_movies['genre'] = df_tmdb_movies['genre'].map(tmdb_genre_dict)
#df_tmdb_movies.head()

In [None]:
# Check how the raw genre ids are stored. The derived 'genre' column does not
# exist yet at this point of a top-to-bottom run (it is created a few cells
# below), so the raw 'genre_ids' column is inspected here.
type(df_tmdb_movies['genre_ids'].iloc[0])

In [None]:
# Preview the TMDB movies table again.
df_tmdb_movies.head()

In [None]:
# Look at one raw genre_ids value (the entry with index label 0).
df_tmdb_movies['genre_ids'][0]

In [None]:
# Turn each genre_ids string into a list of id strings: drop the surrounding
# brackets, remove the spaces, then split on commas.
df_tmdb_movies['genre'] = (
    df_tmdb_movies['genre_ids']
    .str.strip('[]')
    .str.replace(' ', '')
    .str.split(',')
)

df_tmdb_movies.head()

In [None]:
# One column per genre slot; rows with fewer genres are padded by pandas.
df_tmdb_movies_genresplit = pd.DataFrame(df_tmdb_movies['genre'].values.tolist())

# Translate ids to names in every column via the str-keyed lookup. This
# replaces a loop over range(6) that left the last column untranslated and that
# called a helper defined further down the notebook (a NameError on a fresh
# run). Padding, empty strings and ids missing from the lookup become NaN.
df_tmdb_movies_genresplit = df_tmdb_movies_genresplit.apply(
    lambda column: column.map(tmdb_genre_dict)
)

# Name the columns genre1..genreN for however many slots the data produced.
df_tmdb_movies_genresplit.columns = [
    'genre{}'.format(position + 1) for position in range(df_tmdb_movies_genresplit.shape[1])
]

df_tmdb_movies_genresplit

In [None]:
# Place the genre-name columns to the right of the TMDB table (aligned on index).
df_combined = pd.concat(
    [df_tmdb_movies, df_tmdb_movies_genresplit],
    axis='columns',
)
df_combined

In [None]:
def genreID_coverter(string, genre_map=None):
    """Translate a single genre id into its genre name.

    Parameters
    ----------
    string : str or None
        Genre id as text (e.g. '12'). None, NaN and '' mean "no genre in this
        slot".
    genre_map : dict, optional
        Mapping of id -> name. Defaults to the notebook-level tmdb_genre_dict.

    Returns
    -------
    str or None
        The genre name, or None when the slot is empty.

    Raises
    ------
    KeyError
        If the id is not present in the mapping.
    """
    if genre_map is None:
        genre_map = tmdb_genre_dict
    # pd.isna covers None as well as NaN, which has no len().
    if pd.isna(string) or len(string) == 0:
        return None
    return genre_map[string]

In [None]:
# Scratch check: run genreID_coverter over one column of the split genre ids.
df_test = df_tmdb_movies[['genre']]
df_test2 = pd.DataFrame(df_test['genre'].to_list())

# Sample genre id kept from manual experimentation with genreID_coverter.
string = '12'

# Only the column labelled 1 is translated; the other columns keep their raw ids.
df_test2[1] = df_test2[1].apply(genreID_coverter)

df_test2

THIS IS A TEST CELL

In [None]:
# Toy mapping used to try the helper out without the real TMDB lookup.
dic = {1:'Action', 2:'Comedy', 3:'Adventure'}

def list_lookup_to_dict(lst, genre_map=None):
    """Translate a list of genre ids into the names found in a lookup.

    Parameters
    ----------
    lst : iterable
        Genre ids to translate.
    genre_map : dict, optional
        Mapping of id -> name. Defaults to the notebook-level tmdb_genre_dict.

    Returns
    -------
    list
        Names of the ids present in the mapping, in input order. Ids missing
        from the mapping are skipped.
    """
    if genre_map is None:
        genre_map = tmdb_genre_dict
    return [genre_map[element] for element in lst if element in genre_map]

# Self-contained demo: the sample ids are written inline because `lst` is only
# defined in a later cell, and the toy mapping is passed explicitly.
list_lookup_to_dict([1, 2, 3, 5, 7], genre_map=dic)


In [None]:
# Toy inputs for trying the lookup helper without the real TMDB data.
lst = [1,2,3,5,7]
dic = {1:'Action', 2:'Comedy', 3:'Adventure'}

def check_list_to_dict(lst, genre_map=None):
    """Translate a list of genre ids into the names found in a lookup.

    Same contract as list_lookup_to_dict.

    Parameters
    ----------
    lst : iterable
        Genre ids to translate.
    genre_map : dict, optional
        Mapping of id -> name. Defaults to the notebook-level tmdb_genre_dict.

    Returns
    -------
    list
        Names of the ids present in the mapping, in input order. Ids missing
        from the mapping are skipped.
    """
    if genre_map is None:
        genre_map = tmdb_genre_dict
    return [genre_map[i] for i in lst if i in genre_map]

# Check the helper against the toy mapping. The integer ids can never match the
# string keys of tmdb_genre_dict, so the mapping is passed explicitly, and the
# result is asserted instead of being computed and thrown away.
assert check_list_to_dict(lst, genre_map=dic) == ['Action', 'Comedy', 'Adventure']

df_genre_test = pd.DataFrame(lst, columns=['genre_code'])

# TODO: a per-row genre column needs a scalar lookup such as Series.map(dic);
# .apply(check_list_to_dict) would hand the helper one integer at a time, which
# it cannot iterate over.

df_genre_test.head()

In [None]:
# Parse release_date to datetimes in both tables so the join keys share a dtype.
df_tn_movie_budgets['release_date'] = pd.to_datetime(df_tn_movie_budgets['release_date'])
df_tmdb_movies['release_date'] = pd.to_datetime(df_tmdb_movies['release_date'])

# Left join: keep every TMDB row and attach budget data where both the title
# and the release date match.
tn_tmdb_merged_df = pd.merge(
    df_tmdb_movies,
    df_tn_movie_budgets,
    how='left',
    left_on=['original_title', 'release_date'],
    right_on=['movie', 'release_date'],
)
tn_tmdb_merged_df

In [None]:
# The genre columns are attached by index position, which is only correct while
# the merged table still has exactly one row per row of the genre table. A left
# merge adds rows whenever a key matches several budget rows, so the row counts
# are checked before concatenating.
assert len(tn_tmdb_merged_df) == len(df_tmdb_movies_genresplit), (
    'merge changed the row count; genre columns would be misaligned'
)

df_MAIN = pd.concat([tn_tmdb_merged_df, df_tmdb_movies_genresplit], axis=1)
df_MAIN

In [None]:
# Check the Python type of the entry with index label 0 in the 'genre' column.
type(df_tmdb_movies['genre'][0])

In [None]:
# Translate each row's list of genre ids into a list of genre names.
df_tmdb_movies['genre_actual'] = df_tmdb_movies['genre'].apply(list_lookup_to_dict)

# Show the stored column instead of running the same apply a second time.
df_tmdb_movies['genre_actual']

In [None]:
#for key, value in tmdb_genre_dict.items():
#    df_tmdb_movies[value] = 0

#df_tmdb_movies.head()

In [None]:
# Inner join on title alone (release date is not part of the key here); the
# result is only displayed, not stored.
pd.merge(
    df_tmdb_movies,
    df_tn_movie_budgets,
    how='inner',
    left_on='original_title',
    right_on='movie',
)