# Cleaning commands and functions

In [7]:
# Unzipping csv.gz and tsv.gz files
!find . -name '*.csv.gz' -exec gzip -d {} \;
!find . -name '*.tsv.gz' -exec gzip -d {} \;

In [8]:
def indicator_str_parser(dataframe, parsed_column_str, list_of_strs):
    
    # If column full of strings has no string to be parsed, set value to 'none'
    dataframe[dataframe[parsed_column_str].isnull()] = "none"
    
    # Create indicator columns for columns with no string to be parsed
    dataframe[parsed_column_str + '_not_parsed_id'] = [1 if x == "none"
                                                       else 0 
                                                       for x in dataframe[parsed_column_str]]
    
    # starts list of created series to be used as arguments
    list_of_series = []
    
    # Loop over elements in list
    for genre in list_of_strs:
        
        # Make a new indicator column from the parsed column and the element to be searched for
        dataframe[parsed_column_str + '_' + genre + '_id'] = [1 if genre in x 
                                                            else 0 
                                                            for x in dataframe[parsed_column_str]]
        
        # Include new column in list to be fed into function
        list_of_series.append(dataframe[parsed_column_str + '_' + genre + '_id'])
        
    # Unpack list_of_series to be fed as arguments into zip function for unique tuples of parsed indicators
    dataframe[parsed_column_str + '_tuple'] = list(zip(*list_of_series))
    
    # return value counts showing how many strings in the column were parsed
    return dataframe[parsed_column_str + '_not_parsed_id'].value_counts()

# Misc Exploration

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
home = 'data/zippedData/'

df_movie_budgets = pd.read_csv('data/zippedData/tn.movie_budgets.csv')
df_movie_budgets['start_year'] = [(x[-4:]) for x in df_movie_budgets['release_date']]

df_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,start_year
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",2009
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",2019
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",2015
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747",2017


In [3]:
df_title_basics = pd.read_csv(home + 'imdb.title.basics.csv')
df_title_basics.head()
df_title_basics.rename(columns={'primary_title': 'movie'})
df_title_basics[df_title_basics['genres'].isnull()] = "none"
df_title_basics

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,none,none,none,none,none,none


In [9]:


indicator_str_parser(df_title_basics, 'genres', ['Comedy', 'Drama', 'Action'])
    

0    140736
1      5408
Name: genres_not_parsed_id, dtype: int64

In [5]:
df_movie_gross = pd.read_csv(home + 'bom.movie_gross.csv')
df_movie_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [6]:
df_budget_merge = pd.merge(df_movie_budgets, df_title_basics, how = 'inner', on = ('movie', 'start_year'))
df_budget_merge.head()

KeyError: 'movie'

In [None]:
def clean_dollars(dataframe, column_str):
    dataframe[column_str] = dataframe[column_str].str.replace(',',", '').str.replace('$', '').astype(int)
    return dataframe

In [None]:
df_budget_merge['comedy_id'] = [1 if 'comedy' in df_budget_merge['genre'] else 0]