In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
# Show every float in DataFrame output with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

### Set Global Ranges and Variables

In [2]:
# Decade to process. Exactly one `years` line should be active; the range end is
# exclusive, so range(1988,1998) covers 1988 through 1997.
#years = range(2008,2018)            #Comment in/out year ranges
years = range(1988,1998)

# The budget files are read as budget_1.csv ... budget_<num_budget_files>.csv
num_budget_files = 6                #total number of budget files to read

### Set Definitions

In [3]:
#clean time from movie data
def clean_time(row):
    """Parse a long-form date string such as 'December 1, 1988'.

    Parameters
    ----------
    row : str
        Date text in '%B %d, %Y' format.

    Returns
    -------
    datetime or None
        The parsed datetime, or None when `row` is not a string (for example
        a missing value) or does not match the expected format.
    """
    try:
        return datetime.strptime(row, '%B %d, %Y')
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: text in a different format.
        return None

def clean_time_2(row):
    """Parse an ISO-style date string such as '1988-12-16'.

    Parameters
    ----------
    row : str
        Date text in '%Y-%m-%d' format.

    Returns
    -------
    datetime or None
        The parsed datetime, or None when `row` is not a string (for example
        a missing value) or does not match the expected format.
    """
    try:
        return datetime.strptime(row, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: text in a different format.
        return None

#keep only the leading characters of a value (used to build merge keys)
def trunc(row, characters):
    """Return the first `characters` characters of `row` rendered as text.

    `row` is converted with str() first, so numbers work too
    (for example 158348367.0 with characters=3 gives '158'); a float NaN
    becomes the text 'nan'.

    Parameters
    ----------
    row : any
        Value to truncate.
    characters : int
        Number of leading characters to keep.

    Returns
    -------
    str
        The truncated text. If conversion or slicing fails, `row` is
        returned as it stands at that point (best effort, never raises).
    """
    try:
        row = str(row)    #cast to string
        return row[:characters]
    except Exception:
        # Best effort: hand the value back rather than aborting an .apply().
        return row

#keep only the trailing characters of a value (used to build merge keys)
def trunc_end(row, characters):
    """Return the last `characters` characters of `row`.

    Unlike `trunc`, the value is not converted to text first: anything that
    cannot be sliced (for example a float NaN or None) is returned unchanged.

    Parameters
    ----------
    row : str
        Text to truncate.
    characters : int
        Number of trailing characters to keep.

    Returns
    -------
    str or the original value
        The trailing slice, or `row` itself when it cannot be sliced.
    """
    try:
        if characters == 0:
            # row[-0:] is row[0:], i.e. the whole value; zero characters
            # must give an empty slice instead.
            return row[:0]
        return row[-characters:]
    except TypeError:
        # Not sliceable (missing value, number) or a non-integer count.
        return row

#format a date as 'YYYY-MM-DD' text (used to build merge keys)
def time2string(row):
    """Format a datetime-like value as 'YYYY-MM-DD'.

    Parameters
    ----------
    row : datetime-like
        Any object with a `strftime` method.

    Returns
    -------
    str or None
        The formatted date, or None when `row` cannot be formatted
        (no `strftime` method, or `strftime` rejects the value).
    """
    try:
        return row.strftime('%Y-%m-%d')
    except (AttributeError, ValueError):
        # AttributeError: not datetime-like; ValueError: value cannot be formatted.
        return None
    
#format a date as 'YYYY-MM' text (used to build merge keys)
def time2monthyearstring(row):
    """Format a datetime-like value as 'YYYY-MM' (year and month only).

    Parameters
    ----------
    row : datetime-like
        Any object with a `strftime` method.

    Returns
    -------
    str or None
        The formatted year-month, or None when `row` cannot be formatted
        (no `strftime` method, or `strftime` rejects the value).
    """
    try:
        return row.strftime('%Y-%m')
    except (AttributeError, ValueError):
        # AttributeError: not datetime-like; ValueError: value cannot be formatted.
        return None
    
#remove (YEAR) formatting
def remove_year(row):
    """Strip a trailing parenthetical such as ' (1996)' from a title.

    Everything from the first ' (' onward is removed, so any parenthetical
    suffix is dropped, not only a year.

    Parameters
    ----------
    row : str
        Movie title.

    Returns
    -------
    str or the original value
        The title without the suffix. Non-string values (for example a
        missing title) are returned unchanged instead of raising.
    """
    if not isinstance(row, str):
        return row
    return row.split(' (')[0]



### Read In Data

In [4]:
#Read the annual movie list for every year in `years` and stack the files with a
#single concat (concatenating inside a loop re-copies the growing frame each pass)
year_df = pd.concat(
    [pd.read_csv('../04_Data/{}_movies.csv'.format(year), index_col=0) for year in years]
)

#Clean Datetime: parse 'YYYY-MM-DD' text; clean_time_2 returns None for values it cannot parse
year_df["open_date"] = year_df["open_date"].apply(clean_time_2)

In [5]:
#Read the movie detail file for every year in `years` and stack the files with a
#single concat (concatenating inside a loop re-copies the growing frame each pass)
movie_df = pd.concat(
    [pd.read_csv('../04_Data/{}_movies_detail.csv'.format(year), index_col=0) for year in years]
)

#Clean date time from movie dataframe: parse text such as 'December 1, 1988' into a new
#column; clean_time returns None for values it cannot parse
movie_df["new_close"] = movie_df["close"].apply(clean_time)

In [6]:
#Read budget_1.csv ... budget_<num_budget_files>.csv and stack them with a single
#concat (concatenating inside a loop re-copies the growing frame each pass)
budget_df = pd.concat(
    [pd.read_csv('../04_Data/budget_{}.csv'.format(k), index_col=0, parse_dates=['open_date'])
     for k in range(1, num_budget_files + 1)]
)

#isolate useful data
budget_df = budget_df.drop(["rank","worldwide"],axis=1)   #drop unnecessary columns
budget_df = budget_df[budget_df['open_date'] >= datetime(years[0], 1, 1,0,0)]    #only incl. from Jan 1 of the first year
budget_df = budget_df[budget_df['open_date'] <= datetime(years[-1], 12, 31,0,0)] #only incl. up to Dec 31 of the last year

budget_df = budget_df[budget_df['domestic'] > 10000]     #only include movies that grossed more than $10K

#rename the date column to 'open'; index=str also converts the row labels to strings
budget_df = budget_df.rename(index=str, columns={'open_date':'open'})

### Inspect Data

In [429]:
#Yearly Movie Data: first five rows of the concatenated per-year movie lists
year_df.head(5)

Unnamed: 0,rank,studio,total_box,max_sites,open_box,open_sites,open_date,url,title
0,1,MGM,172825435,1590.0,7005719.0,1248.0,1988-12-16,/movies/?id=rainman.htm,Rain Man
1,2,BV,156452370,1598.0,11226239.0,1045.0,1988-06-24,/movies/?id=whoframedrogerrabbit.htm,Who Framed Roger Rabbit
2,3,Par.,128152301,2064.0,21404420.0,2064.0,1988-06-29,/movies/?id=comingtoamerica.htm,Coming to America
3,4,Fox,114968774,1419.0,8216190.0,1132.0,1988-06-03,/movies/?id=big.htm,Big
4,5,Uni.,111938388,1659.0,11174980.0,1396.0,1988-12-09,/movies/?id=twins.htm,Twins


In [430]:
#(rows, columns) of the yearly movie list
year_df.shape

(2000, 9)

In [431]:
#Movie Detail: first five rows; new_close is the parsed version of the 'close' text column
movie_df.head(5)

Unnamed: 0,url,worldwide gross,mpaa,budget,genre,runtime,close,rundays,3d,imax,series,new_close
0,/movies/?id=rainman.htm,354825435.0,R,$25 million,Drama,133.0,,,0,0,0,NaT
1,/movies/?id=whoframedrogerrabbit.htm,329803958.0,PG,$70 million,Fantasy Comedy,103.0,,,0,0,0,NaT
2,/movies/?id=comingtoamerica.htm,288752301.0,R,,Romantic Comedy,116.0,"December 1, 1988",,0,0,0,1988-12-01
3,/movies/?id=big.htm,151668774.0,PG,,Fantasy Comedy,104.0,,,0,0,0,NaT
4,/movies/?id=twins.htm,216614388.0,PG,$15 million,Comedy,105.0,,,0,0,0,NaT


In [432]:
#(rows, columns) of the movie detail table
movie_df.shape

(2000, 12)

In [433]:
#Budget: first five rows after the date-range and domestic-gross filters
budget_df.head(5)

Unnamed: 0,open,title,prod_budget,domestic
41,1997-12-19,Titanic,200000000,659363944
102,1995-07-28,Waterworld,175000000,88246220
262,1997-06-20,Batman & Robin,125000000,107325195
293,1997-02-07,Dante's Peak,115000000,67163857
308,1997-12-19,Tomorrow Never Dies,110000000,125304276


In [434]:
#(rows, columns) of the filtered budget table
budget_df.shape

(547, 4)

### Merge Movies by Year and Movie Tables Together

In [435]:
#Attach the per-movie details to the yearly list (left join on the unique URL), keep
#only the columns needed downstream, and give them shorter, more readable names
detail_df = (
    year_df.merge(movie_df, on='url', how='left')
    [["title", 'mpaa', "open_date", "total_box", 'open_box',
      'worldwide gross', 'genre', 'runtime', 'rundays', '3d', 'imax', 'series', 'max_sites']]
    .rename(index=str, columns={
        "open_date": "open",
        "total_box": 'usa_box',
        "open_box": 'usa_open',
        "worldwide gross": "intl_box",
        "max_sites": 'theaters',
    })
)

In [436]:
#take a look at the first two rows of the merged and renamed movie table
detail_df.head(2)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
0,Rain Man,R,1988-12-16,172825435,7005719.0,354825435.0,Drama,133.0,,0,0,0,1590.0
1,Who Framed Roger Rabbit,PG,1988-06-24,156452370,11226239.0,329803958.0,Fantasy Comedy,103.0,,0,0,0,1598.0


### Check for Duplicates

In [437]:
#Duplicate check on the movie table: count rows per title and show the two largest
#counts. A top count of 1 means there are no duplicates (there are not).
temp_df = detail_df.groupby(["title"])['mpaa'].count().reset_index()
temp_df.sort_values(['mpaa'], ascending=False).head(2)

Unnamed: 0,title,mpaa
0,101 Dalmatians (1996),1
1329,Seven,1


In [438]:
#Duplicate check on the budget table: count rows per (title, open) pair and show the
#two largest counts. A top count of 1 means there are no duplicates (there are not).
temp_df = budget_df.groupby(["title", "open"])['prod_budget'].count().reset_index()
temp_df.sort_values(['prod_budget'], ascending=False).head(2)

Unnamed: 0,title,open,prod_budget
0,3 Ninjas Kick Back,1994-05-06,1
367,Sling Blade,1996-11-20,1


### Merge Movie and Budget Tables

The Movie table is from BoxOfficeMojo.com and the Budget table is from The-Numbers.com, and there are inconsistencies in how the movies are named. For example, the same movie is called 'Star Wars: The Last Jedi' on BoxOfficeMojo and 'Star Wars Ep. VIII: The Last Jedi' on The-Numbers. "Tyler Perry's Meet the Browns" on BoxOfficeMojo is "Meet the Browns" on The-Numbers. Also, "Under the Same Moon" on BoxOfficeMojo is "La misma luna" on The-Numbers.

Due to the variety of differences in the data, I will try a range of strategies to create unique matches from one table to the others. 

__Merge 1: Unique ID Using Release Date + Forward Title Fragment__

In [439]:
#Merge-1 key 'id': opening year-month ('YYYY-MM') followed by the first 9 characters of
#the title. This is preferred over matching on the full title because movies are named
#differently on BoxOfficeMojo vs The Numbers
for frame in (budget_df, detail_df):
    frame["id"] = frame["open"].apply(time2monthyearstring) + frame["title"].apply(trunc, characters=9)

In [440]:
#Uniqueness check for the budget keys: count rows per id and show the two largest
#counts. A top count of 1 means the keys are unique. They are!
temp_df = budget_df.groupby(["id"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id,title
0,1988-01Return of,1
367,1996-05Spy Hard,1


In [441]:
#Uniqueness check for the movie keys: count rows per id and show the two largest
#counts. A top count of 1 means the keys are unique. They are!
temp_df = detail_df.groupby(["id"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id,title
0,1988-01Braddock:,1
1319,1994-09Quiz Show,1


In [442]:
#Merge 1: outer join on 'id' so unmatched rows from both tables are kept.
#Columns shared by both tables get the suffixes _x (movies) and _y (budget).
temp_df = detail_df.merge(budget_df, how='outer', on='id')

In [443]:
#Movies matched to a budget via 'id': rows that carry both a movie title (title_x)
#and a production budget
group_1_df = (
    temp_df.loc[temp_df["title_x"].notnull() & temp_df["prod_budget"].notnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters", "prod_budget"]]
)

In [444]:
#first two movies matched in Merge 1, now carrying prod_budget
group_1_df.head(2)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters,prod_budget
0,Rain Man,R,1988-12-16,172825435.0,7005719.0,354825435.0,Drama,133.0,,0.0,0.0,0.0,1590.0,25000000.0
1,Who Framed Roger Rabbit,PG,1988-06-24,156452370.0,11226239.0,329803958.0,Fantasy Comedy,103.0,,0.0,0.0,0.0,1598.0,70000000.0


In [445]:
group_1_df.shape  #477 movies matched on this merge (1988-1997 run recorded below)

(477, 14)

In [446]:
#Movies left without a budget after Merge 1 (rows whose prod_budget is empty)
remaining_df = (
    temp_df.loc[temp_df["prod_budget"].isnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters"]]
)

In [447]:
remaining_df.shape #1523 movies did not match on this merge (1988-1997 run recorded below)

(1523, 13)

In [448]:
#Budget lines left without a movie after Merge 1 (rows whose movie title is empty)
remain_budget_df = (
    temp_df.loc[temp_df["title_x"].isnull()]
    .rename(index=str, columns={'title_y': 'title', 'open_y': 'open'})
    [["open", "title", "prod_budget", "domestic"]]
)

In [449]:
remain_budget_df.shape  #70 budget lines remain (1988-1997 run recorded below)

(70, 4)

__Merge 2: Unique ID Using Release Date + Trailing Title Fragment__

In [450]:
#Merge-2 key 'id2': full opening date ('YYYY-MM-DD') followed by the last 9 characters
#of the title
remain_budget_df = remain_budget_df.assign(
    id2=remain_budget_df["open"].apply(time2string) + remain_budget_df["title"].apply(trunc_end, characters=9)
)
remaining_df = remaining_df.assign(
    id2=remaining_df["open"].apply(time2string) + remaining_df["title"].apply(trunc_end, characters=9)
)

In [451]:
#Uniqueness check for the remaining budget keys: count rows per id2 and show the two
#largest counts. A top count of 1 means the keys are unique. They are!
temp_df = remain_budget_df.groupby(["id2"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id2,title
0,1988-03-04Prison,1
44,1996-08-09from L.A.,1


In [452]:
#Uniqueness check for the remaining movie keys: count rows per id2 and show the two
#largest counts. Any count above 1 is a duplicate key and is inspected in the next
#cell; in the output recorded below the top count is 1.
temp_df = remaining_df.groupby(["id2"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id2,title
0,1988-01-15For Keeps,1
1002,1994-01-21ersection,1


In [453]:
# Show every unmatched movie whose id2 key is shared with another row, so duplicate
# keys can be inspected for whichever year range is configured. (The previous filter
# was pinned to 2012-07-19, a date outside the 1988-1997 range, so it always came
# back empty.) Rows with a missing id2 also count as sharing a key.
# When duplicates were found before, two were re-releases of previous movies that may
# not have had theatrical releases. It's probably ok to leave these in, since they
# would be removed later on for not having theaters.
remaining_df[remaining_df.duplicated("id2", keep=False)]

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters,id2


In [454]:
#Merge 2: outer join of the leftovers on 'id2' so unmatched rows from both sides are kept
temp_df = remaining_df.merge(remain_budget_df, how='outer', on='id2')

In [455]:
#Movies matched to a budget via 'id2': rows that carry both a movie title (title_x)
#and a production budget
group_2_df = (
    temp_df.loc[temp_df["title_x"].notnull() & temp_df["prod_budget"].notnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters", "prod_budget"]]
)

In [456]:
group_2_df.shape    #9 movies matched in this round (1988-1997 run recorded below)

(9, 14)

In [457]:
#Movies still without a budget after Merge 2 (rows whose prod_budget is empty)
remaining_2_df = (
    temp_df.loc[temp_df["prod_budget"].isnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters"]]
)

In [458]:
remaining_2_df.shape  #1514 movies remain unmatched (1988-1997 run recorded below)

(1514, 13)

In [459]:
#Budget lines still without a movie after Merge 2 (rows whose movie title is empty)
remain_budget_2_df = (
    temp_df.loc[temp_df["title_x"].isnull()]
    .rename(index=str, columns={'title_y': 'title', 'open_y': 'open'})
    [["open", "title", "prod_budget", "domestic"]]
)

In [460]:
remain_budget_2_df.shape   #61 budget lines remain (1988-1997 run recorded below)

(61, 4)

__Merge 3: Titles__

This will catch movies that have different open dates on BoxOfficeMojo vs The-Numbers

In [461]:
#Some Box Office Mojo titles carry a parenthetical suffix such as ' (1996)'.
#Strip it so the titles can line up with the budget table.
remaining_2_df = remaining_2_df.assign(title=remaining_2_df["title"].apply(remove_year))

In [462]:
#Merge-3 key 'id': opening year-month ('YYYY-MM') followed by the complete title.
#The year-month stays in the key because budget rows are only unique on title plus date.
remaining_2_df = remaining_2_df.assign(
    id=remaining_2_df["open"].apply(time2monthyearstring) + remaining_2_df["title"]
)
remain_budget_2_df = remain_budget_2_df.assign(
    id=remain_budget_2_df["open"].apply(time2monthyearstring) + remain_budget_2_df["title"]
)

In [463]:
#Merge 3: outer join on 'id' (opening year-month + full movie title)
temp_df = remaining_2_df.merge(remain_budget_2_df, how='outer', on='id')

In [464]:
#Movies matched in Merge 3: rows that carry both a movie title (title_x) and a
#production budget. The movie side is detected through title_x, as in the other
#merges, rather than mpaa: a matched movie whose mpaa is missing would otherwise be
#left out of both group_3_df and remaining_3_df and vanish from the final export.
group_3_df = temp_df[temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()]
group_3_df = group_3_df.rename(index=str, columns={'title_x':'title','open_x':'open'})
group_3_df = group_3_df[["title","mpaa","open","usa_box","usa_open","intl_box","genre","runtime",
                         "rundays","3d","imax","series","theaters","prod_budget"]]

In [465]:
group_3_df.shape  #2 movies matched on year-month + title (1988-1997 run recorded below)

(2, 14)

In [466]:
#Duplicate check on the Merge-3 matches: count rows per title and show the two
#largest counts. A top count of 1 means no title matched twice. Looks good!
title_counts = group_3_df.groupby(["title"])["mpaa"].count().reset_index()
title_counts.sort_values(["mpaa"], ascending=False).head(2)

Unnamed: 0,title,mpaa
0,Crash,1
1,Hamlet,1


In [467]:
#Movies still without a budget after Merge 3 (rows whose prod_budget is empty)
remaining_3_df = (
    temp_df.loc[temp_df["prod_budget"].isnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters"]]
)

In [468]:
remaining_3_df.shape  #1512 movies remain unmatched (1988-1997 run recorded below)

(1512, 13)

In [469]:
#Budget lines still without a movie after Merge 3. A budget-only row has no
#movie-side data, which is detected through the empty movie title (title_x) as in
#Merges 1 and 2. Testing mpaa instead would also pull in real movies whose rating is
#missing.
remain_budget_3_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_3_df = remain_budget_3_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_3_df = remain_budget_3_df[["open","title","prod_budget","domestic"]]

In [470]:
remain_budget_3_df.shape  #59 budgets remain unmatched (1988-1997 run recorded below)

(59, 4)

__Explore Remaining Titles: What Else Can I Merge On?__

In [471]:
#five highest-grossing movies that still have no budget
remaining_3_df.sort_values(['usa_box'],ascending=False).head(5)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
853,The Firm,R,1993-07-02,158348367.0,25400000.0,270248367.0,Thriller,154.0,,0.0,0.0,0.0,2393.0
179,Lethal Weapon 2,R,1989-07-07,147253986.0,20388800.0,227853986.0,Action Comedy,113.0,,0.0,0.0,1.0,1830.0
180,Look Who's Talking,PG-13,1989-10-13,140088813.0,12107784.0,296999813.0,Family Comedy,93.0,,0.0,0.0,1.0,1651.0
687,Sister Act,PG,1992-05-29,139605150.0,11894587.0,231605150.0,Comedy,100.0,,0.0,0.0,0.0,2087.0
1400,Star Wars,PG,1997-01-31,138257865.0,35906661.0,579646015.0,Sci-Fi Fantasy,125.0,,0.0,0.0,1.0,2375.0


In [472]:
#five highest-grossing budget lines that still have no movie
remain_budget_3_df.sort_values(['domestic'],ascending=False).head(5)

Unnamed: 0,open,title,prod_budget,domestic
1518,1993-06-30,The Firm,42000000.0,158340892.0
1528,1994-12-16,Dumb & Dumber,16000000.0,127175374.0
1514,1997-06-20,Batman & Robin,125000000.0,107325195.0
1515,1995-11-17,Goldeneye,60000000.0,106429941.0
1516,1997-07-16,George Of The Jungle,55000000.0,105263257.0


I also think I can match on Open Date and the first few digits of the domestic box office

__Merge 4: Date + First N Digits of the Domestic Box Office Total__

In [473]:
#Merge-4 key 'id': opening year-month ('YYYY-MM'), a dash, then the first 3 characters
#of the domestic box office total (budget table: 'domestic'; movie table: 'usa_box')
remain_budget_3_df = remain_budget_3_df.assign(
    id=remain_budget_3_df["open"].apply(time2monthyearstring) + '-' + remain_budget_3_df["domestic"].apply(trunc, characters=3)
)
remaining_3_df = remaining_3_df.assign(
    id=remaining_3_df["open"].apply(time2monthyearstring) + '-' + remaining_3_df["usa_box"].apply(trunc, characters=3)
)

In [474]:
#Uniqueness check for the Merge-4 budget keys: count rows per id and show the two
#largest counts. A top count of 1 means the keys are unique. They are!
temp_df = remain_budget_3_df.groupby(["id"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id,title
0,1988-03-354,1
44,1997-01-353,1


In [475]:
#Merge 4: outer join on 'id' (opening year-month + leading digits of the domestic gross)
temp_df = remaining_3_df.merge(remain_budget_3_df, how='outer', on='id')

In [476]:
#Movies matched in Merge 4 (year-month + leading digits of the domestic gross): rows
#that carry both a movie title (title_x) and a production budget
group_4_df = (
    temp_df.loc[temp_df["title_x"].notnull() & temp_df["prod_budget"].notnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters", "prod_budget"]]
)

In [477]:
group_4_df.shape  #15 movies matched (1988-1997 run recorded below)

(15, 14)

In [478]:
#Movies still without a budget after Merge 4 (rows whose prod_budget is empty)
remaining_4_df = (
    temp_df.loc[temp_df["prod_budget"].isnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters"]]
)

In [479]:
remaining_4_df.shape   #1497 movies remain unmatched (1988-1997 run recorded below)

(1497, 13)

In [480]:
#Budget lines still without a movie after Merge 4. A budget-only row has no
#movie-side data, which is detected through the empty movie title (title_x), the
#same column used to build group_4_df. Testing mpaa instead would also pull in real
#movies whose rating is missing.
remain_budget_4_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_4_df = remain_budget_4_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_4_df = remain_budget_4_df[["open","title","prod_budget","domestic"]]

In [481]:
remain_budget_4_df.shape   #44 budgets unmatched (1988-1997 run recorded below)

(44, 4)

__Merge 5: Title + First N Digits of the Domestic Box Office Total__

In [482]:
#Merge-5 key 'id': the full title followed by the first 2 characters of the domestic
#box office total (budget table: 'domestic'; movie table: 'usa_box'). No date is used.
remain_budget_4_df = remain_budget_4_df.assign(
    id=remain_budget_4_df["title"] + remain_budget_4_df["domestic"].apply(trunc, characters=2)
)
remaining_4_df = remaining_4_df.assign(
    id=remaining_4_df["title"] + remaining_4_df["usa_box"].apply(trunc, characters=2)
)

In [483]:
#Uniqueness check for the Merge-5 budget keys: count rows per id and show the two
#largest counts. A top count of 1 means the keys are unique. They are!
temp_df = remain_budget_4_df.groupby(["id"])['title'].count().reset_index()
temp_df.sort_values(['title'], ascending=False).head(2)

Unnamed: 0,id,title
0,Albino Alligator35,1
1,Bottle Rocket40,1


In [484]:
#Merge 5: outer join on 'id' (full title + leading digits of the domestic gross)
temp_df = remaining_4_df.merge(remain_budget_4_df, how='outer', on='id')

In [485]:
#Movies matched in Merge 5 (title + leading digits of the domestic gross): rows that
#carry both a movie title (title_x) and a production budget
group_5_df = (
    temp_df.loc[temp_df["title_x"].notnull() & temp_df["prod_budget"].notnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters", "prod_budget"]]
)

In [486]:
group_5_df.shape      #5 movies matched this time (1988-1997 run recorded below)

(5, 14)

In [487]:
#Movies still without a budget after Merge 5 (rows whose prod_budget is empty)
remaining_5_df = (
    temp_df.loc[temp_df["prod_budget"].isnull()]
    .rename(index=str, columns={'title_x': 'title', 'open_x': 'open'})
    [["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
      "rundays", "3d", "imax", "series", "theaters"]]
)

In [488]:
remaining_5_df.shape   #1492 remain unmatched (1988-1997 run recorded below)

(1492, 13)

In [489]:
#Budget lines still without a movie after Merge 5. A budget-only row has no
#movie-side data, which is detected through the empty movie title (title_x), the
#same column used to build group_5_df. Testing mpaa instead would also pull in real
#movies whose rating is missing.
remain_budget_5_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_5_df = remain_budget_5_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_5_df = remain_budget_5_df[["open","title","prod_budget","domestic"]]

In [490]:
remain_budget_5_df.shape  #39 budgets remain unmatched (1988-1997 run recorded below)

(39, 4)

__Explore Remaining Titles: Nothing Left to Merge On__

I feel pretty good stopping here. In the run shown below, the largest movie left in the remaining budget dataframe
has a domestic gross of about $19M, while the top movie among the remaining movies grossed about $147M. Budget information is therefore simply not available for the movies in between, and they will never be matched. That's not bad. Also, some movies on this list aren't really from this decade (for example Star Wars in this run, or The Lion King and Monsters, Inc. in the 2008–2017 run), so many of these unmatched movies are re-releases and shouldn't impact a prediction model for this decade's films.

In [491]:
#ten highest-grossing movies that never received a budget
remaining_5_df.sort_values(["usa_box"],ascending=False).head(10)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
179,Lethal Weapon 2,R,1989-07-07,147253986.0,20388800.0,227853986.0,Action Comedy,113.0,,0.0,0.0,1.0,1830.0
180,Look Who's Talking,PG-13,1989-10-13,140088813.0,12107784.0,296999813.0,Family Comedy,93.0,,0.0,0.0,1.0,1651.0
684,Sister Act,PG,1992-05-29,139605150.0,11894587.0,231605150.0,Comedy,100.0,,0.0,0.0,0.0,2087.0
1386,Star Wars,PG,1997-01-31,138257865.0,35906661.0,579646015.0,Sci-Fi Fantasy,125.0,,0.0,0.0,1.0,2375.0
1280,101 Dalmatians,G,1996-11-27,136189294.0,33504025.0,320689294.0,Family Comedy,103.0,,0.0,0.0,1.0,2901.0
181,"Honey, I Shrunk the Kids",PG,1989-06-23,130724172.0,14262961.0,222724172.0,Family Adventure,101.0,,0.0,0.0,0.0,1498.0
0,Coming to America,R,1988-06-29,128152301.0,21404420.0,288752301.0,Romantic Comedy,116.0,,0.0,0.0,0.0,2064.0
850,Sleepless in Seattle,PG,1993-06-25,126680884.0,17253733.0,227799884.0,Romantic Comedy,105.0,,0.0,0.0,0.0,1657.0
1281,The Birdcage,R,1996-03-08,124060553.0,18275828.0,185260553.0,Comedy,117.0,,0.0,0.0,0.0,2285.0
516,City Slickers,PG-13,1991-06-07,124033791.0,13032121.0,179033791.0,Comedy,112.0,,0.0,0.0,0.0,2171.0


In [492]:
#ten highest-grossing budget lines that never found a movie
remain_budget_5_df.sort_values(["domestic"],ascending=False).head(10)

Unnamed: 0,open,title,prod_budget,domestic
1500,1994-11-18,LÃ©on,16000000.0,19284974.0
1503,1996-10-25,Thinner,14000000.0,15171475.0
1511,1991-03-01,Haakon Haakonsen,8500000.0,15024232.0
1524,1994-09-23,Exotica,1500000.0,5046118.0
1513,1990-09-28,Dark Angel,7000000.0,4372561.0
1529,1991-01-01,Straight out of Brooklyn,450000.0,2712293.0
1506,1991-01-01,Return to the Blue Lagoon,11000000.0,2000000.0
1531,1996-09-20,When The Cat's Away,300000.0,1652472.0
1504,1997-12-31,Oscar and Lucinda,12500000.0,1612957.0
1497,1996-04-19,Le hussard sur le toit,35000000.0,1320043.0


__Concatenate & Export the Merged List__

In [493]:
#Stack the five matched groups plus the movies that never got a budget (their
#prod_budget stays empty) and write the result to '<first year>_<last year>_merged.csv'
merged_frames = [group_1_df, group_2_df, group_3_df,
                 group_4_df, group_5_df, remaining_5_df]
merged_path = '../04_Data/{}_{}_merged.csv'.format(years[0], years[-1])
pd.concat(merged_frames).to_csv(merged_path)

In [494]:
#Sanity check: the stacked result should have one row per movie in the original list
merged_parts = [group_1_df, group_2_df, group_3_df,
                group_4_df, group_5_df, remaining_5_df]
pd.concat(merged_parts).shape

(2000, 14)