In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
# Show floats with two decimals (no scientific notation) in every displayed table.
pd.set_option('display.float_format', '{:.2f}'.format)

### Set Global Ranges and Variables

In [2]:
# Release years covered by this run (the range end is exclusive, so 2008-2017).
# Swap in the commented-out line to process the earlier decade instead.
years = range(2008, 2018)
#years = range(1988, 1998)

# Number of budget_<k>.csv files (k = 1 .. num_budget_files) that get read in.
num_budget_files = 6

### Set Definitions

In [3]:
#clean time from movie data
def clean_time(row):
    """Parse a date written like 'March 5, 2009' into a datetime.

    Returns None when `row` is not a string in '%B %d, %Y' format (for
    example a missing value), so the function can be applied to a column.
    """
    try:
        return datetime.strptime(row, '%B %d, %Y')
    except (TypeError, ValueError):
        # TypeError: not a string (e.g. NaN); ValueError: text in another format
        return None

def clean_time_2(row):
    """Parse an ISO-style 'YYYY-MM-DD' string into a datetime.

    Returns None when `row` is not a string in that format (for example a
    missing value), so the function can be applied to a column.
    """
    try:
        return datetime.strptime(row, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: not a string (e.g. NaN); ValueError: text in another format
        return None
    
#keep only the first N characters of a value (used as a fragment of a merge key)
def trunc(row, characters):
    """Return the first `characters` characters of `row` rendered as a string.

    `row` is cast with str() first, so numbers can be truncated the same way
    as titles. If the cast or the slice fails, whatever `row` holds at that
    point is returned instead of raising.
    """
    try:
        row = str(row)    #cast to string
        return row[:characters]
    except Exception:
        # best effort: hand the value back rather than abort a whole .apply()
        return row

#keep only the last N characters of a value (used as a fragment of a merge key)
def trunc_end(row, characters):
    """Return the last `characters` items of `row` (typically a title string).

    No string cast is done here: values that cannot be sliced, such as a
    missing title stored as NaN, are returned unchanged.
    """
    try:
        return row[-characters:]
    except (TypeError, IndexError):
        # TypeError: plain float/None; IndexError: numpy scalar
        return row

#format a datetime as a 'YYYY-MM-DD' string (the date part of a merge key)
def time2string(row):
    """Format a datetime-like value as 'YYYY-MM-DD'.

    Returns None for values that cannot be formatted (None, NaN, NaT, text).
    """
    try:
        return row.strftime('%Y-%m-%d')
    except (AttributeError, ValueError):
        # AttributeError: no strftime (None/NaN/str); ValueError: NaT or an unformattable date
        return None
    
#format a datetime as a 'YYYY-MM' string (year and month only, for looser date matching)
def time2monthyearstring(row):
    """Format a datetime-like value as 'YYYY-MM' (year and month only).

    Returns None for values that cannot be formatted (None, NaN, NaT, text).
    """
    try:
        return row.strftime('%Y-%m')
    except (AttributeError, ValueError):
        # AttributeError: no strftime (None/NaN/str); ValueError: NaT or an unformattable date
        return None
    
#remove (YEAR) formatting
def remove_year(row):
    """Drop a trailing parenthetical such as ' (2012 re-release)' from a title.

    Everything from the first ' (' onward is removed, so any parenthetical
    that follows a space is stripped, not only years. Non-string values
    (e.g. a missing title stored as NaN) are returned unchanged.
    """
    if not isinstance(row, str):
        return row
    return row.split(' (')[0]

### Read In Data

In [4]:
#read the annual list of movies: one CSV per year in `years`, stacked with a single concat
#(concatenating a list once avoids re-copying the growing frame on every loop pass)
year_df = pd.concat(
    [pd.read_csv('../04_Data/{}_movies.csv'.format(year), index_col=0) for year in years]
)

#Clean Datetime: 'YYYY-MM-DD' text -> datetime (clean_time_2 returns None for values it cannot parse)
year_df["open_date"] = year_df["open_date"].apply(clean_time_2)

In [5]:
#read the movie detail data: one CSV per year in `years`, stacked with a single concat
#(concatenating a list once avoids re-copying the growing frame on every loop pass)
movie_df = pd.concat(
    [pd.read_csv('../04_Data/{}_movies_detail.csv'.format(year), index_col=0) for year in years]
)

#Clean date time from movie dataframe: 'March 5, 2009' style text -> datetime, kept in a new column
#(clean_time returns None for values it cannot parse)
movie_df["new_close"] = movie_df["close"].apply(clean_time)

In [6]:
#Read in the budget data: budget_1.csv .. budget_<num_budget_files>.csv, stacked with a single concat
#(concatenating a list once avoids re-copying the growing frame on every loop pass)
budget_df = pd.concat(
    [pd.read_csv('../04_Data/budget_{}.csv'.format(k), index_col=0, parse_dates=['open_date'])
     for k in range(1, num_budget_files + 1)]
)

#isolate useful data
budget_df = budget_df.drop(["rank","worldwide"],axis=1)   #drop unnecessary columns
budget_df = budget_df[budget_df['open_date'] >= datetime(years[0], 1, 1,0,0)]    #only incl. releases on/after Jan 1 of the first year
budget_df = budget_df[budget_df['open_date'] <= datetime(years[-1], 12, 31,0,0)] #only incl. releases on/before Dec 31 of the last year

budget_df = budget_df[budget_df['domestic'] > 10000]     #only include movies that grossed more than $10K

#name the date column 'open'; index=str also converts the row labels to strings
budget_df = budget_df.rename(index=str, columns={'open_date':'open'})

### Inspect Data

In [7]:
#Yearly movie list: preview the first five rows
year_df.head(n=5)

Unnamed: 0,rank,studio,total_box,max_sites,open_box,open_sites,open_date,url,title
0,1,WB,533345358,4366.0,158411483.0,4366.0,2008-07-18,/movies/?id=darkknight.htm,The Dark Knight
1,2,Par.,318412101,4154.0,98618668.0,4105.0,2008-05-02,/movies/?id=ironman.htm,Iron Man
2,3,Par.,317101119,4264.0,100137835.0,4260.0,2008-05-22,/movies/?id=indianajones4.htm,Indiana Jones and the Kingdom of the Crystal S...
3,4,Sony,227946274,3965.0,62603879.0,3965.0,2008-07-02,/movies/?id=hancock.htm,Hancock
4,5,BV,223808164,3992.0,63087526.0,3992.0,2008-06-27,/movies/?id=wall-e.htm,WALL-E


In [8]:
#Movie detail table: preview the first five rows
movie_df.head(n=5)

Unnamed: 0,url,worldwide gross,mpaa,budget,genre,runtime,close,rundays,3d,imax,series,new_close
0,/movies/?id=darkknight.htm,1004558444.0,PG-13,$185 million,Action / Adventure,150.0,"March 5, 2009",231,0,1,1,2009-03-05
1,/movies/?id=ironman.htm,585174222.0,PG-13,$140 million,Action / Adventure,126.0,"October 2, 2008",,0,0,1,2008-10-02
2,/movies/?id=indianajones4.htm,786636033.0,PG-13,$185 million,Period Adventure,120.0,"October 16, 2008",Opening,0,0,1,2008-10-16
3,/movies/?id=hancock.htm,624386746.0,PG-13,$150 million,Fantasy Drama,92.0,"September 7, 2008",72,0,0,0,2008-09-07
4,/movies/?id=wall-e.htm,533281433.0,G,$180 million,Animation,97.0,"January 8, 2009",196,0,0,0,2009-01-08


In [9]:
#Budget table: preview the first five rows
budget_df.head(n=5)

Unnamed: 0,open,title,prod_budget,domestic
0,2009-12-18,Avatar,425000000,760507625
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875
2,2015-05-01,Avengers: Age of Ultron,330600000,459005868
3,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382
4,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225


### Merge Movies by Year and Movie Tables Together

In [10]:
#columns worth keeping after the join, in a tidier order
keep_cols = ["title", 'mpaa', "open_date", "total_box", 'open_box',
             'worldwide gross', 'genre', 'runtime', 'rundays', '3d', 'imax', 'series', 'max_sites']

#friendlier column names
new_names = {"open_date": "open", "total_box": 'usa_box', "open_box": 'usa_open',
             "worldwide gross": "intl_box", "max_sites": 'theaters'}

#left-join every yearly-list movie to its detail row via the shared URL, then trim and rename
detail_df = (
    year_df.merge(movie_df, on='url', how='left')[keep_cols]
    .rename(index=str, columns=new_names)
)

In [11]:
#preview the first two merged rows
detail_df.head(n=2)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
0,The Dark Knight,PG-13,2008-07-18,533345358,158411483.0,1004558444.0,Action / Adventure,150.0,231.0,0,1,1,4366.0
1,Iron Man,PG-13,2008-05-02,318412101,98618668.0,585174222.0,Action / Adventure,126.0,,0,0,1,4154.0


### Check for Duplicates

In [12]:
#duplicate check on the movie table: count (non-null mpaa) rows per title, largest first.
#The top count is 1, so no title repeats.
temp_df = detail_df.groupby("title")["mpaa"].count().reset_index()
temp_df.sort_values(by="mpaa", ascending=False).head(2)

Unnamed: 0,title,mpaa
0,(500) Days of Summer,1
1329,Space Chimps,1


In [13]:
#duplicate check on the budget table: count rows per (title, open) pair, largest first.
#The top count is 1, so no pair repeats.
temp_df = budget_df.groupby(["title", "open"])["prod_budget"].count().reset_index()
temp_df.sort_values(by="prod_budget", ascending=False).head(2)

Unnamed: 0,title,open,prod_budget
0,10 Cloverfield Lane,2016-03-11,1
1281,Solitary Man,2010-05-21,1


### Merge Movie and Budget Tables

The Movie table is from BoxOfficeMojo.com and the Budget table is from The-Numbers.com, and there are inconsistencies in how the movies are named. For example, the same movie is called 'Star Wars: The Last Jedi' on BoxOfficeMojo and 'Star Wars Ep. VIII: The Last Jedi' on The-Numbers. "Tyler Perry's Meet the Browns" on BoxOfficeMojo is "Meet the Browns" on The-Numbers. Also, "Under the Same Moon" on BoxOfficeMojo is "La misma luna" on The-Numbers.

Due to the variety of differences in the data, I will try a range of strategies to create unique matches from one table to the other.

__Merge 1: Unique ID Using Release Date + Forward Title Fragment__

In [14]:
#Build a merge key from the opening day plus the first 6 characters of the title. This is preferred
#over matching on the full title because movies are named differently on BoxOfficeMojo vs The Numbers
for frame in (budget_df, detail_df):
    frame["id"] = frame["open"].apply(time2string) + frame["title"].apply(trunc, characters=6)

In [15]:
#are the budget keys unique? Count rows per 'id', largest first. The top count is 1, so yes.
temp_df = budget_df.groupby("id")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id,title
0,2008-01-04One Mi,1
1281,2014-01-31Labor,1


In [16]:
#are the movie keys unique? Count rows per 'id', largest first. The top count is 1, so yes.
temp_df = detail_df.groupby("id")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id,title
0,2008-01-04One Mi,1
1329,2014-09-10My Old,1


In [17]:
#Outer-join movies and budgets on 'id' so that unmatched rows from both tables are kept
temp_df = detail_df.merge(budget_df, how='outer', on='id')

In [18]:
#Movies matched to a budget via 'id': rows where both the movie side (title_x) and the budget are present
matched_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
                "rundays", "3d", "imax", "series", "theaters", "prod_budget"]
both_present = temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()
group_1_df = temp_df[both_present].rename(index=str, columns={'title_x':'title','open_x':'open'})[matched_cols]

In [19]:
#preview the first two matched rows
group_1_df.head(n=2)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters,prod_budget
0,The Dark Knight,PG-13,2008-07-18,533345358.0,158411483.0,1004558444.0,Action / Adventure,150.0,231.0,0.0,1.0,1.0,4366.0,185000000.0
1,Iron Man,PG-13,2008-05-02,318412101.0,98618668.0,585174222.0,Action / Adventure,126.0,,0.0,0.0,1.0,4154.0,186000000.0


In [20]:
group_1_df.shape  #(rows, columns): 1439 movies matched on this merge

(1439, 14)

In [21]:
#movies still without a budget after merge 1 (rows whose prod_budget is empty)
movie_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
              "rundays", "3d", "imax", "series", "theaters"]
no_budget = temp_df["prod_budget"].isnull()
remaining_df = temp_df[no_budget].rename(index=str, columns={'title_x':'title','open_x':'open'})[movie_cols]

In [22]:
remaining_df.shape #(rows, columns): 561 movies did not match on this merge

(561, 13)

In [23]:
#budget rows that found no movie in merge 1 (the movie-side title is empty)
budget_cols = ["open", "title", "prod_budget", "domestic"]
no_movie = temp_df["title_x"].isnull()
remain_budget_df = temp_df[no_movie].rename(index=str, columns={'title_y':'title','open_y':'open'})[budget_cols]

In [24]:
remain_budget_df.shape  #(rows, columns): 512 budget lines remain unmatched

(512, 4)

__Merge 2: Unique ID Using Release Date + Trailing Title Fragment__

In [25]:
#create new 'id2' key: opening date plus the last 9 characters of the movie name.
#.assign returns a new frame, which avoids pandas' SettingWithCopyWarning that can fire when a
#column is added to a frame produced by a [[...]] column selection (as these two were).
remain_budget_df = remain_budget_df.assign(
    id2=remain_budget_df["open"].apply(time2string) + remain_budget_df["title"].apply(trunc_end, characters=9)
)
remaining_df = remaining_df.assign(
    id2=remaining_df["open"].apply(time2string) + remaining_df["title"].apply(trunc_end, characters=9)
)

In [26]:
#are the budget keys unique? Count rows per 'id2', largest first. The top count is 1, so yes.
temp_df = remain_budget_df.groupby("id2")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id2,title
0,2008-01-18All Hat,1
1,2008-01-18Dark Side,1


In [27]:
#are the movie keys unique? Count rows per 'id2', largest first.
#One key is shared by two rows; they are inspected more closely below.
temp_df = remaining_df.groupby("id2")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id2,title
200,2012-07-19-release),2
0,2008-01-11st Sunday,1


In [28]:
# Inspecting the duplicate key above shows the two rows are re-releases of earlier movies that may
# not have had full theatrical releases. It's probably OK to leave them in, since they would be
# removed later on for not having a theater count.
same_day = remaining_df['open'] == '2012-07-19'
remaining_df[same_day]

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters,id2
994,The Dark Knight (2012 re-release),PG-13,2012-07-19,1513086.0,,,Action / Adventure,150.0,1,0.0,0.0,1.0,,2012-07-19-release)
995,Batman Begins (2012 re-release),PG-13,2012-07-19,1508658.0,,,Action / Adventure,140.0,1,0.0,0.0,1.0,,2012-07-19-release)


In [29]:
#Outer-join the still-unmatched movies and budgets on 'id2', keeping unmatched rows from both
temp_df = remaining_df.merge(remain_budget_df, how='outer', on='id2')

In [30]:
#Movies matched to a budget via 'id2': rows where both the movie side (title_x) and the budget are present
matched_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
                "rundays", "3d", "imax", "series", "theaters", "prod_budget"]
both_present = temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()
group_2_df = temp_df[both_present].rename(index=str, columns={'title_x':'title','open_x':'open'})[matched_cols]

In [31]:
group_2_df.shape    #(rows, columns): 27 movies matched in this round

(27, 14)

In [32]:
#movies still without a budget after merge 2 (rows whose prod_budget is empty)
movie_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
              "rundays", "3d", "imax", "series", "theaters"]
no_budget = temp_df["prod_budget"].isnull()
remaining_2_df = temp_df[no_budget].rename(index=str, columns={'title_x':'title','open_x':'open'})[movie_cols]

In [33]:
remaining_2_df.shape  #(rows, columns): 534 movies remain unmatched

(534, 13)

In [34]:
#budget rows that found no movie in merge 2 (the movie-side title is empty)
budget_cols = ["open", "title", "prod_budget", "domestic"]
no_movie = temp_df["title_x"].isnull()
remain_budget_2_df = temp_df[no_movie].rename(index=str, columns={'title_y':'title','open_y':'open'})[budget_cols]

In [35]:
remain_budget_2_df.shape   #(rows, columns): 485 budget lines remain unmatched

(485, 4)

__Merge 3: Titles__

This will catch movies that have different open dates on BoxOfficeMojo vs The-Numbers

In [36]:
#Some Box Office Mojo titles carry a parenthetical such as "(2012 re-release)"; strip it before matching.
#.assign returns a new frame, which avoids pandas' SettingWithCopyWarning that can fire when a
#column is written on a frame produced by a [[...]] column selection.
remaining_2_df = remaining_2_df.assign(title=remaining_2_df["title"].apply(remove_year))

In [37]:
#Budget rows are only unique on title together with date, so the key is the release year-month
#('YYYY-MM') plus the full title. Dropping the day lets slightly different open dates still match.
#.assign returns a new frame, which avoids pandas' SettingWithCopyWarning on these sliced frames.
remaining_2_df = remaining_2_df.assign(
    id=remaining_2_df["open"].apply(time2monthyearstring) + remaining_2_df["title"]
)
remain_budget_2_df = remain_budget_2_df.assign(
    id=remain_budget_2_df["open"].apply(time2monthyearstring) + remain_budget_2_df["title"]
)

In [38]:
#Outer-join on 'id' (release year-month + full title), keeping unmatched rows from both tables
temp_df = remaining_2_df.merge(remain_budget_2_df, how='outer', on='id')

In [39]:
#Movies matched by year-month + title: both the movie side (title_x) and the budget are present.
#title_x is the presence marker here, as in the other merges. The nullable mpaa column is not safe
#for this: a matched movie with no MPAA rating would be silently dropped from the group.
group_3_df = temp_df[temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()]
group_3_df = group_3_df.rename(index=str, columns={'title_x':'title','open_x':'open'})
group_3_df = group_3_df[["title","mpaa","open","usa_box","usa_open","intl_box","genre","runtime",
                         "rundays","3d","imax","series","theaters","prod_budget"]]

In [40]:
group_3_df.shape  #(rows, columns): 25 movies matched on year-month + title

(25, 14)

In [41]:
#make sure no title was matched more than once (counts per title, largest first). Looks good!
(
    group_3_df
    .groupby("title")["mpaa"]
    .count()
    .reset_index()
    .sort_values(by="mpaa", ascending=False)
    .head(2)
)

Unnamed: 0,title,mpaa
0,Annie,1
13,Life,1


In [42]:
#movies still without a budget after merge 3 (rows whose prod_budget is empty)
movie_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
              "rundays", "3d", "imax", "series", "theaters"]
no_budget = temp_df["prod_budget"].isnull()
remaining_3_df = temp_df[no_budget].rename(index=str, columns={'title_x':'title','open_x':'open'})[movie_cols]

In [43]:
remaining_3_df.shape  #(rows, columns): 509 movies remain unmatched

(509, 13)

In [44]:
#budget rows that found no movie in merge 3 (the movie-side title is empty).
#title_x is the presence marker here, as in merges 1 and 2. The nullable mpaa column is not safe
#for this: a movie with no MPAA rating would be misfiled as an unmatched budget row.
remain_budget_3_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_3_df = remain_budget_3_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_3_df = remain_budget_3_df[["open","title","prod_budget","domestic"]]

In [45]:
remain_budget_3_df.shape  #(rows, columns): 460 budgets remain unmatched

(460, 4)

__Explore Remaining Titles: What Else Can I Merge On?__

In [46]:
#largest still-unmatched movies by domestic gross
remaining_3_df.sort_values(by='usa_box', ascending=False).head(n=5)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
272,The LEGO Movie,PG,2014-02-07,257760692.0,69050279.0,469160692.0,Animation,100.0,210,1.0,0.0,1.0,3890.0
158,MIB 3,PG-13,2012-05-25,179020854.0,54592779.0,624026776.0,Sci-Fi Comedy,106.0,108,1.0,1.0,1.0,4248.0
82,Tron Legacy,PG,2010-12-17,172062763.0,44026211.0,400062763.0,Sci-Fi Action,125.0,119,1.0,1.0,1.0,3451.0
33,Fast and Furious,PG-13,2009-04-03,155064265.0,70950500.0,363164265.0,Action,99.0,91,0.0,0.0,1.0,3674.0
1,Dr. Seuss' Horton Hears a Who!,G,2008-03-14,154529439.0,45012998.0,297138014.0,Animation,88.0,175,0.0,0.0,0.0,3961.0


In [47]:
#largest still-unmatched budget rows by domestic gross
remain_budget_3_df.sort_values(by='domestic', ascending=False).head(n=5)

Unnamed: 0,open,title,prod_budget,domestic
551,2014-02-07,The Lego Movie,60000000.0,257784718.0
534,2012-05-25,Men in Black 3,215000000.0,179020854.0
535,2010-12-17,Tron: Legacy,200000000.0,172062763.0
540,2009-04-03,Fast & Furious,85000000.0,155064265.0
541,2008-03-14,Horton Hears a Who,85000000.0,154529439.0


I also think I can match on Open Date and the first few digits of the domestic box office

__Merge 4: Date + First N Digits of the Domestic Box Office Total__

In [48]:
#Build a merge key from the opening day + '-' + the first 4 characters of the domestic gross
#('domestic' on the budget side, 'usa_box' on the movie side).
#.assign returns a new frame, which avoids pandas' SettingWithCopyWarning on these sliced frames.
remain_budget_3_df = remain_budget_3_df.assign(
    id=remain_budget_3_df["open"].apply(time2string) + '-' + remain_budget_3_df["domestic"].apply(trunc, characters=4)
)
remaining_3_df = remaining_3_df.assign(
    id=remaining_3_df["open"].apply(time2string) + '-' + remaining_3_df["usa_box"].apply(trunc, characters=4)
)

In [49]:
#are the budget keys unique? Count rows per 'id', largest first. The top count is 1, so yes.
temp_df = remain_budget_3_df.groupby("id")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id,title
0,2008-01-18-1519,1
316,2013-08-02-5967,1


In [50]:
#Outer-join on 'id' (opening day + leading gross digits), keeping unmatched rows from both tables
temp_df = remaining_3_df.merge(remain_budget_3_df, how='outer', on='id')

In [51]:
#Movies matched on opening day + leading digits of the domestic gross:
#rows where both the movie side (title_x) and the budget are present
matched_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
                "rundays", "3d", "imax", "series", "theaters", "prod_budget"]
both_present = temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()
group_4_df = temp_df[both_present].rename(index=str, columns={'title_x':'title','open_x':'open'})[matched_cols]

In [52]:
group_4_df.shape  #(rows, columns): 50 movies matched

(50, 14)

In [53]:
#movies still without a budget after merge 4 (rows whose prod_budget is empty)
movie_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
              "rundays", "3d", "imax", "series", "theaters"]
no_budget = temp_df["prod_budget"].isnull()
remaining_4_df = temp_df[no_budget].rename(index=str, columns={'title_x':'title','open_x':'open'})[movie_cols]

In [54]:
remaining_4_df.shape   #(rows, columns): 459 movies remain unmatched

(459, 13)

In [55]:
#budget rows that found no movie in merge 4 (the movie-side title is empty).
#title_x is the presence marker here, as in merges 1 and 2. The nullable mpaa column is not safe
#for this: a movie with no MPAA rating would be misfiled as an unmatched budget row.
remain_budget_4_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_4_df = remain_budget_4_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_4_df = remain_budget_4_df[["open","title","prod_budget","domestic"]]

In [56]:
remain_budget_4_df.shape   #(rows, columns): 410 budgets remain unmatched

(410, 4)

__Merge 5: Title + First N Digits of the Domestic Box Office Total__

In [57]:
#Build a merge key from the full title + the first 2 characters of the domestic gross
#('domestic' on the budget side, 'usa_box' on the movie side). No date is involved this time.
#.assign returns a new frame, which avoids pandas' SettingWithCopyWarning on these sliced frames.
remain_budget_4_df = remain_budget_4_df.assign(
    id=remain_budget_4_df["title"] + remain_budget_4_df["domestic"].apply(trunc, characters=2)
)
remaining_4_df = remaining_4_df.assign(
    id=remaining_4_df["title"] + remaining_4_df["usa_box"].apply(trunc, characters=2)
)

In [58]:
#are the budget keys unique? Count rows per 'id', largest first. The top count is 1, so yes.
temp_df = remain_budget_4_df.groupby("id")["title"].count().reset_index()
temp_df.sort_values(by="title", ascending=False).head(2)

Unnamed: 0,id,title
0,10 Days in a Madhouse14,1
269,Selma52,1


In [59]:
#Outer-join on 'id' (title + leading gross digits), keeping unmatched rows from both tables
temp_df = remaining_4_df.merge(remain_budget_4_df, how='outer', on='id')

In [60]:
#Movies matched on title + leading digits of the domestic gross:
#rows where both the movie side (title_x) and the budget are present
matched_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
                "rundays", "3d", "imax", "series", "theaters", "prod_budget"]
both_present = temp_df["prod_budget"].notnull() & temp_df["title_x"].notnull()
group_5_df = temp_df[both_present].rename(index=str, columns={'title_x':'title','open_x':'open'})[matched_cols]

In [61]:
group_5_df.shape      #(rows, columns): 10 movies matched this time

(10, 14)

In [62]:
#movies still without a budget after merge 5 (rows whose prod_budget is empty)
movie_cols = ["title", "mpaa", "open", "usa_box", "usa_open", "intl_box", "genre", "runtime",
              "rundays", "3d", "imax", "series", "theaters"]
no_budget = temp_df["prod_budget"].isnull()
remaining_5_df = temp_df[no_budget].rename(index=str, columns={'title_x':'title','open_x':'open'})[movie_cols]

In [63]:
remaining_5_df.shape   #(rows, columns): 449 movies remain unmatched

(449, 13)

In [64]:
#budget rows that found no movie in merge 5 (the movie-side title is empty).
#title_x is the presence marker here, as in merges 1 and 2. The nullable mpaa column is not safe
#for this: a movie with no MPAA rating would be misfiled as an unmatched budget row.
remain_budget_5_df = temp_df[temp_df["title_x"].isnull()]
remain_budget_5_df = remain_budget_5_df.rename(index=str, columns={'title_y':'title','open_y':'open'})
remain_budget_5_df = remain_budget_5_df[["open","title","prod_budget","domestic"]]

In [65]:
remain_budget_5_df.shape #(rows, columns): 400 movie budgets remain unmatched

(400, 4)

__Explore Remaining Titles: Nothing Left to Merge On__

I feel pretty good stopping here. The largest movie left in the remaining budget dataframe
has a domestic gross of about $30M, while the top movie left in the remaining movies dataframe grossed about $94M. That means the unmatched movies in between will never be matched to budget information, because it is simply not available. That's not bad. Also, there are movies on this list that aren't really from this decade, such as The Lion King and Monsters, Inc., so many of these unmatched movies are re-releases and shouldn't impact a prediction model for this decade's films.

In [66]:
#largest movies that never found a budget, by domestic gross
remaining_5_df.sort_values(by="usa_box", ascending=False).head(n=25)

Unnamed: 0,title,mpaa,open,usa_box,usa_open,intl_box,genre,runtime,rundays,3d,imax,series,theaters
89,The Lion King,G,2011-09-16,94242001.0,30151614.0,514083777.0,Animation,89.0,119.0,0.0,0.0,0.0,2340.0
126,Titanic 3D,PG-13,2012-04-04,57884114.0,17285453.0,945030600.0,Romance,194.0,65.0,0.0,0.0,0.0,3674.0
382,The Shack,PG-13,2017-03-03,57386418.0,16172119.0,96942115.0,Drama,132.0,77.0,0.0,0.0,0.0,2888.0
26,The Haunting in Connecticut,PG-13,2009-03-27,55389516.0,23004765.0,77527732.0,Horror,92.0,49.0,0.0,0.0,0.0,2732.0
27,Ghosts of Girlfriends Past,PG-13,2009-05-01,55250026.0,15411434.0,102223269.0,Fantasy Comedy,100.0,105.0,0.0,0.0,0.0,3175.0
63,Hubble 3D,G,2010-03-19,52331382.0,413477.0,73515965.0,IMAX,43.0,3118.0,1.0,0.0,0.0,151.0
177,Tyler Perry's Temptation: Confessions of a Mar...,PG-13,2013-03-29,51975354.0,21641679.0,53125354.0,Drama,111.0,63.0,0.0,0.0,0.0,2047.0
127,Beauty and the Beast,G,2012-01-13,47617067.0,17751905.0,233377676.0,Animation,84.0,112.0,0.0,0.0,0.0,2625.0
0,College Road Trip,G,2008-03-07,45610425.0,13601419.0,51549674.0,Comedy,83.0,147.0,0.0,0.0,0.0,2706.0
178,Jurassic Park 3D,PG-13,2013-04-05,45385935.0,18620145.0,473553882.0,Sci-Fi Horror,127.0,49.0,0.0,0.0,0.0,2778.0


In [67]:
#largest budget rows that never found a movie, by domestic gross
remain_budget_5_df.sort_values(by="domestic", ascending=False).head(n=5)

Unnamed: 0,open,title,prod_budget,domestic
483,2014-08-01,Get on Up,30000000.0,30569935.0
536,2012-01-13,The Iron Lady,14000000.0,29959436.0
496,2012-02-17,Kari gurashi no Arietti,23000000.0,19192510.0
565,2012-05-25,Les Intouchables,10800000.0,13182281.0
541,2009-05-29,MÃ¤n som hatar kvinnor,13000000.0,12749992.0


__Concatenate & Export the Merged List__

In [68]:
#stack the five matched groups plus the movies that never found a budget, then write one CSV
#named after the first and last year of the range
merged_parts = [group_1_df, group_2_df, group_3_df, group_4_df, group_5_df, remaining_5_df]
out_path = '../04_Data/{}_{}_merged.csv'.format(years[0], years[-1])
pd.concat(merged_parts).to_csv(out_path)

In [69]:
#size of the stacked result: the matched groups plus the still-unmatched movies
merged_parts = [group_1_df, group_2_df, group_3_df, group_4_df, group_5_df, remaining_5_df]
pd.concat(merged_parts).shape

(2000, 14)