In [45]:
import bs4
from bs4 import BeautifulSoup
import requests as rq
import re
import pandas as pd
import numpy as np
import datetime 
import os

In [46]:
# Scrape the Box Office Mojo worldwide yearly chart for 2008-2018 and stack the
# per-year tables into `dirtymovies_df` (one row per movie, plus a `bo_year` column).
years = [str(a) for a in range(2008, 2019)]

### Define the feature names (loop-invariant, so defined once) ###
columns = ['bo_year_rank', 'title', 'studio', 'worldwide-gross', 'domestic-gross', 'domestic-pct', 'overseas-gross', 'overseas-pct']

df_list = []
for year in years:
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an HTTP error page from being parsed as chart data.
    response = rq.get('https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm' % year, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    ### Look for the table ###
    tables = soup.find_all('table')

    ### Usually the fourth table object on page; skip the year if it is missing ###
    if len(tables) < 4:
        print('Possible table misalignment in table for year %s' % year)
        continue
    cells = tables[3].find_all('td')

    ## Each data field is found in a <td> element in the fourth table. Store all data in a list ##
    data = []
    for cell in cells:
        # Same precedence as before: <a>, then <font>, then <b>; cells with none are ignored.
        for tag_name in ('a', 'font', 'b'):
            node = cell.find(tag_name)
            if node is None:
                continue
            value = node.contents[0]
            ### Still a <b> tag left for <font> tags: unwrap one more level ##
            if isinstance(value, bs4.element.Tag):
                value = value.contents[0]
            data.append(value)
            break

    ### Strip special characters ###
    data = [re.sub('[^A-Za-z0-9-. ]+', '', a) for a in data]

    ### Fill NaNs ('n/a' has become 'na' after the strip above) ###
    data = [np.nan if a == 'na' else a for a in data]

    ### First 6 elements are column headers #
    to_df = data[6:]

    ### Escape clause in case the layout changes from year to year ###
    # Skip only the offending year; a `break` here would silently drop every later year.
    if len(to_df) % len(columns) != 0:
        print('Possible table misalignment in table for year %s' % year)
        continue

    ### Convert to pandas dataframe ###
    # Build the frame from row lists: np.array() on a mixed str/NaN list coerces
    # everything to strings, which would turn the NaN fills back into 'nan'.
    width = len(columns)
    rows = [to_df[start:start + width] for start in range(0, len(to_df), width)]
    df = pd.DataFrame(rows, columns=columns)
    df['bo_year'] = int(year)
    df_list.append(df)
    print('Box Office data for %s scraped' % year)

# pd.concat raises on an empty list, so fall back to an empty frame with the expected
# columns; ignore_index avoids repeating the labels 0..n-1 for every year.
if df_list:
    dirtymovies_df = pd.concat(df_list, ignore_index=True)
else:
    dirtymovies_df = pd.DataFrame(columns=columns + ['bo_year'])

Box Office data for 2008 scraped
Box Office data for 2009 scraped
Box Office data for 2010 scraped
Box Office data for 2011 scraped
Box Office data for 2012 scraped
Box Office data for 2013 scraped
Box Office data for 2014 scraped
Box Office data for 2015 scraped
Box Office data for 2016 scraped
Box Office data for 2017 scraped
Box Office data for 2018 scraped


In [37]:
# Display the concatenated yearly tables for a visual check. What is shown depends on
# execution order: a later cell reassigns `dirtymovies_df` to a 4-column subset.
dirtymovies_df

Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,domestic-pct,overseas-gross,overseas-pct,bo_year
0,1,The Dark Knight,WB,1003.0,533.3,53.2,469.7,46.8,2008
1,2,Indiana Jones and the Kingdom of the Crystal S...,Par.,786.6,317.1,40.3,469.5,59.7,2008
2,3,Kung Fu Panda,PDW,631.7,215.4,34.1,416.3,65.9,2008
3,4,Hancock,Sony,624.4,227.9,36.5,396.4,63.5,2008
4,5,Mamma Mia,Uni.,609.8,144.1,23.6,465.7,76.4,2008
5,6,Madagascar Escape 2 Africa,PDW,603.9,180.0,29.8,423.9,70.2,2008
6,7,Quantum of Solace,Sony,586.1,168.4,28.7,417.7,71.3,2008
7,8,Iron Man,Par.,585.2,318.4,54.4,266.8,45.6,2008
8,9,WALL-E,BV,533.3,223.8,42.0,309.5,58.0,2008
9,10,The Chronicles of Narnia Prince Caspian,BV,419.7,141.6,33.7,278.0,66.3,2008


In [47]:
# Keep only the identifying columns. Selecting by name (instead of positions 0, 1, 2, 8)
# makes this cell safe to re-run: once the frame has been reduced to four columns,
# position 8 no longer exists and the positional version raises IndexError.
# .copy() gives an independent frame so the column assignment below cannot trigger
# SettingWithCopyWarning.
dirtymovies_df = dirtymovies_df.loc[:, ["bo_year_rank", "title", "studio", "bo_year"]].copy()

# Ranks are scraped as text; convert them so they can be compared numerically.
dirtymovies_df["bo_year_rank"] = dirtymovies_df["bo_year_rank"].astype(int)

# Top ten movies of each year.
movies_df = dirtymovies_df.loc[dirtymovies_df["bo_year_rank"] <= 10, :]

In [48]:
# Display the subset of rows whose yearly rank is 10 or better.
movies_df


Unnamed: 0,bo_year_rank,title,studio,bo_year
0,1,The Dark Knight,WB,2008
1,2,Indiana Jones and the Kingdom of the Crystal S...,Par.,2008
2,3,Kung Fu Panda,PDW,2008
3,4,Hancock,Sony,2008
4,5,Mamma Mia,Uni.,2008
5,6,Madagascar Escape 2 Africa,PDW,2008
6,7,Quantum of Solace,Sony,2008
7,8,Iron Man,Par.,2008
8,9,WALL-E,BV,2008
9,10,The Chronicles of Narnia Prince Caspian,BV,2008


In [51]:
# Flatten the top-10 frame into a list with one dict per row, keyed by column name.
Moviedictionary = movies_df.to_dict('records')

In [52]:
# Dump the whole list of record dicts as plain text (one dict per row of movies_df).
print(Moviedictionary)

[{'bo_year_rank': 1, 'title': 'The Dark Knight', 'studio': 'WB', 'bo_year': 2008}, {'bo_year_rank': 2, 'title': 'Indiana Jones and the Kingdom of the Crystal Skull', 'studio': 'Par.', 'bo_year': 2008}, {'bo_year_rank': 3, 'title': 'Kung Fu Panda', 'studio': 'PDW', 'bo_year': 2008}, {'bo_year_rank': 4, 'title': 'Hancock', 'studio': 'Sony', 'bo_year': 2008}, {'bo_year_rank': 5, 'title': 'Mamma Mia', 'studio': 'Uni.', 'bo_year': 2008}, {'bo_year_rank': 6, 'title': 'Madagascar Escape 2 Africa', 'studio': 'PDW', 'bo_year': 2008}, {'bo_year_rank': 7, 'title': 'Quantum of Solace', 'studio': 'Sony', 'bo_year': 2008}, {'bo_year_rank': 8, 'title': 'Iron Man', 'studio': 'Par.', 'bo_year': 2008}, {'bo_year_rank': 9, 'title': 'WALL-E', 'studio': 'BV', 'bo_year': 2008}, {'bo_year_rank': 10, 'title': 'The Chronicles of Narnia Prince Caspian', 'studio': 'BV', 'bo_year': 2008}, {'bo_year_rank': 1, 'title': 'Avatar', 'studio': 'Fox', 'bo_year': 2009}, {'bo_year_rank': 2, 'title': 'Harry Potter and the H

In [49]:
def _parse_worldwide_chart(html, columns):
    """Parse one yearly chart page into a DataFrame with the given columns.

    Returns None when the page has fewer than four tables or when the number of
    extracted values is not a multiple of ``len(columns)``.
    """
    tables = BeautifulSoup(html, 'html.parser').find_all('table')

    ### Usually the fourth table object on page ###
    if len(tables) < 4:
        return None

    ## Each data field is found in a <td> element in the fourth table ##
    values = []
    for cell in tables[3].find_all('td'):
        # Precedence is <a>, then <font>, then <b>; cells with none of them are ignored.
        for tag_name in ('a', 'font', 'b'):
            node = cell.find(tag_name)
            if node is None:
                continue
            value = node.contents[0]
            ### Still a <b> tag left for <font> tags: unwrap one more level ##
            if isinstance(value, bs4.element.Tag):
                value = value.contents[0]
            values.append(value)
            break

    ### Strip special characters ###
    values = [re.sub('[^A-Za-z0-9-. ]+', '', value) for value in values]

    ### Fill NaNs ('n/a' has become 'na' after the strip above) ###
    values = [np.nan if value == 'na' else value for value in values]

    ### First 6 elements are column headers #
    body = values[6:]

    ### Escape clause in case the layout changes from year to year ###
    width = len(columns)
    if len(body) % width != 0:
        return None

    # Build from row lists rather than np.array(): a mixed str/NaN list would be
    # coerced to strings, turning the NaN fills back into the text 'nan'.
    rows = [body[start:start + width] for start in range(0, len(body), width)]
    return pd.DataFrame(rows, columns=columns)


def movie_scraper(years=None, top_n=10, timeout=30):
    """Scrape the worldwide yearly charts and return the top movies per year.

    Args:
        years: Iterable of years (int or str) to fetch. Defaults to 2008-2018.
        top_n: Keep movies whose yearly rank is <= this value. Defaults to 10.
        timeout: Seconds to wait for each HTTP request. Defaults to 30.

    Returns:
        A list of dicts, one per movie, with the keys ``bo_year_rank`` (int),
        ``title``, ``studio`` and ``bo_year`` (int). The list is empty when no
        year could be parsed.

    Raises:
        requests.exceptions.RequestException: if a request fails, times out, or
            returns an HTTP error status.
    """
    if years is None:
        years = range(2008, 2019)

    ### Define the feature names ###
    columns = ['bo_year_rank', 'title', 'studio', 'worldwide-gross', 'domestic-gross', 'domestic-pct', 'overseas-gross', 'overseas-pct']

    df_list = []
    for year in years:
        # A timeout keeps a stalled connection from hanging the caller, and
        # raise_for_status() stops an HTTP error page from being parsed as chart data.
        response = rq.get('https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm' % year, timeout=timeout)
        response.raise_for_status()

        df = _parse_worldwide_chart(response.text, columns)
        if df is None:
            # Skip only this year; a `break` would silently drop every later year.
            print('Possible table misalignment in table for year %s' % year)
            continue

        print('Box Office data for %s scraped' % year)
        df['bo_year'] = int(year)
        df_list.append(df)

    # pd.concat raises on an empty list, so return early when nothing was parsed.
    if not df_list:
        return []

    dirtymovies_df = pd.concat(df_list, ignore_index=True)

    # Select by name rather than position so the result does not depend on column order.
    movies_df = dirtymovies_df.loc[:, ['bo_year_rank', 'title', 'studio', 'bo_year']].copy()
    movies_df['bo_year_rank'] = movies_df['bo_year_rank'].astype(int)
    movies_df = movies_df.loc[movies_df['bo_year_rank'] <= top_n, :]

    return movies_df.to_dict(orient='records')
    
    



Unnamed: 0,bo_year_rank,title,studio,worldwide-gross,domestic-gross,overseas-gross,bo_year
0,1,Avengers Endgame,BV,1342700000.0,394000000.0,948700000.0,2019
1,2,Captain Marvel,BV,1111400000.0,414500000.0,696800000.0,2019
2,3,The Wandering Earth,CMC,699800000.0,5900000.0,693900000.0,2019
3,4,How to Train Your Dragon The Hidden World,Uni.,516400000.0,159700000.0,356700000.0,2019
4,5,Alita Battle Angel,Fox,404400000.0,85700000.0,318700000.0,2019
5,6,Shazam,WB NL,346200000.0,131500000.0,214700000.0,2019
6,7,Dumbo 2019,BV,328700000.0,107500000.0,221200000.0,2019
7,8,Us,Uni.,249600000.0,173000000.0,76600000.0,2019
8,9,Glass,Uni.,247000000.0,111000000.0,135900000.0,2019
9,10,The LEGO Movie 2 The Second Part,WB,190500000.0,105700000.0,84800000.0,2019


  If x >= 1,000,000,000, convert it to a string such as "1.xx billion" (we are going from float to string to make the values more readable);
  otherwise, convert it to a string such as "xxx million".
  
  Next steps for Arjun: figure out the formatting function described above, then put the finished dataframe into a MongoDB collection.
  