In [27]:
import requests
from bs4 import BeautifulSoup as bs
import pickle
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression

In [28]:
## Save data (e.g. list_movie_data) to a file with pickle
def save_data(filename, data):
    """Pickle ``data`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

## Load pickle file data 
def load_data(filename):
    """Return the object stored in the pickle file at ``filename``.

    Unpickling can execute arbitrary code, so only point this at files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.Unpickler(source).load()
    
#function for scraping infobox
def scrape(url):
    """Scrape the infobox of a Wikipedia film article into a dict.

    The page at ``url`` is downloaded, the element with class
    ``infobox vevent`` is located, and each of its table rows is turned
    into one or more entries of the returned dict. Rows that cannot be
    parsed are skipped (best effort). Finally the opening box office and
    the rating are looked up through ``get_op_and_rating``.

    Parameters
    ----------
    url : str
        Address of the article to scrape.

    Returns
    -------
    dict
        Always contains 'Title', 'Box_office_opening' and 'Rating'.
        Depending on the rows found it may also contain 'Starring', 'Lead',
        'Production companies', 'Running_time_min', 'Release_date_dt',
        'Release_month', 'Budget', 'Box_office' and one entry per remaining
        infobox header.

    Raises
    ------
    AttributeError
        If the page has no matching infobox or its first row has no header
        cell.
    """
    page = requests.get(url)
    toScrape = bs(page.content, 'html.parser')
    movie_details = toScrape.find(class_='infobox vevent')
    rows = movie_details.find_all('tr')
    movie_data = {}

    title = rows[0].find('th').get_text()
    movie_data['Title'] = title

    # Row 0 is the title (handled above); row 1 is skipped as well
    # (presumably the poster image -- confirm against the page layout).
    for row in rows[2:]:
        header_cell = row.find('th')
        if header_cell is None:
            # Rows without a header cell carry nothing we can label.
            continue
        # Read the header once, before clean_tags() strips any markup.
        header = header_cell.get_text()

        try:
            if header == 'Based on':
                continue
            elif header == 'Starring':
                clean_tags(row)
                starring = clean(row)
                movie_data['Starring'] = starring
                # clean() returns a plain string when the cell holds a single
                # name; indexing that string would yield only its first letter.
                if isinstance(starring, list):
                    movie_data['Lead'] = starring[0]
                else:
                    movie_data['Lead'] = starring.strip()
            elif 'Production' in header:
                clean_tags(row)
                movie_data['Production companies'] = clean(row)
            elif header == 'Running time':
                clean_tags(row)
                movie_data['Running_time_min'] = clean(row)
            elif header == 'Release date':
                clean_tags(row)
                dt = dt_conversion(clean(row).strip())
                # Only record the date when it could actually be parsed.
                if dt is not None:
                    movie_data['Release_date_dt'] = dt
                    movie_data['Release_month'] = dt.month
            elif header == 'Budget':
                movie_data['Budget'] = money_convert(row)
            elif header == 'Box office':
                movie_data['Box_office'] = money_convert(row)
            else:
                clean_tags(row)
                column = header_cell.get_text(' ', strip=True)
                movie_data[column] = clean(row)
        except Exception:
            # Best effort: a malformed row must not abort the whole scrape.
            # Only ordinary errors are swallowed, so Ctrl-C still interrupts.
            continue

    op_rating = get_op_and_rating(title)
    movie_data['Box_office_opening'] = op_rating[0]
    movie_data['Rating'] = op_rating[1]

    return movie_data
    
    
    
#grab opening box office numbers and MPAA rating from Box Office Mojo given movie title
def get_op_and_rating(title):
    """Look up a film's opening box office and rating on Box Office Mojo.

    The site's search page is queried for ``title``. The first link whose
    text equals the title, or contains the title's first ten characters,
    is followed. If no link matches, the link at a fixed position in the
    search page is used instead.

    Parameters
    ----------
    title : str
        Film title to search for.

    Returns
    -------
    list
        ``[opening, rating]``. ``opening`` is a float taken from the first
        link on the film page whose text contains '$' and parses as a
        number, or None when there is none. ``rating`` is the text of the
        span following the first span that mentions 'MPAA', or 'Unknown'.
        ``[None, 'Unknown']`` is returned when no film page can be found.
    """
    # Position of the link used when nothing matches the title; presumably the
    # first search result in the page layout at the time of writing -- confirm.
    fallback_link_index = 14

    data = [None, 'Unknown']

    # Let requests build the query string so that titles containing spaces,
    # '&', '#' and similar characters are encoded instead of cutting the query short.
    search_page = requests.get('https://www.boxofficemojo.com/search/', params={'q': title})
    search_page_content = bs(search_page.content, 'html.parser')
    search_list = search_page_content.find_all('a')

    path = ''
    for li in search_list:
        link_text = li.get_text()
        # Anchors without an href cannot be followed, so keep looking.
        if (link_text == title or title[:10] in link_text) and li.get('href'):
            path = li['href']
            break

    if path == '':
        has_fallback = len(search_list) > fallback_link_index
        if has_fallback and search_list[fallback_link_index].get('href'):
            path = search_list[fallback_link_index]['href']
        else:
            # Nothing to follow: report the defaults rather than crashing.
            return data

    movie_page = requests.get('https://www.boxofficemojo.com'+path)
    movie_page_content = bs(movie_page.content, 'html.parser')

    for li in movie_page_content.find_all('a'):
        link_text = li.get_text()
        if '$' in link_text:
            try:
                data[0] = float(link_text.replace('$', '').replace(',', ''))
            except ValueError:
                # Link mentions '$' but is not a bare amount; try the next one.
                continue
            break

    for span in movie_page_content.find_all('span'):
        if 'MPAA' in span.get_text():
            rating_span = span.find_next('span')
            if rating_span is not None:
                data[1] = rating_span.get_text()
            break
    return data


#Convert date str to datetime object
def dt_conversion(date):
    """Parse a date string into a ``datetime``.

    Two layouts are tried in order: month-first ('February 28, 2020') and
    day-first ('28 February 2020').

    Parameters
    ----------
    date : str
        Text to parse.

    Returns
    -------
    datetime or None
        The parsed date, or None when ``date`` matches neither layout or
        is not a string.
    """
    patterns = ['%B %d, %Y', '%d %B %Y']
    for pat in patterns:
        try:
            return datetime.strptime(date, pat)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
    return None


#remove troublesome tags
def clean_tags(content):
    """Remove every ``sup`` and ``span`` tag found inside ``content``.

    ``content`` is modified in place; nothing is returned.
    """
    for unwanted in content.find_all(['sup', 'span']):
        unwanted.decompose()

def money_convert(row, rates=None):
    """Convert the money text in an infobox row to a plain number.

    Only the text before the first '(' is interpreted, so a parenthetical
    conversion or note does not influence the result. If that part holds
    no digit, the whole text is used. The first number found is multiplied
    by the currency rate and by the scale word that follows it.

    Handled forms include '$7 million', '$7-9 million' (lower bound is
    used), '$1.2 billion', '$500,000', 'US$143.1 million[3]',
    '£10 million ($14 million)' and '€10 million'.

    Parameters
    ----------
    row : object
        Row whose ``find('td').get_text()`` yields the money text.
    rates : dict, optional
        Maps a currency symbol to its multiplier. Defaults to
        ``{'£': 1.41, '€': 1.21}``; any other currency uses a multiplier
        of 1. The first symbol (in dict order) present in the interpreted
        text wins.

    Returns
    -------
    float
        ``number * rate * scale``.

    Raises
    ------
    ValueError
        If the text contains no number.
    """
    import re  # local import: only needed by this helper

    if rates is None:
        rates = {'£': 1.41, '€': 1.21}

    money_str = row.find('td').get_text().replace('\xa0', ' ')

    # Ignore parenthetical conversions/notes unless nothing else is left.
    head = money_str.split('(')[0]
    if not re.search(r'\d', head):
        head = money_str

    # First number wins, which is the lower bound of a range such as '$7-9 million'.
    amount = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', head)
    if amount is None:
        raise ValueError('no monetary amount found in %r' % money_str)
    number = float(amount.group().replace(',', ''))

    multiplier = 1
    for symbol, rate in rates.items():
        if symbol in head:
            multiplier = rate
            break

    # The scale word must follow the number it applies to.
    scale = 1
    unit = re.search(r'million|billion', head[amount.end():], re.IGNORECASE)
    if unit is not None:
        scale = 10**6 if unit.group().lower() == 'million' else 10**9

    return number * multiplier * scale


#function to clean data scraped from wikipedia infobox
def clean(row):
    """Extract the value of an infobox row in a form suited to its header.

    Parameters
    ----------
    row : object
        Row exposing ``find``/``find_all`` in the BeautifulSoup style, with
        a ``th`` header cell and a ``td`` value cell.

    Returns
    -------
    str, int or list of str
        * 'Release date': the first date as text, without any trailing
          parenthetical such as '(United States)'. For text that starts
          with a digit, anything after the first comma is dropped too.
        * 'Running time': the leading number as an int.
        * rows containing ``br``: the stripped strings of the value cell.
        * rows containing ``li``: the text of every list item.
        * anything else: the raw text of the value cell.

    Raises
    ------
    ValueError
        If a 'Running time' cell is empty or does not start with an integer.
    """
    header = row.find('th').get_text()

    if header == 'Release date':
        text = row.find('td').get_text()
        # Slicing instead of indexing keeps an empty cell from raising.
        if text[:1].isdigit():
            text = text.split(',')[0]
        # Dropping the parenthetical also for day-first dates lets
        # '28 February 2020 (UK)' parse downstream.
        return text.split('(')[0].replace('\xa0', ' ').strip('\n').strip(' ')
    elif header == 'Running time':
        # split() with no argument also splits on the non-breaking space that
        # often separates the number from 'minutes'.
        tokens = row.find('td').get_text().split()
        if not tokens:
            raise ValueError('empty running time')
        return int(tokens[0])
    elif row.find('br'):
        return list(row.find('td').stripped_strings)
    elif row.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row.find_all('li')]
    return row.find('td').get_text()

In [40]:
# Scrape the film(s) to predict on.
# To predict on another film, add scrape(<Wikipedia film article URL>) to the list.
test_movie_data = [
    scrape('https://en.wikipedia.org/wiki/The_Invisible_Man_(2020_film)'),
]

# Delete irrelevant columns.
# Columns are selected by position, so the result depends on the order in
# which scrape() filled its dict: keep the first 21, then drop seven of them.
df = pd.DataFrame(test_movie_data)
df = df.iloc[:, :21]
df = df.drop(columns=df.columns[[2, 3, 6, 7, 8, 9, 10]])

df = df.set_index('Title')

df.to_csv('./Data/test_movie_data.csv')

In [41]:
# Training table, the freshly scraped film, and the trained model.
all_data = pd.read_csv('./Data/all_movie_data_dropna.csv')
test_data = pd.read_csv('./Data/test_movie_data.csv')
model = load_data('./Models/RandomForestRegressionModel.pickle')

# Encode the scraped film together with the training rows so that it gets
# the same dummy columns; it ends up as the last row of the combined frame.
data = pd.concat([all_data, test_data])
model_columns = ['Release_month', 'Budget', 'Box_office_opening', 'Box_office', 'Rating']
df_model = data[model_columns]
df_dum = pd.get_dummies(df_model).drop(columns='Box_office')

# Pick the last row and hand it to the model as a single-sample 2-D array.
index = len(data.index)
movie = df_dum.iloc[index - 1]

model.predict(movie.values.reshape(1, -1))




array([1.13464615e+08])