In [1]:
#importing libraries
import io
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [2]:
# Create connection to AWS table
# The connection URL embeds the database password, so it is read from the
# MOVIES_DB_URL environment variable instead of being stored in the notebook.
# Expected form: postgresql+psycopg2://<user>:<password>@<host>:5432/<database>
db_url = os.environ.get('MOVIES_DB_URL')
if not db_url:
    raise RuntimeError(
        "Set the MOVIES_DB_URL environment variable to the SQLAlchemy "
        "connection URL of the movies database before running this notebook."
    )
engine = create_engine(db_url)

In [3]:
# Load the 'List_for_MPAA' table through the engine into a DataFrame
raw_df = pd.read_sql_table(table_name='List_for_MPAA', con=engine)

In [4]:
# Pull the imdb_title_id column out of the raw frame as a plain Python list
title_ids_list = raw_df.loc[:, 'imdb_title_id'].tolist()

In [9]:
# Function to scrape MPAA ratings off of IMDB website
from IPython.core.display import display, HTML
import re
from datetime import datetime

def get_movie_data(link, timeout=10):
    """
    Build a dictionary holding the MPAA rating scraped from a movie's IMDb page.

    Args:
        link: A string that is the end of an IMDb url for a specific movie
            (the title id); the page fetched is
            https://www.imdb.com/title/<link>/.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a single stalled connection
            would block the whole scrape indefinitely.
    Returns:
        A dictionary with two keys:
            "imdb_title_id": the link that was passed in.
            "mpaa": the scraped rating when it is one of the recognised
                ratings, 'Crap' when the scraped text is not a recognised
                rating, or None when the response was not successful or the
                page has no second metadata element to read.
    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...). These are not swallowed so a
            network outage is not silently recorded as "no rating".
    """
    base_url = "https://www.imdb.com"
    url = base_url + '/title/' + link + '/'

    response = requests.get(url, timeout=timeout)

    # Ratings accepted as-is; any other scraped text is flagged below.
    mpaa_options = {
        "G",
        "PG",
        "PG-13",
        "R",
        "NC-17",
        "TV-MA",
        "Unrated",
        "Not Rated",
    }

    mpaa = None
    # An error page carries no rating, so only successful responses are parsed.
    if response.ok:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's warning).
        soup = BeautifulSoup(response.text, "html.parser")
        try:
            # NOTE(review): this generated-looking class name is tied to IMDb's
            # markup at the time of writing - confirm it still matches.
            meta_items = soup.find_all(class_="TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex")
            # The rating is read from the second matching element; keep only
            # the first line of its text.
            mpaa = meta_items[1].text.split("\n")[0].strip()
        except (IndexError, AttributeError):
            # Fewer than two matching elements (or an element without text):
            # record the rating as missing instead of failing the whole run.
            mpaa = None
        else:
            if mpaa not in mpaa_options:
                # Placeholder marking a value that is not a recognised rating.
                mpaa = 'Crap'

    return {"imdb_title_id": link, "mpaa": mpaa}

In [10]:
# Scrape every title id in turn, collecting one result dictionary per movie.
# `count` tracks how many titles have been processed so far.
mpaa_list = []
count = 0
for title_id in title_ids_list:
    movie_record = get_movie_data(title_id)
    mpaa_list.append(movie_record)
    count = len(mpaa_list)

In [11]:
# Turn the list of per-movie dictionaries into a DataFrame (one row per movie,
# one column per dictionary key)
mpaa_df = pd.DataFrame(mpaa_list)

In [12]:
# Show the first five rows as a quick sanity check
mpaa_df.head(n=5)

Unnamed: 0,imdb_title_id,mpaa
0,tt0035423,PG-13
1,tt0113026,PG
2,tt0466909,R
3,tt0283503,R
4,tt2510268,Not Rated


In [13]:
#Write to new table in AWS/Postgres
# head(0) carries only the column layout, so this drops any old 'mpaa_rating'
# table and creates a new empty one with the same columns.
mpaa_df.head(0).to_sql('mpaa_rating', engine, if_exists='replace', index=False)

# Serialise the rows as tab-separated text in an in-memory buffer for bulk loading.
output = io.StringIO()
mpaa_df.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)  # rewind so copy_from reads from the start of the buffer

conn = engine.raw_connection()
try:
    cur = conn.cursor()
    try:
        # null="": empty fields (how to_csv writes missing ratings) are loaded as NULL
        cur.copy_from(output, 'mpaa_rating', null="")
        conn.commit()
    except Exception:
        # Leave no half-finished transaction behind, then surface the error.
        conn.rollback()
        raise
    finally:
        cur.close()
finally:
    # Always release the connection, even when the load fails.
    conn.close()