In [1]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

In [2]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    """Return a cleaned copy of one scraped Wikipedia movie record.

    Two clean-ups are applied:

    1. Every alternate-title / transliteration key listed in ``language_columns``
       is removed from the record and collected into a single ``'alt_titles'``
       dictionary (added only when at least one such key was present).
    2. Every synonym key listed in ``merge_columns`` is renamed to its canonical
       name. If the canonical key already exists, it is overwritten.

    Parameters
    ----------
    movie : mapping
        One movie record (key -> value). It is copied, never mutated.

    Returns
    -------
    dict
        The cleaned record.
    """
    movie = dict(movie)  # local copy so the caller's record is left untouched

    # Keys that hold an alternate title in some language or romanization
    language_columns = [
        'Also known as',
        'Arabic',
        'Bopomofo',
        'Cantonese',
        'Chinese',
        'French',
        'Gwoyeu Romatzyh',
        'Hangul',
        'Hanyu Pinyin',
        'Hebrew',
        'Hepburn',
        'Hokkien POJ',
        'IPA',
        'Japanese',
        'Jyutping',
        'Literally',
        'Mandarin',
        'McCune–Reischauer',
        'Original title',
        'Polish',
        'Revised Romanization',
        'Romanized',
        'Russian',
        'Simplified',
        'Simplified Chinese',
        'Traditional',
        'Traditional Chinese',
        'Wade–Giles',
        'Yale Romanization',
        'Yiddish'
    ]

    # Move every alternate-title key that is present into one nested dictionary
    alt_titles = {key: movie.pop(key) for key in language_columns if key in movie}
    if alt_titles:
        movie['alt_titles'] = alt_titles

    # (old name, canonical name) pairs, applied in order
    merge_columns = [
        ('Adaptation by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Music by', 'Composer(s)'),
        ('Original language(s)', 'Language'),
        ('Original release', 'Release date'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        # NOTE(review): the trailing space in the next two keys is presumably how
        # they appear in the scraped data -- confirm before "tidying" them.
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Release Date', 'Release date'),
        # Must target the lower-case-d canonical name: the 'Release Date' ->
        # 'Release date' merge has already run by the time this pair is reached,
        # so renaming to 'Release Date' would leave an orphan column behind.
        ('Released', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)')
    ]

    for old_name, new_name in merge_columns:
        if old_name in movie:
            # Rename the key, overwriting the canonical key if it already exists
            movie[new_name] = movie.pop(old_name)

    return movie

In [3]:
# 2. Define the function that loads the three data sets -- the Wikipedia data,
# the Kaggle metadata, and the MovieLens rating data (from Kaggle) -- and cleans the Wikipedia data.

def process_movie_data(wiki_path=None, kaggle_path=None, ratings_path=None):
    """Load the three raw data sets and clean the Wikipedia movie records.

    Parameters
    ----------
    wiki_path, kaggle_path, ratings_path : str or path-like, optional
        Locations of the Wikipedia JSON file, the Kaggle metadata CSV and the
        MovieLens ratings CSV. Any argument left as ``None`` falls back to the
        module-level ``wiki_file`` / ``kaggle_file`` / ``ratings_file`` variable,
        so a bare ``process_movie_data()`` call keeps working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_wiki, df_kaggle, df_ratings)``. Only ``df_wiki`` is transformed;
        the two CSV frames are returned as read.

    Raises
    ------
    KeyError
        If the 'Box office', 'Budget', 'Release date' or 'Running time' column
        is missing from the Wikipedia frame after the sparse-column filter.
    """
    # Fall back to the notebook-level path variables when no path is passed in.
    if wiki_path is None:
        wiki_path = wiki_file
    if kaggle_path is None:
        kaggle_path = kaggle_file
    if ratings_path is None:
        ratings_path = ratings_file

    def join_if_list(value):
        # Some scraped fields are lists of fragments; join them into one string.
        return ' '.join(value) if isinstance(value, list) else value

    # Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    df_kaggle = pd.read_csv(kaggle_path, low_memory=False)
    df_ratings = pd.read_csv(ratings_path)

    # Open and read the Wikipedia data JSON file.
    # The encoding is explicit so the result does not depend on the platform default.
    with open(wiki_path, mode='r', encoding='utf-8') as file:
        wiki_movies_raw = json.load(file)

    # 3. Filter out TV shows: keep records that have a director and an IMDb link
    # and that do not list a number of episodes.
    wiki_movies = [
        movie for movie in wiki_movies_raw
        if ('Directed by' in movie or 'Director' in movie)
        and 'imdb_link' in movie
        and 'No. of episodes' not in movie
    ]

    # 4. Call the clean_movie function on each remaining movie.
    clean_movies = [clean_movie(movie) for movie in wiki_movies]

    # 5. Read in the cleaned movies list from Step 4 as a DataFrame.
    df_wiki = pd.DataFrame(clean_movies)

    # 6. Extract the IMDb ID with a regular expression and drop imdb_id duplicates.
    # If there is an error, capture and print the exception.
    # Exception (not BaseException) so that KeyboardInterrupt / SystemExit still propagate.
    try:
        df_wiki['imdb_id'] = df_wiki['imdb_link'].str.extract(r'(tt\d{7})', expand=False)
        df_wiki = df_wiki.drop_duplicates(subset='imdb_id')
    except Exception as err:
        print(f"Unexpected {err}, {type(err)}")  # From https://docs.python.org/3/tutorial/errors.html

    # 7. Keep only the columns in which fewer than 90% of the values are null.
    # .copy() makes the result an independent frame, so the column assignments
    # below cannot trigger SettingWithCopyWarning.
    dense_columns = [col for col in df_wiki.columns
                     if df_wiki[col].isnull().sum() < 0.9 * len(df_wiki)]
    df_wiki = df_wiki[dense_columns].copy()

    # 8. Hold the non-null values from the "Box office" column.
    box_office = df_wiki['Box office'].dropna()

    # 9. Convert list-valued box office entries to space-separated strings.
    box_office = box_office.apply(join_if_list)

    # Replace box office dollar ranges with just the upper value
    box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

    # 10. "form_one" of the box office data: $###.# million / $###.# billion
    dollars_pattern_mil = r'\$\s*\d+\.?\d*\s*mill?i?on'
    dollars_pattern_bil = r'\$\s*\d+\.?\d*\s*bill?i?on'

    # 11. "form_two" of the box office data: $###,###,### (comma or period as thousands separator)
    dollars_pattern_num = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

    dollars_pattern_combined = f'({dollars_pattern_mil}|{dollars_pattern_bil}|{dollars_pattern_num})'

    # 12. Add the parse_dollars function.
    def parse_dollars(s):
        """Convert one extracted dollar string to a float; NaN if it is not recognised."""
        # if s is not a string, return NaN
        if not isinstance(s, str):
            return np.nan

        # $###.# million -> strip the dollar sign, whitespace and letters, then scale
        if re.match(dollars_pattern_mil, s, flags=re.IGNORECASE):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**6

        # $###.# billion -> same stripping, scaled by a billion
        if re.match(dollars_pattern_bil, s, flags=re.IGNORECASE):
            return float(re.sub(r'\$|\s|[a-zA-Z]', '', s)) * 10**9

        # $###,###,### -> strip the dollar sign, whitespace and BOTH separators the
        # pattern accepts. Leaving periods in would make "$1.234.567" raise
        # ValueError and "$1.234" parse as 1.234 instead of 1234.
        if re.match(dollars_pattern_num, s, flags=re.IGNORECASE):
            return float(re.sub(r'[$,.\s]', '', s))

        # otherwise, return NaN
        return np.nan

    # 13. Clean the box office column in the df_wiki DataFrame.
    df_wiki['box_office'] = (
        box_office
        .str.extract(dollars_pattern_combined, flags=re.IGNORECASE)[0]
        .apply(parse_dollars)
    )

    df_wiki = df_wiki.drop('Box office', axis=1)

    # 14. Clean the budget column in the df_wiki DataFrame.
    # The raw 'Budget' column is left in place alongside the parsed 'budget' column.
    df_wiki['budget'] = (
        df_wiki['Budget']
        .dropna()
        .apply(join_if_list)                                    # Join list elements into space-separated strings
        .str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)    # Replace dollar ranges with just the upper value
        .str.replace(r'\s*\[\d+\]', '', regex=True)             # Remove citations (numbers within square brackets)
        .str.extract(dollars_pattern_combined, flags=re.IGNORECASE)[0]
        .apply(parse_dollars)
    )

    # 15. Clean the release date column in the df_wiki DataFrame.
    # The raw 'Release date' column is left in place alongside the parsed 'release_date' column.
    date_pattern_ymd = r'\d{4}[/-][01]?\d[/-][0123]?\d'  # e.g. 1990-07-11 (a stray "]" used to make this unmatchable)
    date_pattern_mdy = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}'
    date_pattern_dmy = r'\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
    date_pattern_my = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
    date_pattern_yo = r'^\d{4}$|(?<=\( )\d{4}(?= \))'

    date_pattern_combined = f'({date_pattern_ymd}|{date_pattern_mdy}|{date_pattern_dmy}|{date_pattern_my}|{date_pattern_yo})'

    release_text = (
        df_wiki['Release date']
        .dropna()
        .apply(join_if_list)                                    # Join list elements into space-separated strings
        .str.extract(date_pattern_combined, flags=re.IGNORECASE)[0]
    )
    # The extracted strings come in several formats, so each one is parsed on its
    # own: pandas >= 2.0 applies the format inferred from the first value to the
    # whole Series, and the old infer_datetime_format flag is deprecated there.
    # The outer to_datetime guarantees a datetime64 result.
    df_wiki['release_date'] = pd.to_datetime(release_text.apply(pd.to_datetime))

    # 16. Clean the running time column in the df_wiki DataFrame.
    run_time_pattern_hrs = r'(\d+)\s+h(?:(?:ou)?rs?)?(?:\s(\d+)\s+m(?:in(?:utes)?)?)?'
    run_time_pattern_min = r'(\d+)\s+m(?:in(?:utes)?)?'

    run_time_pattern_combined = f'{run_time_pattern_hrs}|{run_time_pattern_min}'

    run_time_parts = (
        df_wiki['Running time']
        .dropna()
        .apply(join_if_list)                                    # Join list elements into space-separated strings
        .str.extract(run_time_pattern_combined)                 # Extract times: column 0 is hours; columns 1 & 2 are minutes
        .apply(lambda col: pd.to_numeric(col, errors='coerce')) # Unmatched groups become NaN...
        .fillna(0)                                              # ...and then 0, so they add nothing to the total
        .astype(int)
    )
    # Convert hours/minutes/minutes to total minutes
    df_wiki['running_time'] = 60 * run_time_parts[0] + run_time_parts[1] + run_time_parts[2]

    df_wiki = df_wiki.drop('Running time', axis=1)

    # Return three variables. The first is the df_wiki DataFrame
    return df_wiki, df_kaggle, df_ratings

In [4]:
# 17. Directory that holds the raw data, and the path to each of the three files in it.
filepath = './Resources'
# In order: the Wikipedia data, the Kaggle metadata, the MovieLens rating data.
wiki_file, kaggle_file, ratings_file = (
    f'{filepath}/{file_name}'
    for file_name in ('wikipedia-movies.json', 'movies_metadata.csv', 'ratings.csv')
)

In [5]:
# 19. Run the pipeline. The three results are bound to new names, so the wiki_file,
# kaggle_file, and ratings_file path variables are not overwritten by the DataFrames.
df_wiki_movies, df_kaggle_metadata, df_movielens_ratings = process_movie_data()

In [6]:
# 20. Preview the first rows of the cleaned df_wiki_movies DataFrame.
df_wiki_movies.head()

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Country,Language,...,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",United States,English,...,Michael Tronick,"[Cliff Eidelman, Yello]","[Steve Perry, Joel Silver]",Silver Pictures,"[David Arnott, James Cappe]",tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",United States,English,...,Howard E. Smith,Maurice Jarre,"[Ric Kidney, Robert Redlin]",Avenue Pictures,"[James Foley, Robert Redlin]",tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",United States,"[English, Lao]",...,"[John Bloom, Lois Freeman-Fox]",Charles Gross,Daniel Melnick,"[Carolco Pictures, IndieProd Company]","[John Eskow, Richard Rush]",tt0099005,57718089.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",United States,English,...,Susan E. Morse,,Robert Greenhut,,Woody Allen,tt0099012,7331647.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,"December 19, 1990",US,English,...,David Stiven,Maurice Jarre,John Cornell,,Paul Hogan,tt0099018,6939946.0,25000000.0,1990-12-19,95.0


In [7]:
# 21. List the df_wiki_movies column names to check that they are correct.
df_wiki_movies.columns.to_list()

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Release date',
 'Country',
 'Language',
 'Budget',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'Writer(s)',
 'imdb_id',
 'box_office',
 'budget',
 'release_date',
 'running_time']