In [1]:
# TO DO:
# create printouts/logs of record processing.  muck with logger framework
    # Make sure to do error logging.
# Change column replacement framework to a lookup in a saved CSV.  Current framework is kludgy.
    # Convert to lambda function
# Declare form_one, form_two in namespace

# 813 null release dates: is this accurate?

In [2]:
import json
import pandas as pd
import numpy as np

import re

from pprint import pprint

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

### Function Definintions

In [3]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    """
    Takes a single wikipedia record, extracts all known values for alternate titles,
    and moves them to a list.  Additionally maps redundant/duplicative column names.
    """
    movie = dict(movie) # creates a non-destructive copy.  DON'T UNDERSTAND THIS SYNTAX
    
    # Clean alternate titles
    alt_titles = dict()
    languages = ['Arabic',
                 'Cantonese',
                 'Chinese',
                 'French',
                 'Hangul',
                 'Hebrew',
                 'Hepburn',
                 'Japanese',
                 'Literally',
                 'Mandarin',
                 'McCune–Reischauer',
                 'Polish',
                 'Revised Romanization',
                 'Romanized',
                 'Russian',
                 'Simplified',
                 'Traditional',
                 'Yiddish']

    for language in languages:
        if language in movie:
            alt_titles[language] = movie[language]
            movie.pop(language)

    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles
    
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director(s)')
    change_column_name('Director', 'Director(s)')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Written by', 'Writer(s)')
    change_column_name('Original release', 'Release date')
    
    return movie

def parse_dollars(s):
    """
    Given string s, parse currency strings to float.
    """
    if type(s) != str:
        return np.nan
    
    # form one: r"\$\s*\d{1,3}\.?\d*\s*[mb]illi?on"
    # form two: r"\$\s*\d+[,\.]\d{3}"
    
    # form: "$###.# billion: 
    # remove dollar signs, whitespace, and text.  
    # Multiply by 1billion
    if re.match(r"\$\s*\d{1,3}\.?\d*\s*billi?on", s, flags=re.IGNORECASE):
        s = re.sub('\$|\s|[a-zA-Z]', '', s)
        value = float(s) * 10**9
        return value
     
    # form: "$###.# million: 
    # remove dollar signs, whitespace, and text.  
    # Multiply by 1million   
    if re.match(r"\$\s*\d{1,3}\.?\d*\s*milli?on", s, flags=re.IGNORECASE):
        s = re.sub('\$|\s|[a-zA-Z]', '', s)
        value = float(s) * 10**6
        return value    
    
    # form: $###,###,###
    # strip dollar signs and thousands separators
    if re.match(r"\$\s*\d+[,\.]\d{3}", s, flags=re.IGNORECASE):
        s = re.sub('\$|,|\.','',s)
        value = float(s)
        return value
        
    else:
        return np.nan

# 2 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)
def import_source_files(wiki_file: str,
                        kaggle_file: str,
                        ratings_file: str):
    """
    Function takes three arguments, each corresponding to the name
    of a specific source csv or json file for the three types of data 
    objects we are importing.  Returns all three objects as unique
    pandas DataFrames.
    """
    # 2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    kaggle_metadata = pd.read_csv(kaggle_file, low_memory=False)
    ratings = pd.read_csv(ratings_file)

    # 3. Open the read the Wikipedia data JSON file.
    with open(wiki_file, mode='r') as file:
        wiki_movies_json = json.load(file)
    
    # Remove TV shows
    wiki_movies_json = [wiki_movies_json[i]\
                        for i in range(len(wiki_movies_json))\
                        if 'No. of episodes' not in wiki_movies_json[i]]
    
    # Iterate through clean movie function to tidy columns
    wiki_movies_json = [clean_movie(wiki_movies_json[i]) for i in range(len(wiki_movies_json))]
    
    # Create dataframe
    wiki_movies_df = pd.DataFrame(wiki_movies_json)
    
    # Extract all IMDB IDs from valid URls and remove records that do not contain them
    try:
        wiki_movies_df.dropna(subset=['imdb_link'], inplace=True)
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
        wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
    except:
        print('IMDB extraction failed.')
    
    columns_to_drop = [column\
                         for column in wiki_movies_df.columns\
                         if wiki_movies_df[column].count() == 0]   
        
    wiki_movies_df.drop(columns=columns_to_drop, inplace=True)
    
    #Regex strings for currency patterns                                   
    form_one = r"\$\s*\d{1,3}\.?\d*\s*[mb]illi?on"                                       
    form_two = r"\$\s*\d+[,\.]\d{3}"
                                       
    box_office = wiki_movies_df['Box office'].dropna()
    box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)                                       
                                       
    wiki_movies_df['box_office'] = box_office.str.\
                                    extract(f"({form_one}|{form_two})",\
                                            flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Box office', axis=1, inplace=True)
    
    # BUDGET
    budget = wiki_movies_df['Budget'].dropna()
    budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)
    
    # Omit wikipedia citation markers using square brackets
    budget = budget.str.replace(r'\[\d+\]\s*', '')

    # Remove any hyphens and defer to smaller end of range
    budget = budget.str.replace(r'\$.*[-—–](?![a-z])' , '$', regex=True)

    contains_form_one = budget.str.contains(pat=form_one, flags=re.IGNORECASE, na=False)
    contains_form_two = budget.str.contains(pat=form_two, flags=re.IGNORECASE, na=False)
    
    wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df.drop('Budget', axis=1, inplace=True)
    
    # fix release date col
    release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

    # match string one: Month Name, 1-2 digits, 4 digit year
    date_pat_1 = r"\w*\s\d{1,2},\s\d{4}"
    matches_pat_1 = release_date.str.contains(date_pat_1, flags=re.IGNORECASE, na=False)

    # pattern 2: yyyy-dd-mm
    date_pat_2 = r"\d{4}[-—–]\d{2}[-—–]\d{2}"
    matches_pat_2 = release_date.str.contains(date_pat_2, flags=re.IGNORECASE, na=False)

    # pattern 3: (optional day), month name, year
    date_pat_3 = r"\d{0,2}\s*\w{3,10}\s\d{4}"
    matches_pat_3 = release_date.str.contains(date_pat_3, flags=re.IGNORECASE, na=False)

    # pattern 4: four digit year only
    date_pat_4 = r"\d{4}"
    matches_pat_4 = release_date.str.contains(date_pat_4, flags=re.IGNORECASE, na=False)

    wiki_movies_df['release_date'] = pd.to_datetime(
        release_date.str.extract(f'({date_pat_1}|{date_pat_2}|{date_pat_3}|{date_pat_4})')[0],
        infer_datetime_format=True,
        errors='coerce')
    
    wiki_movies_df.drop('Release date', axis=1, inplace=True)    
    
    # Convert runtime from string to numeric
    # two string forms transformed: "# h(ours) ## m(inutes)", and '### minutes"
    running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
    running_time_extract = running_time.str.extract(r"(\d+)\s*ho?u?r?s?\s*(\d*)|(\d{1,3})\s*m")
    running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
    wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)
    wiki_movies_df.drop('Running time', axis=1, inplace=True)

    # 5. Return the three DataFrames
    return wiki_movies_df, kaggle_metadata, ratings

### Map to dataframes

In [4]:
# 17. Create the path to your file directory and variables for the three files.
file_dir = './data'
# Wikipedia data
wiki_file = f'{file_dir}/wikipedia-movies.json'
# Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [5]:
# 18. Set the three variables equal to the function created in D1.
wiki_file, kaggle_file, ratings_file = import_source_files(wiki_file=wiki_file, 
                                                           kaggle_file=kaggle_file, 
                                                           ratings_file=ratings_file)



In [6]:
# 19. Set the wiki_movies_df equal to the wiki_file variable. 
wiki_movies_df = wiki_file
wiki_movies_df["box_office"].isnull().count()

7049

In [7]:
# 20. Check that the wiki_movies_df DataFrame looks like this. 
wiki_movies_df.count()

url              7049
year             7049
imdb_link        7049
title            7044
Screenplay by    2307
                 ... 
imdb_id          7049
box_office       5459
budget           4705
release_date     6236
running_time     6900
Length: 90, dtype: int64

In [8]:
# 21. Check that wiki_movies_df DataFrame columns are correct. 
wiki_movies_df.columns.to_list()

['url',
 'year',
 'imdb_link',
 'title',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Productioncompany ',
 'Country',
 'Language',
 'Director(s)',
 'Distributor',
 'Editor(s)',
 'Producer(s)',
 'Writer(s)',
 'Genre',
 'Theme music composer',
 'Original language(s)',
 'Production company(s)',
 'Original network',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'alt_titles',
 'Released',
 'Recorded',
 'Venue',
 'Label',
 'Area',
 'Coordinates',
 'Status',
 'Opening date',
 'Closing date',
 'Replaced',
 'Replaced by',
 'Name',
 'Attraction type',
 'Music',
 'Duration',
 'Also known as',
 'Animation by',
 'Color p

In [9]:
# print([column\
#        for column in wiki_movies_df.columns\
#        if wiki_movies_df[column].count()/len(wiki_movies_df) < .01])

In [10]:
## exploratory, check out release date
release_date_column = [column\
                       for column in wiki_movies_df.columns\
                       if 'date' in column.lower()\
                           or 'release' in column.lower()\
                           or 'premiere' in column.lower()]

wiki_movies_df[release_date_column].count()

# wiki_movies_df[['Date premiered', 'Date ']] 

Publication date       4
Released               4
Opening date           1
Closing date           1
Date premiered         1
Place premiered        1
release_date        6236
dtype: int64

In [11]:
# # retain me for further release date analysis at end of project if time

# release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)
# # release_date.head(50)

# # # match string one: Month Name, 1-2 digits, 4 digit year
# date_pat_1 = r"\w*\s\d{1,2},\s\d{4}"
# matches_pat_1 = release_date.str.contains(date_pat_1, flags=re.IGNORECASE, na=False)
# # matches_pat_1.head(50)
# # release_date[~matches_pat_1].sample(50)

# # # pattern 2: yyyy-dd-mm
# date_pat_2 = r"\d{4}[-—–]\d{2}[-—–]\d{2}"
# matches_pat_2 = release_date.str.contains(date_pat_2, flags=re.IGNORECASE, na=False)
# # release_date[matches_pat_2].sample(50)
# release_date[~matches_pat_1 & ~matches_pat_2].sample(50)

# # pattern 3: month name, year
# date_pat_3 = r"\d{0,2}\s*\w{3,10}\s\d{4}"
# matches_pat_3 = release_date.str.contains(date_pat_3, flags=re.IGNORECASE, na=False)
# # release_date[~matches_pat_1 & ~matches_pat_2 & ~matches_pat_3].sample(50)

# # pattern 4: four letter year
# date_pat_4 = r"\d{4}"
# matches_pat_4 = release_date.str.contains(date_pat_4, flags=re.IGNORECASE, na=False)
# release_date[~matches_pat_1 & ~matches_pat_2 & ~matches_pat_3 & ~matches_pat_4]

In [17]:
wiki_movies_df.head(20)

Unnamed: 0,url,year,imdb_link,title,Screenplay by,Story by,Based on,Starring,Narrated by,Music by,...,Alma mater,Film(s),Screen story by,Original work,Television series,imdb_id,box_office,budget,release_date,running_time
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay","[Cliff Eidelman, Yello]",...,,,,,,tt0098987,21400000.0,20000000.0,1990-07-11,102.0
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Maurice Jarre,...,,,,,,tt0098994,2700000.0,6000000.0,1990-05-17,114.0
2,https://en.wikipedia.org/wiki/Air_America_(film),1990.0,https://www.imdb.com/title/tt0099005/,Air America,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Charles Gross,...,,,,,,tt0099005,57718.0,35000000.0,1990-08-10,113.0
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990.0,https://www.imdb.com/title/tt0099012/,Alice,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,,...,,,,,,tt0099012,7331.0,12000000.0,1990-12-25,106.0
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990.0,https://www.imdb.com/title/tt0099018/,Almost an Angel,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Maurice Jarre,...,,,,,,tt0099018,6939.0,25000000.0,1990-12-19,95.0
5,https://en.wikipedia.org/wiki/The_Ambulance,1990.0,https://www.imdb.com/title/tt0099026/,The Ambulance,,,,"[Eric Roberts, James Earl Jones, Red Buttons, ...",,Jay Chattaway,...,,,,,,tt0099026,,,1990-03-22,95.0
6,https://en.wikipedia.org/wiki/American_Dream_(...,1990.0,https://www.imdb.com/title/tt0099028/,American Dream,,,,,,Michael Small,...,,,,,,tt0099028,,,1990-10-06,100.0
7,https://en.wikipedia.org/wiki/American_Ninja_4...,1990.0,https://www.imdb.com/title/tt0101326/,American Ninja 4: The Annihilation,,,,"[Michael Dudikoff, David Bradley, James Booth,...",,,...,,,,,,tt0101326,,,1991-03-08,99.0
8,https://en.wikipedia.org/wiki/Andre%27s_Mother,1990.0,https://www.imdb.com/title/tt0099037/,Andre's Mother,,,,"[Richard Thomas, Sada Thompson, Sylvia Sidney]",,Jonathan Sheffer,...,,,,,,tt0099037,,,1990-03-07,50.0
9,https://en.wikipedia.org/wiki/Angel_Town_(film),1990.0,https://www.imdb.com/title/tt0099039/,Angel Town,,,,"[Olivier Gruner, Theresa Saldana, Frank Aragon...",,Terry Plumeri,...,,,,,,tt0099039,855810.0,,1990-02-23,102.0


In [None]:
running_time_extract = running_time.str.extract(r"(\d+)\s*ho?u?r?s?\s*(\d*)|(\d{1,3})\s*m")

running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)
wiki_movies_df.drop('Running time', axis=1, inplace=True)
wiki_movies_df[['running_time']].sample(50)