# Import Wikipedia json and Kaggle csv data

In [None]:
import json
import pandas as pd
import numpy as np
import re

In [None]:
# Load the scraped Wikipedia movie records from the JSON file.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of being left to the platform default, which is not UTF-8 on every system
# and would garble or reject any non-ASCII text in the file.
with open("wikipedia-movies.json", mode="r", encoding="utf-8") as file:
    wiki_movies_raw = json.load(file)

In [None]:
# How many raw records were loaded.
len(wiki_movies_raw)

In [None]:
# see first 5 of raw json data
wiki_movies_raw[:5]

In [None]:
# see last 5 of raw json data
wiki_movies_raw[-5:]

In [None]:
# Field names of the first record only; other records may carry different keys.
wiki_movies_raw[0].keys()

In [None]:
# Load the Kaggle movie metadata and the ratings CSV into DataFrames.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
kaggle_metadata = pd.read_csv("movies_metadata.csv", low_memory = False)
ratings = pd.read_csv("../ratings.csv")

In [None]:
# Preview the first rows of the Kaggle metadata.
kaggle_metadata.head()

In [None]:
# Column names, dtypes and non-null counts of the Kaggle metadata.
kaggle_metadata.info()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Column names and dtypes of the ratings table.
ratings.info()

In [None]:
# Put the unfiltered Wikipedia records in a DataFrame to inspect their columns.
wiki_movies = pd.DataFrame(wiki_movies_raw)
wiki_movies.head(5)

In [None]:
# Every column name produced by the raw records.
wiki_movies.columns.to_list()

In [None]:
# Restrict the raw records to the ones that satisfy all of these conditions:
#   * has an "imdb_link" field,
#   * has no "No. of episodes" field,
#   * credits a director under either "Director" or "Directed by".
wiki_movies1 = [
    record
    for record in wiki_movies_raw
    if "imdb_link" in record
    and "No. of episodes" not in record
    and any(field in record for field in ("Director", "Directed by"))
]
len(wiki_movies1)

In [None]:
# create a function to clean up the data with movie as parameter
def clean_movie(movie):
    """Return a cleaned copy of one Wikipedia movie record.

    The input mapping is copied first, so the caller's record is never
    modified. Two normalisations are then applied to the copy:

    * Alternate-title fields (the language / romanisation keys listed in
      ``alt_title_keys``) are removed from the top level and gathered into a
      single nested dict stored under ``'alt_titles'``. That key is only
      added when at least one such field was present.
    * Synonymous field names are renamed to one canonical name. Renames run
      in the listed order; if the canonical name already holds a value, it is
      overwritten by the value of the field being renamed.

    Returns the cleaned dict.
    """
    # Work on a shallow copy so edits stay local to this function.
    cleaned = dict(movie)

    alt_title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese',
        'French', 'Hangul', 'Hebrew', 'Hepburn', 'Japanese',
        'Literally', 'Mandarin', 'McCune–Reischauer', 'Original title',
        'Polish', 'Revised Romanization', 'Romanized', 'Russian',
        'Simplified', 'Traditional', 'Yiddish',
    )

    # Move every alternate-title field that is present into its own dict.
    collected_titles = {
        name: cleaned.pop(name) for name in alt_title_keys if name in cleaned
    }
    if collected_titles:
        cleaned['alt_titles'] = collected_titles

    # (old name, canonical name) pairs. Order matters: 'Released' is first
    # mapped to 'Release Date', which the next pair maps to 'Release date'.
    renames = (
        ("Adaptation by", "Writer(s)"),
        ("Country of origin", "Country"),
        ("Directed by", "Director"),
        ("Distributed by", "Distributor"),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Original release', 'Release date'),
        ('Music by', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Released', 'Release Date'),
        ('Release Date', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )
    for source_name, canonical_name in renames:
        if source_name in cleaned:
            cleaned[canonical_name] = cleaned.pop(source_name)

    return cleaned

In [None]:
# Clean every filtered record with clean_movie, load the cleaned records into
# a DataFrame, and list the resulting column names in alphabetical order.
clean_movies = list(map(clean_movie, wiki_movies1))
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns)

In [None]:
# Pull the IMDb title ID ("tt" followed by digits) out of each IMDb link.
# IDs have at least seven digits and newer titles have eight, so the pattern
# takes seven or more digits rather than cutting an ID off after the seventh.
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7,})')
print(len(wiki_movies_df))

# Drop rows that repeat an IMDb ID already seen, keeping the first occurrence.
wiki_movies_df.drop_duplicates(subset="imdb_id", inplace=True)
print(len(wiki_movies_df))
wiki_movies_df.head()

In [None]:
# Spot-check a single column by position: count the nulls in the column at
# positional index 38.
print("Number of null values in column 38 : " + 
       str(wiki_movies_df.iloc[:, 38].isnull().sum()))

In [None]:
# Build a [column name, number of null values] pair for every column.
[[column, wiki_movies_df[column].isnull().sum()] for column in wiki_movies_df.columns]

In [None]:
# Reduce the dataset to columns that are reasonably populated: a column is
# kept only when its null count is below 90% of the number of rows.
null_limit = len(wiki_movies_df) * 0.9
wiki_columns_to_keep = [
    column
    for column in wiki_movies_df.columns
    if wiki_movies_df[column].isnull().sum() < null_limit
]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [None]:
# Columns, dtypes and non-null counts after the sparse columns were removed.
wiki_movies_df.info()

In [None]:
# The columns are converted to proper dtypes with regular expressions, which
# only work on strings. Start with box office and drop its missing values.
box_office = wiki_movies_df["Box office"].dropna()

In [None]:
# Show the box office entries that are NOT strings.
# A named function could be used for the test:
# def is_not_a_string(x):
#     return type(x) != str
# box_office[box_office.map(is_not_a_string)]

# A lambda does the same check in a single line.
box_office[box_office.map(lambda x: type(x) != str)]

In [None]:
# Some entries are lists; join their items with a space so every entry that
# was a list becomes a single string. Other values are left unchanged.
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)
box_office.head(40)

In [None]:
# Form one: "$" + number (optional decimal part) + "million"/"billion"
# (the second "i" is optional, so "millon"/"billon" also match).
money_form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
matches_1 = box_office.str.contains(money_form_one, flags=re.IGNORECASE, na=False) # boolean mask, True where the row contains form one
# number of rows that match the form
matches_1.sum()

In [None]:
# Form two: "$" + one to three digits + one or more ",ddd" groups.
money_form_two = r'\$\s*\d{1,3}(?:,\d{3})+'
matches_2 = box_office.str.contains(money_form_two, na=False)
matches_2.sum()

In [None]:
# find which rows match neither regex
box_office[~matches_1 & ~matches_2]

In [None]:
# Collapse ranges such as "$10–20 million" to their upper bound: everything
# from the "$" through the last hyphen, em dash or en dash is replaced by a
# single "$". The negative lookahead leaves a dash alone when it is directly
# followed by a lowercase letter.
# The dash class lists all three dash characters, matching the one used for
# the budget column.
box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
box_office

In [None]:
# Money patterns used for extraction.
# match_3: "$" + number (optional decimal part) + "million"/"billion".
# match_4: "$" + one to three digits + one or more groups of three digits
#          separated by "," or ".", not followed by " million"/" billion".
match_3 = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
match_4 = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

In [None]:
# create a function to transform extracted values into numeric values
def parse_boxoffice(v):
    """Convert one extracted box-office string into a dollar amount.

    Recognised forms, matched case-insensitively at the start of the string:

    * ``$N million`` / ``$N.N million``  -> N * 10**6
    * ``$N billion`` / ``$N.N billion``  -> N * 10**9
    * ``$NNN,NNN,NNN`` where ``,`` or ``.`` separates the thousands groups

    Only the matched number is converted, so text that follows the amount
    does not affect the result.

    Args:
        v: The value to parse.

    Returns:
        The amount as a float, or ``np.nan`` when ``v`` is not a string or
        matches none of the forms above.
    """
    if not isinstance(v, str):
        return np.nan

    # "$N million" / "$N billion": capture the number and the unit letter.
    scaled = re.match(r'\$\s*(\d+\.?\d*)\s*([mb])illi?on', v, flags=re.IGNORECASE)
    if scaled:
        number, unit = scaled.groups()
        multiplier = {'m': 10**6, 'b': 10**9}[unit.lower()]
        return float(number) * multiplier

    # "$NNN,NNN,NNN": the pattern accepts both "," and "." between groups, so
    # both are removed before conversion (a "." here is a thousands
    # separator, not a decimal point).
    grouped = re.match(
        r'\$\s*(\d{1,3}(?:[,\.]\d{3})+)(?!\s[mb]illion)', v, flags=re.IGNORECASE
    )
    if grouped:
        return float(re.sub(r'[,\.]', '', grouped.group(1)))

    # Anything else is treated as unparseable.
    return np.nan

In [None]:
# Extract and convert using regex variables defined above
wiki_movies_df["Box office"] = box_office.str.extract(f'({match_3}|{match_4})', flags=re.IGNORECASE)[0].apply(parse_boxoffice)

In [None]:
wiki_movies_df["Box office"]

In [None]:
wiki_movies_df.drop("Box office", axis=1, inplace=True)

In [None]:
# Check the columns and dtypes again before moving on to the budget column.
wiki_movies_df.info()

In [None]:
# Begin parsing budget data by dropping null values
budget = wiki_movies_df["Budget"].dropna()

In [None]:
# Join list entries into one space-separated string; leave other values as-is.
budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

In [None]:
# Replace everything from a '$' through the last hyphen, em dash or en dash
# with a single '$' (skipped when a lowercase letter directly follows the dash).
budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
budget

In [None]:
matches1 = budget.str.contains(match_3, flags=re.IGNORECASE, na=False) # boolean mask, True where the row contains match_3
# number of rows that match the form
matches1.sum()

In [None]:
matches2 = budget.str.contains(match_4, flags=re.IGNORECASE, na=False) # boolean mask, True where the row contains match_4
# number of rows that match the form
matches2.sum()

In [None]:
# number of rows that match neither pattern
len(budget[~matches1 & ~matches2])

In [None]:
# Remove citation markers such as "[1]" together with any whitespace after them.
budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)
# matches1 / matches2 were computed before the citations were removed, so this
# shows the same previously unmatched rows, now without their citation markers.
budget[~matches1 & ~matches2]

In [None]:
# create a function to transform extracted values into numeric values
def parse_budget(v):
    """Convert one extracted budget string into a dollar amount.

    Recognised forms, matched case-insensitively at the start of the string:

    * ``$N million`` / ``$N.N million``  -> N * 10**6
    * ``$N billion`` / ``$N.N billion``  -> N * 10**9
    * ``$NNN,NNN,NNN`` where ``,`` or ``.`` separates the thousands groups

    Only the matched number is converted, so text that follows the amount
    does not affect the result.

    Args:
        v: The value to parse.

    Returns:
        The amount as a float, or ``np.nan`` when ``v`` is not a string or
        matches none of the forms above.
    """
    if not isinstance(v, str):
        return np.nan

    # "$N million" / "$N billion": capture the number and the unit letter.
    scaled = re.match(r'\$\s*(\d+\.?\d*)\s*([mb])illi?on', v, flags=re.IGNORECASE)
    if scaled:
        number, unit = scaled.groups()
        multiplier = {'m': 10**6, 'b': 10**9}[unit.lower()]
        return float(number) * multiplier

    # "$NNN,NNN,NNN": the pattern accepts both "," and "." between groups, so
    # both are removed before conversion (a "." here is a thousands
    # separator, not a decimal point).
    grouped = re.match(
        r'\$\s*(\d{1,3}(?:[,\.]\d{3})+)(?!\s[mb]illion)', v, flags=re.IGNORECASE
    )
    if grouped:
        return float(re.sub(r'[,\.]', '', grouped.group(1)))

    # Anything else is treated as unparseable.
    return np.nan

In [None]:
# Money patterns used to extract budget amounts.
# budget3: "$" + number (optional decimal part) + "million"/"billion".
# budget4: "$" + one to three digits + one or more groups of three digits
#          separated by "," or ".", not followed by " million"/" billion".
budget3 = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
budget4 = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

In [None]:
# Extract the first amount matching either pattern from each budget string,
# convert it with parse_budget, and store the numbers back in "Budget".
wiki_movies_df["Budget"] = budget.str.extract(f'({budget3}|{budget4})', flags=re.IGNORECASE)[0].apply(parse_budget)

In [None]:
# Inspect the parsed budget column.
wiki_movies_df["Budget"]

In [None]:
# Parse release date column: drop missing values and join list entries into
# one space-separated string (other values are left unchanged).
release_date = wiki_movies_df["Release date"].dropna().apply(lambda x: ' ' .join(x) if type(x) == list else x)

In [None]:
# Look at every release date string to see which formats occur.
release_date.to_list()

In [None]:
# date_1: full month name, day, comma, four-digit year  ("January 1, 2000")
# date_2: year-month-day with dashes                    ("2000-01-01")
# date_3: full month name and four-digit year           ("January 2000")
# date_4: four consecutive digits (a bare year)         ("2000")
date_1 = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
date_2 = r'\d{4}-[01]\d-[0123]\d'
date_3 = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
date_4 = r'\d{4}'

In [None]:
# Preview what the combined pattern extracts; alternatives are tried in the
# order date_1, date_2, date_3, date_4.
release_date.str.extract(f'({date_1}|{date_2}|{date_3}|{date_4})', flags=re.IGNORECASE)

In [None]:
# Extract the first recognisable date from each string and convert it to a
# datetime. flags=re.IGNORECASE matches the preview extraction, so month names
# are recognised regardless of capitalisation; without it a lower-case month
# would be skipped and only the bare year would be picked up.
# NOTE(review): infer_datetime_format is deprecated in pandas 2.0 - confirm
# the pandas version this notebook targets before removing it.
wiki_movies_df['Release date'] = pd.to_datetime(release_date.str.extract(f'({date_1}|{date_2}|{date_3}|{date_4})', flags=re.IGNORECASE)[0], infer_datetime_format=True)
wiki_movies_df.head()

In [None]:
# Parse running time: drop missing values and join list entries into one
# space-separated string (other values are left unchanged).
running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [None]:
# Count the entries that start with optional digits, optional whitespace and
# then an "m" (case-insensitive).
running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE, na=False).sum()

In [None]:
# Show the entries that do NOT start that way.
running_time[running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE, na=False) != True]

In [None]:
# parse the remaining running time formats
# The pattern has two alternatives and three capture groups:
#   groups 0 and 1: hours, optionally followed by minutes
#                   ("1 h 48", "1hr 30", "2 hours 5")
#   group 2:        minutes only ("120 min", "95m")
# The hour unit is "h" followed by optional "o", "u", "r", "s", so "h", "hr",
# "hrs", "hour" and "hours" are all accepted.
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

In [None]:
# Inspect the dtypes of the extracted capture-group columns.
running_time_extract.info()

In [None]:
# Convert every extracted column to numbers. Values that cannot be converted
# become NaN (errors='coerce'), and all NaN are then replaced with 0.
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [None]:
running_time_extract
# Columns 0, 1 and 2 are the three capture groups of the extraction pattern.

In [None]:
# Combine the columns row by row: when column 2 is 0, use
# column 0 * 60 + column 1; otherwise use column 2.
wiki_movies_df['running time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

In [None]:
wiki_movies_df.head()

In [None]:
# drop old Running time column
wiki_movies_df.drop("Running time", axis=1, inplace=True)

In [None]:
# Clean Kaggle Data
# Start from the current column dtypes.
kaggle_metadata.info()

In [None]:
# convert data types of adult, video, release_date, budget, and id columns
# First show the rows whose 'adult' value is neither 'True' nor 'False'.
kaggle_metadata[~kaggle_metadata['adult'].isin(['True','False'])]

In [None]:
# Keep only the rows where 'adult' is the string 'False', then drop the
# 'adult' column.
kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult', axis=1)

In [None]:
kaggle_metadata.head()

In [None]:
# clean video column: see which values occur and how often
kaggle_metadata['video'].value_counts()

In [None]:
# convert to boolean: True where the value equals the string 'True',
# False for everything else
kaggle_metadata['video'] = kaggle_metadata['video'] == 'True'

In [None]:
# Confirm the dtype changes.
kaggle_metadata.info()

In [None]:
# Convert the remaining numeric columns.
# errors='raise' makes a value that cannot be converted stop the cell instead
# of silently becoming NaN.
kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
# popularity is converted from its own column; reading it from 'budget' would
# overwrite every popularity value with the budget.
kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise')

In [None]:
# Convert the Kaggle release_date column to datetime values.
kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date'])

In [None]:
# ratings df cleaning
# show_counts=True forces the non-null counts to be displayed. It replaces the
# older null_counts argument, which was deprecated in pandas 1.2 and removed
# in pandas 2.0.
ratings.info(show_counts=True)

In [None]:
# Convert timestamp to datetime; the values are interpreted as seconds since
# the Unix epoch (unit='s').
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [None]:
# look at a histogram to catch any obvious errors in ratings
# Display floats with thousands separators and two decimals (this changes the
# pandas display option for the rest of the session).
pd.options.display.float_format = '{:20,.2f}'.format
ratings['rating'].plot(kind='hist')
# Summary statistics for the rating column.
ratings['rating'].describe()

In [None]:
# Merge the wiki and kaggle dataframes on imdb_id. Columns that exist in both
# frames get the suffix '_wiki' or '_kaggle'. pd.merge defaults to an inner
# join, so only movies found in both frames are kept.
final_movies_df = pd.merge(wiki_movies_df, kaggle_metadata, on='imdb_id', suffixes=['_wiki', '_kaggle'])

In [None]:
final_movies_df.head()

In [None]:
# begin cleaning redundant data (7 total columns)
final_movies_df[['title_wiki','title_kaggle']] # compare the two title columns side by side

In [None]:
# find the rows where the two titles don't match
final_movies_df[final_movies_df['title_wiki'] != final_movies_df['title_kaggle']][['title_wiki','title_kaggle']]
