In [1]:
# Import dependencies
import json
import pandas as pd
import numpy as np

In [2]:
# Directory holding the input files (the Wikipedia JSON and the Kaggle CSVs).
# Later cells build paths as f'{file_dir}<name>', so the trailing slash is required.
file_dir = 'Data/'

In [3]:
# Read in the JSON data.
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the data contains non-ASCII text (e.g. the 'McCune–Reischauer' key).
with open(f'{file_dir}wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [4]:
# Number of raw entries loaded from wikipedia-movies.json
len(wiki_movies_raw)

7311

In [5]:
# First 5 records
#wiki_movies_raw[:5]

In [6]:
# Last 5 records
#wiki_movies_raw[-5:]

In [7]:
# Some records in the middle
#wiki_movies_raw[3600:3605]

In [8]:
# Build the three working DataFrames: one from the Wikipedia entries already
# in memory, and one each from the Kaggle ratings and metadata CSVs in file_dir.
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
ratings = pd.read_csv(f'{file_dir}ratings.csv')
kaggle_metadata = pd.read_csv(
    f'{file_dir}movies_metadata.csv',
    low_memory=False,
)

# Tests
#kaggle_metadata.sample(n=5)
#ratings.sample(n=5)

In [9]:
# test
#wiki_movies_df.head()

In [10]:
# Convert kaggle columns into a list
#wiki_movies_df.columns.to_list()

In [11]:
# Filter the raw Wikipedia entries: keep only those that have an IMDb link
# and a director field (stored under either 'Director' or 'Directed by').
wiki_movies = [
    entry for entry in wiki_movies_raw
    if 'imdb_link' in entry
    and ('Directed by' in entry or 'Director' in entry)
]
# test
#len(wiki_movies)
#wiki_movies

In [12]:
# Create wiki_df from the filtered entries in wiki_movies
wiki_df = pd.DataFrame(wiki_movies)
# test
#wiki_df.head()

In [13]:
# Redo the filter from the raw entries, this time also removing TV series:
# anything carrying a 'No. of episodes' field is dropped, and an entry must
# still have an IMDb link and a director field to be kept.
wiki_movies = [
    entry for entry in wiki_movies_raw
    if 'No. of episodes' not in entry
    and 'imdb_link' in entry
    and ('Directed by' in entry or 'Director' in entry)
]

len(wiki_movies)

7076

In [14]:
# Create a function to filter movie data (non-destructive copy into variable)
#def clean_movie(movie):
#    movie = dict(movie) #create a non-destructive copy
#    return movie

In [15]:
# Addressing languages: show the page URL of every row that has an 'Arabic' value.
# NOTE: this needs wiki_movies_df to still have an 'Arabic' column. A later cell
# rebinds wiki_movies_df to the frame built from clean_movie() output, where that
# field has been moved under 'alt_titles', so re-running this cell afterwards fails.
wiki_movies_df.loc[wiki_movies_df['Arabic'].notnull(), 'url']

7060    https://en.wikipedia.org/wiki/The_Insult_(film)
7293     https://en.wikipedia.org/wiki/Capernaum_(film)
Name: url, dtype: object

In [16]:
# Alphabetical list of every column name in wiki_movies_df
sorted(wiki_movies_df.columns)

['Actor control',
 'Adaptation by',
 'Alias',
 'Alma mater',
 'Also known as',
 'Animation by',
 'Arabic',
 'Area',
 'Area served',
 'Artist(s)',
 'Attraction type',
 'Audio format',
 'Author',
 'Based on',
 'Biographical data',
 'Bopomofo',
 'Born',
 'Box office',
 'Budget',
 'Camera setup',
 'Cantonese',
 'Characters',
 'Children',
 'Chinese',
 'Cinematography',
 'Closing date',
 'Color process',
 'Comics',
 'Composer(s)',
 'Coordinates',
 'Country',
 'Country of origin',
 'Cover artist',
 'Created by',
 'Date premiered',
 'Designer(s)',
 'Developed by',
 'Developer(s)',
 'Dewey Decimal',
 'Died',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Divisions',
 'Duration',
 'Edited by',
 'Editor(s)',
 'Ending theme',
 'Engine',
 'Engine(s)',
 'Executive producer(s)',
 'Family',
 'Fate',
 'Film(s)',
 'Followed by',
 'Format(s)',
 'Formerly',
 'Founded',
 'Founder',
 'Founders',
 'French',
 'Full name',
 'Gender',
 'Genre',
 'Genre(s)',
 'Genres',
 'Gwoyeu Romatzyh',
 'Ha

In [17]:
# Create a function to filter movie data (non-destructive copy into variable) (#8.3.5) (Expected Indent)
#def clean_movie(movie):
#    movie = dict(movie) #create a non-destructive copy
#    alt_titles = {}
#    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
#                'Hangul','Hebrew','Hepburn','Japanese','Literally',
#                'Mandarin','McCune–Reischauer','Original title','Polish',
#                'Revised Romanization','Romanized','Russian',
#                'Simplified','Traditional','Yiddish']:
#        return movie

In [19]:
# Refine function to add alternative titles dict to movie object while popping key from 'movie'
def clean_movie(movie):
    """Return a copy of ``movie`` with its alternate-title fields grouped together.

    Every known alternate-title / foreign-language key present in the record
    is removed from the top level and stored, under the same key, in a dict
    placed at ``'alt_titles'``. That entry is added only when at least one
    such key was found. The record passed in is never modified.
    """
    alt_title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
        'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
        'Revised Romanization', 'Romanized', 'Russian',
        'Simplified', 'Traditional', 'Yiddish',
    )
    cleaned = dict(movie)  # shallow copy, so the caller's record stays intact
    # pop() returns the value and removes the key from the copy in one step
    found = {name: cleaned.pop(name) for name in alt_title_keys if name in cleaned}
    if found:
        cleaned['alt_titles'] = found
    return cleaned

In [20]:
# Run every filtered entry in wiki_movies through clean_movie and collect the results
clean_movies = list(map(clean_movie, wiki_movies))

In [21]:
# Rebuild wiki_movies_df from the cleaned entries (this replaces the frame that
# was built from the raw data) and list its column names alphabetically.
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns)

['Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'Length',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Produced by',
 'Producer',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Recorded',
 'Release date',
 'Released',
 'Running time',
 'Screen story by',
 'Screenplay by',
 'Starring',
 'Story by',
 'Suggested by',
 'Theme music composer',
 'Venue',
 'Voices of',
 'Written by',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']