In [1]:
# Import Dependencies
import numpy as np
import os
import pandas as pd


In [2]:
# Path to file directory and variables for the two files.
# Every path is assembled with os.path.join so the separator handling is the
# same for the directory and for the files inside it.
file_dir = os.path.join("..", "Data")

# The Kaggle metadata
kaggle_file = os.path.join(file_dir, "movies_metadata.csv")

# The MovieLens rating data
ratings_file = os.path.join(file_dir, "ratings_small.csv")

In [3]:
# Load the two Kaggle-hosted source tables as pandas DataFrames.

# Movie metadata. low_memory=False disables chunked parsing so that column
# dtypes are inferred from the whole file rather than chunk by chunk.
kaggle_metadata = pd.read_csv(
    kaggle_file,
    low_memory=False,
)

# MovieLens ratings.
ratings = pd.read_csv(
    ratings_file,
)

# TODO: import Viewer Movies and Ratings (from IMDB); not implemented yet.
# viewer_movies =

In [4]:
# Check DataFrames
# Non-null count per column of the Kaggle metadata. Only the final expression
# of a cell is rendered, so this cell holds exactly one check; a row preview
# such as kaggle_metadata.head() needs its own cell (or an explicit display()).
kaggle_metadata.count()

adult                    45466
belongs_to_collection     4494
budget                   45466
genres                   45466
homepage                  7782
id                       45466
imdb_id                  45449
original_language        45455
original_title           45466
overview                 44512
popularity               45461
poster_path              45080
production_companies     45463
production_countries     45463
release_date             45379
revenue                  45460
runtime                  45203
spoken_languages         45460
status                   45379
tagline                  20412
title                    45460
video                    45460
vote_average             45460
vote_count               45460
dtype: int64

In [5]:
# Clean the Kaggle metadata.
# The cleaned frame is built in a single chain and is only bound to a name once
# every conversion has succeeded, so a failing conversion leaves
# kaggle_metadata untouched instead of half-cleaned.
# NOTE: on success kaggle_metadata is rebound to the cleaned frame, which no
# longer has an 'adult' column; reload the CSV before re-running this cell.
movies_df = (
    kaggle_metadata
    # Keep only Non-Adult Movies, then drop Adult Column
    .loc[kaggle_metadata['adult'] == 'False']
    .drop(columns='adult')
    .assign(
        # Convert video Column to Boolean
        # NOTE(review): assumes video holds the strings 'True'/'False'; if the
        # column was parsed as real booleans this comparison is always False
        # -- confirm against the loaded data.
        video=lambda df: df['video'] == 'True',
        # Convert Columns to Numeric (errors='raise' aborts on any value that
        # cannot be parsed instead of silently coercing it)
        budget=lambda df: df['budget'].astype(int),
        id=lambda df: pd.to_numeric(df['id'], errors='raise'),
        popularity=lambda df: pd.to_numeric(df['popularity'], errors='raise'),
        # Convert release_date to datetime
        release_date=lambda df: pd.to_datetime(df['release_date']),
    )
)

# kaggle_metadata and movies_df both refer to the same cleaned DataFrame.
kaggle_metadata = movies_df


In [7]:
# Transform and merge the ratings DataFrame.
# Count the rows for every (movieId, rating) pair, then pivot so that each
# movie is one row and each distinct rating value is one column. The counting
# pass runs exactly once; its result feeds straight into the pivot.
rating_counts = (
    ratings.groupby(['movieId', 'rating'], as_index=False)
    .count()
    # After count(), the userId column holds the number of rows in each group.
    .rename({'userId': 'count'}, axis=1)
    .pivot(index='movieId', columns='rating', values='count')
)

# Rename Ratings Count Columns
rating_counts.columns = ['rating_' + str(col) for col in rating_counts.columns]

# Merge Rating Counts into movies_df
# Left join: every movie is kept, whether or not it has ratings.
# NOTE(review): this assumes the ratings movieId values use the same identifier
# as the metadata 'id' column -- confirm.
movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on='id', right_index=True, how='left')

# Fill in Missing Ratings with Zeroes
# Movies without a match, and rating values nobody gave a movie, are NaN here.
movies_with_ratings_df[rating_counts.columns] = movies_with_ratings_df[rating_counts.columns].fillna(0)

