In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress

# Import the IMDb API key
# from api_keys import imdb_api_key

In [None]:
# Relative locations of the three source CSV files
netflix_path = "../data/netflix_titles.csv"
disney_path = "../data/disney_plus_titles.csv"
prices_path = "../data/subscription_prices.csv"

# Load the Netflix, Disney+ and subscription-price CSVs, in that order
netflix_data, disney_data, prices_data = (
    pd.read_csv(csv_path)
    for csv_path in (netflix_path, disney_path, prices_path)
)

In [None]:
# Create the Netflix working dataframe and review its length.
# pd.DataFrame(netflix_data) would only re-wrap the frame returned by read_csv without
# copying it, so (depending on the pandas version) the date cleaning applied to netflix_df
# in later cells could also alter netflix_data. An explicit copy keeps the raw load intact.
netflix_df = netflix_data.copy()
len(netflix_df)

In [None]:
# date_added is the column used to restrict Netflix to 2019 - 2021 so it lines up with the
# date_added range of the Disney+ dataset.
# Values such as "September 24, 2018" may carry extra spaces, so trim them before parsing;
# errors='coerce' turns anything that cannot be parsed into NaT.
netflix_df['date_added'] = pd.to_datetime(
    netflix_df['date_added'].str.strip(),
    errors='coerce',
)

# Rows left without a date_added would interfere with date filtering: collect and count them
netflix_blank_dates = netflix_df.loc[netflix_df['date_added'].isna()]
len(netflix_blank_dates)

In [None]:
# Discard every Netflix row whose date_added is missing
netflix_df = netflix_df.dropna(axis='index', how='any', subset=['date_added'])

# Sanity check: the number of missing date_added values remaining after the drop
int(netflix_df['date_added'].isna().sum())

In [None]:
# Keep only titles added from January 1, 2019 through December 31, 2021 (both ends inclusive).
# .copy() makes the result an independent dataframe rather than a slice of the unfiltered
# frame, so adding columns to netflix_df afterwards does not raise SettingWithCopyWarning.
netflix_df = netflix_df.loc[
    netflix_df['date_added'].between('2019-01-01', '2021-12-31')
].copy()
len(netflix_df)

In [None]:
# Create the Disney+ working dataframe and preview its first rows.
# pd.DataFrame(disney_data) would only re-wrap the frame returned by read_csv without
# copying it, so (depending on the pandas version) later in-place edits to disney_df could
# also alter disney_data. An explicit copy keeps the raw load intact.
disney_df = disney_data.copy()
disney_df.head()

In [None]:
# Parse Disney+ date_added the same way as the Netflix column: trim stray whitespace and
# convert to datetime (unparseable values become NaT). Without this the Disney+ dates stay
# as strings while the Netflix dates are datetimes, which leaves a mixed-type date_added
# column once the two dataframes are concatenated.
disney_df['date_added'] = pd.to_datetime(
    disney_df['date_added'].str.strip(),
    errors='coerce',
)

# There are a handful of blank date_added values that will impact the data
disney_blank_dates = disney_df[disney_df['date_added'].isna()]
len(disney_blank_dates)

In [None]:
# Discard every Disney+ row whose date_added is missing
disney_df = disney_df.dropna(axis='index', how='any', subset=['date_added'])

# Sanity check: the number of missing date_added values remaining after the drop
int(disney_df['date_added'].isna().sum())

In [None]:
# Build the subscription-price dataframe and report how many rows it holds
prices_df = pd.DataFrame(data=prices_data)
len(prices_df.index)

In [None]:
# Tag every row with its streaming service so the source stays identifiable after combining.
# .assign returns a new dataframe, which avoids the SettingWithCopyWarning that item
# assignment raises when the frame is a filtered slice (netflix_df comes from a .loc filter).
netflix_df = netflix_df.assign(platform="Netflix")
disney_df = disney_df.assign(platform="Disney+")

In [None]:
# Stack the Netflix rows on top of the Disney+ rows and renumber the index from 0
combined_df = pd.concat(
    objs=[netflix_df, disney_df],
    axis=0,
    ignore_index=True,
)

In [None]:
# Review the dtype of each column in the combined Netflix + Disney+ dataframe
combined_df.dtypes

In [None]:
# Split the comma-separated listed_in genre column into a list of genres per title.
# Missing values become '' first, so those rows end up as [''].
combined_df['listed_in'] = (
    combined_df['listed_in']
    .fillna('')
    .astype(str)
    .str.split(',')
    # Strip each genre individually: trimming only the whole string before splitting
    # leaves every genre after the first with a leading space (e.g. ' Dramas').
    .apply(lambda genres: [genre.strip() for genre in genres])
)

# Primary genre = the first genre listed for the title ('' when listed_in was blank)
combined_df['primary_genre'] = combined_df['listed_in'].str[0]
combined_df.head(30)

# TODO (Garrett): this cell was flagged as still needing work