In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress


# Import the OMDB API key
from api_keys import omdb_api_key


ImportError: cannot import name 'omdb_api_key' from 'api_keys' (C:\Users\rache\netflixandchill\notebook\api_keys.py)

In [7]:
# Data files
netflix_path = "../data/netflix_titles.csv"
disney_path = "../data/disney_plus_titles.csv"
prices_path = "../data/subscription_prices.csv"

# Read the Netflix and Disney+ data
netflix_data = pd.read_csv(netflix_path)
disney_data = pd.read_csv(disney_path)
prices_data = pd.read_csv(prices_path)

In [8]:
# Create Netflix dataframe and review length
netflix_df = pd.DataFrame(netflix_data)
len(netflix_df)

8807

In [9]:
# Netflix date_added is the key column for filtering to 2019 - 2021 to align with Disney+ dataset date_added dates
# The following steps trim the "September 24, 2018" dates to remove extra spaces
netflix_df['date_added'] = netflix_df['date_added'].str.strip()
# netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'], errors='coerce')

# There are a handful of blank date_added that will impact date filtering
netflix_blank_dates = netflix_df[netflix_df['date_added'].isna()]
len(netflix_blank_dates)

10

In [10]:
# Drop rows with blank dates
netflix_df = netflix_df.dropna(subset=['date_added'])

# Verify the number of rows with blank dates after dropping
len(netflix_df[netflix_df['date_added'].isna()])

0

In [11]:
# Extract only the date portion and convert it to string
netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'], errors='coerce')
netflix_df['date_added'] = netflix_df['date_added'].dt.date.astype(str)


In [12]:
# Filter out for dates greater than January 1, 2019
netflix_df = netflix_df.loc[(netflix_df['date_added'] >= '2019-01-01') & (netflix_df['date_added'] <= '2021-12-31')]
len(netflix_df)

5393

In [13]:
# Create Disney+ dataframe and review length
disney_df = pd.DataFrame(disney_data)
len(disney_df)

1450

In [14]:
# There are a handful of blank date_added that will impact data
disney_blank_dates = disney_df[disney_df['date_added'].isna()]
len(disney_blank_dates)

3

In [15]:
# Drop rows with blank dates
disney_df = disney_df.dropna(subset=['date_added'])

# Verify the number of rows with blank dates after dropping
len(disney_df[disney_df['date_added'].isna()])

0

In [16]:
# Extract only the date portion and convert it to string
disney_df['date_added'] = pd.to_datetime(disney_df['date_added'], errors='coerce')
disney_df['date_added'] = disney_df['date_added'].dt.date.astype(str)

In [17]:
# Sorted data frames by date_added
sorted_netflix_df = netflix_df.sort_values(by = 'date_added', ascending=False)
sorted_disney_df = disney_df.sort_values(by = 'date_added', ascending=False)

In [18]:
# Most recent 500 titles per platform
recentadd_netflix_df = sorted_netflix_df.head(500)
recentadd_disney_df = sorted_disney_df.head(500)

In [19]:
# Create Subscription Price dataframe and review length
prices_df = pd.DataFrame(prices_data)
len(prices_df)

777

In [20]:
# Re-establishing filtered DataFrame as primary DataFrame for analysis
netflix_df = recentadd_netflix_df
disney_df = recentadd_disney_df

In [21]:
# Add platform column to each streaming service to maintain association
netflix_df.loc[:, "platform"] = "Netflix"
disney_df.loc[:, "platform"] = "Disney+"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_df.loc[:, "platform"] = "Netflix"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disney_df.loc[:, "platform"] = "Disney+"


In [22]:
# Concatenate Netflix and Disney+ dataframes
combined_df = pd.concat([netflix_df, disney_df], ignore_index=True)
len(combined_df)

1000

In [23]:
# Find duplicate titles to remove from analysis
duplicate_titles = combined_df[combined_df.duplicated(subset=['title'])]
len(duplicate_titles)

0

In [24]:
# Extract the primary genre from listed_in by pulling the first values within the list
combined_df['listed_in'] = combined_df['listed_in'].str.split(',')
combined_df['primary_genre'] = combined_df['listed_in'].str.get(0)

# blank_genre = combined_df[combined_df['primary_genre'].isna()]
# len(blank_genre)

In [25]:
# Clean dataframe with columns of importance for data analysis
combined_df = combined_df.loc[:, ['title', 'type','release_year', 'primary_genre', 'platform', 'date_added']]

In [26]:
# Convert 'date_added' column to datetime
combined_df['date_added'] = pd.to_datetime(combined_df['date_added'])
# Extract year from 'date_added' and store it in a new column 'year_added'
combined_df['year_added'] = combined_df['date_added'].dt.year

combined_df.head()

Unnamed: 0,title,type,release_year,primary_genre,platform,date_added,year_added
0,Dick Johnson Is Dead,Movie,2020,Documentaries,Netflix,2021-09-25,2021
1,My Little Pony: A New Generation,Movie,2021,Children & Family Movies,Netflix,2021-09-24,2021
2,"Vendetta: Truth, Lies and The Mafia",TV Show,2021,Crime TV Shows,Netflix,2021-09-24,2021
3,The Starling,Movie,2021,Comedies,Netflix,2021-09-24,2021
4,The Great British Baking Show,TV Show,2021,British TV Shows,Netflix,2021-09-24,2021


In [27]:
# Add empty columns for IMDb metadata to be pulled from API
combined_df['imdb_id'] = ''
combined_df['imdb_rating'] = ''
combined_df['imdb_votes'] = ''
combined_df['box_office_sales'] = ''
combined_df['production_cost'] = ''

In [36]:
# Enter in field value replacements for genre normalization (i.e. Documentaries and Docuseries to Documentary)
len(combined_df['primary_genre'].unique())

42

In [41]:
genres = combined_df['primary_genre'].unique()
genres

array(['Documentary', 'Children & Family Movies', 'Crime TV Shows',
       'Comedy', 'British TV Shows', 'Drama', 'International TV Shows',
       'TV Dramas', 'TV Comedies', 'Thrillers', 'Horror Movies',
       "Kids' TV", 'Action & Adventure', 'Reality TV', 'Anime Series',
       'International Movies', 'Sci-Fi & Fantasy',
       'Classic/Cult/Independent', 'TV Shows', 'Stand-Up Comedy',
       'TV Action & Adventure', 'Behind the Scenes',
       'Stand-Up Comedy & Talk Shows', 'Classic & Cult TV',
       'Anime Features', 'Romantic TV Shows', 'Animated',
       'Music/Musicals', 'Historical', 'Animals & Nature',
       'Action/Adventure', 'Concert Film', 'Science Fiction',
       'Coming of Age', 'Lifestyle', 'Reality', 'Family/Kids',
       'Thriller/Crime', 'Talk Show', 'SciFi/Fantasy', 'Variety',
       'Game Show / Competition'], dtype=object)

In [38]:
# normalization
data_iteration_1 = combined_df.replace(to_replace='Animation', value = 'Animated')
data_iteration_2 = data_iteration_1.replace(to_replace=['Docuseries', 'Documentaries'], value = 'Documentary')
data_iteration_3 = data_iteration_2.replace(to_replace=['Biographical', 'Historical'], value = 'Historical')
data_iteration_4 = data_iteration_3.replace(to_replace=['Thriller', 'Crime'], value = 'Thriller/Crime')
data_iteration_5 = data_iteration_4.replace(to_replace=['Stand-UpComedy&Talk', 'Comedies'], value = 'Comedy')
data_iteration_6 = data_iteration_5.replace(to_replace=['Stand-UpComedy', 'Buddy'], value = 'Comedy')
data_iteration_7 = data_iteration_6.replace(to_replace=['Classic Movies', 'Classic&Cult'], value = 'Classic/Cult/Independent')
data_iteration_8 = data_iteration_7.replace(to_replace='IndependentMovies', value = 'Classic/Cult/Independent')
data_iteration_9 = data_iteration_8.replace(to_replace=['Horror', 'HorrorMovies'], value = 'Horror')
data_iteration_10 = data_iteration_9.replace(to_replace=['Music&Musicals', 'ConcertFilm'], value = 'Music/Musicals')
data_iteration_11 = data_iteration_10.replace(to_replace=['Music', 'Musical'], value = 'Music/Musicals')
data_iteration_12 = data_iteration_11.replace(to_replace='Dance', value = 'Music/Musicals')
data_iteration_13 = data_iteration_12.replace(to_replace=['Family', 'Kids'], value = 'Family/Kids')
data_iteration_14 = data_iteration_13.replace(to_replace=["Kids'", 'Children&FamilyMovies'], value = 'Family/Kids')
data_iteration_15 = data_iteration_14.replace(to_replace=['ScienceFiction', 'Fantasy'], value = 'SciFi/Fantasy')
data_iteration_16 = data_iteration_15.replace(to_replace='Sci-Fi&Fantasy', value = 'SciFi/Fantasy')
data_iteration_17 = data_iteration_16.replace(to_replace=['Action-Adventure', 'Action&Adventure'], value = 'Action/Adventure')
data_iteration_18 = data_iteration_17.replace(to_replace=['International', 'InternationalMovies'], value = 'International')
data_iteration_19 = data_iteration_18.replace(to_replace='Dramas', value = 'Drama')
data_iteration_20 = data_iteration_19.replace(to_replace='Gameshow/Competition', value = 'Reality')
data_iteration_21 = data_iteration_20.replace(to_replace=['Movies', 'Series'], value = 'Behind the Scenes')
combined_df = data_iteration_21.replace(to_replace='Anthology', value = 'Behind the Scenes')
combined_df

Unnamed: 0,title,type,release_year,primary_genre,platform,date_added,year_added,imdb_id,imdb_rating,imdb_votes,box_office_sales,production_cost
0,Dick Johnson Is Dead,Movie,2020,Documentary,Netflix,2021-09-25,2021,,,,,
1,My Little Pony: A New Generation,Movie,2021,Children & Family Movies,Netflix,2021-09-24,2021,,,,,
2,"Vendetta: Truth, Lies and The Mafia",TV Show,2021,Crime TV Shows,Netflix,2021-09-24,2021,,,,,
3,The Starling,Movie,2021,Comedy,Netflix,2021-09-24,2021,,,,,
4,The Great British Baking Show,TV Show,2021,British TV Shows,Netflix,2021-09-24,2021,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Disney A.N.T. Farm,TV Show,2011,Comedy,Disney+,2020-06-26,2020,,,,,
996,Marvel Studios' Avengers: Infinity War,Movie,2018,Action/Adventure,Disney+,2020-06-26,2020,,,,,
997,Tarzan II,Movie,2005,Action/Adventure,Disney+,2020-06-26,2020,,,,,
998,Tarzan,Movie,1999,Action/Adventure,Disney+,2020-06-26,2020,,,,,


In [None]:
# TEST
# Response 200 testing
# url = "http://www.omdbapi.com/?t="
# api_key = "&apikey=" + omdb_api_key
# response = requests.get(url + "Aliens" + api_key, verify=False)
# print(response)

In [None]:
# TEST
# Printing API URLs out

# Establishing base URL for OMDB API
# url = "http://www.omdbapi.com/"

# Construct the API request URL with the title and API key
# params = {
#        'apikey': omdb_api_key,
#    }

# Loop through each title in the 'title' column of combined_df
# for index, row in combined_df.iterrows():
#    time.sleep(2) # Add a delay to avoid hitting the API too quickly
    
#    title = row['title']
#    params['t'] = title # get title from combined_df
    
# Construct the API URL
#    api_url = url + "?" + "&".join([f"{key}={value}" for key, value in params.items()])

# Print the API URL for the current title
#    print("API URL for", title, ":", api_url)


In [None]:
# this is being worked on - could not get api response to work (garrett) 

# Establishing base URL for OMDB API
url = "http://www.omdbapi.com/"

# Construct the API request URL with the title and API key
params = {
        'apikey': omdb_api_key,
    }

# Loop through each title in the 'title' column of combined_df
for index, row in combined_df.iterrows():
    time.sleep(2) # Add a delay to avoid hitting the API too quickly
    
    title = row['title'] # get title from current row
    params['t'] = title # establish "t" parameter for current title
 
   # Run an API request for each of the titles
    try:
        # Parse the JSON and retrieve data
        omdb_response = requests.get(url, params=params, verify=False)
        omdb_data = omdb_response.json()
    
    # Parse out OMDB ratings, votes, etc.
        id = omdb_data.get('imdbID')
        rating = omdb_data.get('imdbRating')
        votes = omdb_data.get('imdbVotes')
        box_office_sales = omdb_data.get('BoxOffice')
        prod_cost = omdb_data.get('Production')
        
    # Assign OMDB information into combined_df
        combined_df.at[index, "imdb_id"] = id
        combined_df.at[index, "imdb_rating"] = rating
        combined_df.at[index, "imdb_votes"] = votes 
        combined_df.at[index, "box_office_sales"] = box_office_sales 
        combined_df.at[index, "production_cost"] = prod_cost

                             
        print(f"Data retrieved for '{title}': imdbRating = {rating}, imdbVotes = {votes}")
    except Exception as e:
        print(f"Failed to fetch data for '{title}': {e}")