# Netflix Engagement Reports 2023
## Step 0: Install libraries

In [1]:
#!pip install pandas

In [2]:
#!pip install numpy

In [3]:
#!pip install openpyxl

## Step 1: Import libraries and set options

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime

# Notebook-wide display settings.
# Show at most 10 rows when a DataFrame/Series is rendered.
pd.options.display.max_rows = 10
# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=4)

## Step 2: Read Excel files

In [5]:
# File names of the two half-year engagement report workbooks.
fn1: str = 'What_We_Watched_A_Netflix_Engagement_Report_2023Jan-Jun.xlsx'
fn2: str = 'What_We_Watched_A_Netflix_Engagement_Report_2023Jul-Dec.xlsx'

# Excel column range that is loaded from each workbook.
cols: str = 'B:E'

# Arguments shared by both reads: column labels are taken from row index 5
# (0-based) and only the columns in `cols` are parsed.
shared_read_opts = dict(header=5, usecols=cols)

# No sheet_name is passed for the Jan-Jun workbook, so pandas reads its first
# sheet (this workbook has no separate sheets for TV and movies).
df1 = pd.read_excel(fn1, **shared_read_opts)

# From the Jul-Dec workbook only the sheet named 'TV' is loaded.
df2 = pd.read_excel(fn2, sheet_name='TV', **shared_read_opts)

### Inspect DataFrame for Jan-Jun 2023:

In [None]:
# Bare expression as the cell's last line: the notebook renders the Jan-Jun frame
# (truncated according to the display.max_rows option set in Step 1).
df1

### Inspect DataFrame for Jul-Dec 2023:

In [None]:
# Bare expression as the cell's last line: the notebook renders the Jul-Dec frame
# (truncated according to the display.max_rows option set in Step 1).
df2

## Step 3: Concatenate DataFrames and filter

### Concatenated DataFrame:

In [None]:
# Stack the Jan-Jun rows on top of the Jul-Dec rows. ignore_index=True discards
# the per-file row labels and numbers the combined rows 0..n-1.
df = pd.concat([df1, df2], ignore_index=True)
df

In [None]:
# Parse the 'Release Date' column into pandas datetimes and store the parsed
# values back under the same column name.
release_dates = pd.to_datetime(df['Release Date'])
df['Release Date'] = release_dates
df

### Titles released on or after 2023-01-01:

In [None]:
# Keep only rows whose release date is strictly later than 2022-12-31 00:00.
# Rows with a missing release date compare as False and are dropped as well.
cutoff = datetime(2022, 12, 31)
date_mask: pd.Series = df['Release Date'].gt(cutoff)
df = df.loc[date_mask]
df

### Excluding movies:

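Issue (#todo): 'Queen Charlotte: A Bridgerton Story' is a TV series, not a movie, but its title contains neither 'Season' nor 'Series', so the keyword filter below does not recognise it as a series. How should cases like this one be handled? Should we use the IMDb dataset to match titles and retrieve the `titleType` from `title.basics.tsv.gz`?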

In [None]:
# Flag titles containing the literal substring 'Season' (plain match, no regex).
# na=False treats a missing title as "no match" instead of propagating NaN into
# the mask, which would make the `~` negation below fail.
season_mask = df['Title'].str.contains('Season', regex=False, na=False)
no_season_mask = ~season_mask
no_season: pd.DataFrame = df[no_season_mask]

# Same check for the literal substring 'Series'.
series_mask = df['Title'].str.contains('Series', regex=False, na=False)
no_series_mask = ~series_mask

# Both masks are indexed like `df`, so combine them and filter `df` directly.
# Indexing the smaller `no_season` frame with the full-length `no_series_mask`
# selects the same rows, but forces pandas to reindex the mask implicitly and
# emits a "Boolean Series key will be reindexed to match DataFrame index" warning.
no_series: pd.DataFrame = df[no_season_mask & no_series_mask]

# Rows whose title contains neither 'Season' nor 'Series'.
no_series

## Step 4: Group by title to find total hours viewed

In [None]:
# Sum 'Hours Viewed' over all rows that share the same 'Title'.
# NOTE(review): this aggregates `df`, not the `no_series` subset built in the
# previous cell -- confirm that is intended.
hours_by_title = df.groupby('Title')['Hours Viewed']
total_views: pd.Series = hours_by_title.sum()
total_views