# Analysis of Netflix data

Loosely based on the Dataquest tutorial:
https://www.dataquest.io/blog/python-tutorial-analyze-personal-netflix-data/

In [None]:
#pandas import
import pandas as pd

In [None]:
# Load the Netflix viewing-activity export into a pandas DataFrame.
# The path is relative, so it is resolved against the current working directory.
viewing_csv = "../../netflix-report/CONTENT_INTERACTION/ViewingActivity.csv"
df = pd.read_csv(viewing_csv)


In [None]:
# Preview the frame without the columns we don't need.
# NOTE: the result is not assigned, so `df` itself still has every column.
unused_columns = [
    'Attributes',
    'Supplemental Video Type',
    'Device Type',
    'Bookmark',
    'Latest Bookmark',
    'Country',
]
df.drop(columns=unused_columns)

In [None]:
# Keep only the rows that belong to the 'daniel' profile.
wanted_profiles = ['daniel']
mask = df['Profile Name'].isin(wanted_profiles)
# Boolean-index the frame with the mask to get daniel's viewing rows.
new_df = df.loc[mask]

In [None]:
# Build the working frame: the profile-filtered rows minus the columns
# we don't care about.
simple_df = new_df.drop(
    columns=[
        'Attributes',
        'Supplemental Video Type',
        'Device Type',
        'Bookmark',
        'Latest Bookmark',
        'Country',
    ],
)


In [None]:
#inspect the dtype pandas assigned to each remaining column
simple_df.dtypes

In [None]:
# We need to convert the data types to the correct values:
# 'Duration' becomes a timedelta and 'Start Time' a UTC-aware datetime.
simple_df['Duration'] = pd.to_timedelta(simple_df['Duration'])
simple_df['Start Time'] = pd.to_datetime(simple_df['Start Time'], utc=True)
# Convert from UTC to Pacific time. tz_convert is applied to the index here,
# so 'Start Time' is temporarily moved into the index.
simple_df = simple_df.set_index('Start Time')
simple_df.index = simple_df.index.tz_convert('US/Pacific')
# reset_index() returns a new frame instead of modifying in place; assign it
# back so 'Start Time' becomes a regular column again and the rows get a
# fresh 0..n-1 index.
simple_df = simple_df.reset_index()
# Show the converted frame as the cell output.
simple_df

In [None]:
#confirm the dtypes after the conversions above
simple_df.dtypes

In [None]:
#function to act on the data frame and give information on each show
def simple_show_info(show_name, data=None):
    """Print the total watch time for a show and return its viewing rows.

    Rows are selected by a literal (non-regex) substring match of
    ``show_name`` against the 'Title' column, so a short name also matches
    any longer title that contains it.

    Args:
        show_name: Text to look for in the 'Title' column.
        data: DataFrame with 'Title' and 'Duration' columns. Defaults to the
            notebook-level ``simple_df``.

    Returns:
        The rows of ``data`` whose title contains ``show_name``.
    """
    if data is None:
        data = simple_df
    # na=False treats missing titles as "no match"; without it the mask can
    # contain NaN and boolean indexing raises ValueError.
    matches = data['Title'].str.contains(str(show_name), regex=False, na=False)
    show = data[matches]

    print(f"Total time spent watching show {show_name}: {show['Duration'].sum()}")

    return show

In [None]:
#pull every viewing row whose title contains 'The Office (U.S.)' and preview the first 10
the_office = simple_show_info('The Office (U.S.)')
the_office.head(10)

In [None]:
# Next step: work out every distinct TV show that has been watched.

# Grab just the titles, then break each one apart on ":" so the piece
# before the first colon can be picked out afterwards.
title_df = simple_df.loc[:, 'Title']
show_df = title_df.str.split(pat=":")


In [None]:
# Collect the show name (the text before the first ":") of every viewing row.
# Series.str[0] picks the first piece of each split title without looking rows
# up by index label, so it works whatever the frame's index is; dropna() skips
# rows that have no title at all.
show_titles = show_df.str[0].dropna().tolist()

In [None]:
#collapse the list into a set so each title appears only once
show_titles = set(show_titles)

In [None]:
# Some titles carry a weird "_ABCD" suffix; keep only the part before the
# first "_" and drop the suffixed original.
#
# "Season X ABC" entries and trailers are not shows in their own right, so
# they are removed without a replacement. (The "trailer" check also removes
# names such as Trailer Park Boys, but those are small instances.)

# These keep track of the titles to add (good) and to remove (bad).
good_titles = []
bad_titles = []

for title in show_titles:
    has_suffix = "_" in title
    main_title = title.split("_")[0] if has_suffix else title
    lowered = main_title.lower()
    is_noise = "season" in lowered or "trailer" in lowered

    if has_suffix or is_noise:
        bad_titles.append(title)
    # Only keep a cleaned title when it is non-empty and is not itself a
    # "Season X"/trailer entry. An empty string would later match every row
    # in a substring search.
    if has_suffix and main_title and not is_noise:
        good_titles.append(main_title)

# Apply the changes only after the loop so the set is not mutated while it
# is being iterated. Additions go first, so anything listed as bad wins.
show_titles.update(good_titles)
show_titles.difference_update(bad_titles)

In [None]:
#inspect the set of titles after the clean-up
show_titles

In [None]:
#take a look at everything we have seen
#sort the titles so the report comes out in the same (alphabetical) order on
#every run; iterating the set directly gives an arbitrary order
for title in sorted(show_titles):
    simple_show_info(title)