### Imports and Data Loading

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import matplotlib.image as mpimg
from PIL import Image, ImageDraw, ImageFont
import textwrap

In [None]:
# Mount Google Drive at /content/drive; the data files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the viewing-activity export from the mounted Drive folder and display it.
viewing_activity_csv = '/content/drive/MyDrive/my-netflix-data/ViewingActivity.csv'
df = pd.read_csv(viewing_activity_csv)
df

### Rename and drop columns

In [None]:
# Assign short names to the ten columns of the export, then drop four of them.
df.columns = [
    'name', 'start', 'duration', 'attrs', 'title',
    'svt', 'device', 'bookmark', 'latestbookmark', 'country',
]
df = df.drop(columns=['attrs', 'svt', 'bookmark', 'latestbookmark'])
df

### Keep only shows watched by me for at least 5 minutes

In [None]:
# Convert a duration string in HH:MM:SS format to a total number of seconds
def time_string_to_secs(time_string):
    """Return the whole number of seconds represented by an ``HH:MM:SS`` string."""
    parts = time_string.split(':')
    hours, minutes, seconds = int(parts[0]), int(parts[1]), int(parts[2])
    return hours * 3600 + minutes * 60 + seconds

def time_string_to_mins(time_string):
    """Return the whole minutes represented by an ``HH:MM:SS`` string.

    The seconds field is ignored, so the result is truncated to the minute.
    """
    parts = time_string.split(':')
    # One hour is 60 minutes (the previous factor of 3600 converted hours to seconds).
    return int(parts[0]) * 60 + int(parts[1])

def time_string_to_hours(time_string):
    """Return the duration of an ``HH:MM[:SS]`` string as fractional hours.

    The seconds field, when present, is included in the result; a string
    with only hours and minutes is still accepted.
    """
    parts = time_string.split(':')
    hours = float(parts[0]) + float(parts[1]) / 60
    if len(parts) > 2:
        hours += float(parts[2]) / 3600
    return hours


# Add numeric duration columns derived from the HH:MM:SS strings in `duration`.
df['duration_mins'] = df['duration'].map(time_string_to_mins)
df['duration_secs'] = df['duration'].map(time_string_to_secs)

In [None]:
# List the profiles found in the export so the placeholder below can be replaced.
print(df.name.unique())
profile_name = 'TU NOMBRE AQUI'
if profile_name == 'TU NOMBRE AQUI':
    # Stop here until the placeholder has been edited to a real profile name.
    raise Exception("CAMBIA profile_name POR EL NOMBRE DE TU PERFIL")
print(f'Perfil seleccionado: {profile_name}')
# Keep only this profile's rows with at least 5 minutes of viewing.
# .copy() makes df an independent frame, so the columns assigned to it later
# are not written to a slice of the original (SettingWithCopyWarning).
df = df.loc[(df.name == profile_name) & (df.duration_mins >= 5)].copy()
df

## General Stats: Total watch time, Account time and Watch time per week

In [None]:
# Format a number of seconds as an H:MM:SS string (hours are not capped at 24)
def convert_to_hours_format(seconds):
    """Return *seconds* rendered as ``H:MM:SS``."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [None]:
# Total watch time as ['H', 'MM', 'SS'] string components.
total_time = convert_to_hours_format(df.duration_secs.sum()).split(':')
print(f'Tiempo total viendo contenido: {total_time[0]} hours, {total_time[1]} minutes, {total_time[2]} seconds')

# Start timestamps are still strings here; sorting them puts the earliest first.
dates = df.start.sort_values()
date_str_format = '%Y-%m-%d %H:%M:%S'
earliest_start = dates.iloc[0]
latest_start = dates.iloc[-1]
d1 = datetime.datetime.strptime(earliest_start, date_str_format)
d2 = datetime.datetime.strptime(latest_start, date_str_format)

# Break the covered period into years / months / days using 365-day years and 30-day months.
account_time = d2 - d1
account_years, days_after_years = divmod(account_time.days, 365)
account_months, account_days = divmod(days_after_years, 30)

# Only the whole-hours component of the total is used for the weekly average.
watch_hours_per_week = int(total_time[0]) / (account_time.days / 7)
print(f'{watch_hours_per_week:.2f} horas por semana en promedio')
print(f'Durante {account_years} años, {account_months} meses, {account_days} días')
account_start = earliest_start.split(" ")[0]
account_stats_end = latest_start.split(" ")[0]
print(f'{account_time.days} días totales (Desde {account_start} hasta {account_stats_end})')

# The row label of the earliest start identifies the first title watched.
first_show_index = dates.index[0]
first_show = df.loc[first_show_index, 'title']
first_show_date = df.loc[first_show_index, 'start'].split(' ')[0]
print(f'Primer contenido visto: {first_show} ({first_show_date})')

### Top played all content

In [None]:
# Attach to every row how many times its exact title appears, plus a constant
# `count` column of ones.
title_counts = df.title.value_counts()
df.loc[:, 'plays'] = df.title.map(title_counts)
df.loc[:, 'count'] = 1
# One row per title (first occurrence), most played first.
first_occurrence = ~df.title.duplicated()
sorted_by_plays = df.loc[first_occurrence].sort_values('plays', ascending=False)
sorted_by_plays

### Top played movies (non-series)

In [None]:
# Regular-expression fragments that mark a title as an episode of a series
# (English markers first, then their Spanish counterparts). Raw strings are
# used so that "\(" reaches the regex engine as an escaped literal parenthesis
# without relying on an invalid string escape sequence.
non_series_markers = [
    r": Season",
    r": Book",
    r" \(Episode ",
    r" : Episode ",
    r" : Part ",
    r" \(Chapter ",
    r" : Chapter ",
    r": Temporada",
    r": Libro",
    r" \(Capítulo ",
    r" : Capítulo ",
    r" : Parte ",
    r" : Episodio ",
    r" \(Episodio ",
]

# A single alternation replaces one filtering pass per marker. na=True treats
# missing titles as matches so that they are excluded from the result.
is_episode = df.title.str.contains("|".join(non_series_markers), na=True)
# .copy() gives an independent frame that can be assigned to afterwards.
nonSeriesDf = df.loc[~is_episode].copy()

nonSeriesDf

In [None]:
# Play count per exact title, computed over the full frame and aligned by row label.
plays_per_title = df.title.value_counts()
nonSeriesDf.loc[:, 'plays'] = df.title.map(plays_per_title)
# Keep the first row of each title and rank by play count.
first_of_each_title = ~nonSeriesDf.title.duplicated()
nonSeriesDf = nonSeriesDf.loc[first_of_each_title]
nonSeriesDf = nonSeriesDf.sort_values('plays', ascending=False)
nonSeriesDf

## Analyze TV Series (Content with multiple episodes)

#### Set each episode name to just the series name 

In [None]:
ts = df.title.str
# Regular-expression fragments that mark a title as an episode of a series
# (English and Spanish). Raw strings keep "\(" a valid escaped parenthesis.
series_markers = [
    r': Season',
    r': Book',
    r': Part',
    r' \(Episode',
    r': Episode',
    r' \(Chapter ',
    r': Chapter',
    r': Temporada ',
    r': Libro',
    r' \(Capítulo',
    r': Capítulo',
    r': Parte',
    r': Episodio ',
    r' \(Episodio ',
]
# One alternation instead of OR-ing fourteen masks; na=False leaves rows with a
# missing title out. .copy() yields an independent frame for later assignments.
seriesDf = df.loc[ts.contains('|'.join(series_markers), na=False)].copy()
seriesDf

In [None]:
# Get series name by removing everything starting with the "Season", "Book" or "Episode" suffix
def get_clean_series_name(title):
    """Return *title* cut off just before its season/part/episode marker.

    Markers are tried in the order listed below and the first one found in
    the title decides where it is cut. A title containing none of the
    markers is returned unchanged.
    """
    suffixes = [
        " (Season", ": Season", ": Part", ": Book", " (Chapter ", ': Chapter', ' (Episode', ': Episode',
        ' (Temporada', ': Temporada', ': Parte', ': Libro', " (Episodio", ': Episodio', ' (Capítulo ', ": Capítulo"
    ]

    for s in suffixes:
        suffix_index = title.find(s)
        if suffix_index != -1:
            return title[:suffix_index]

    # No marker present: keep the title as is instead of implicitly returning None.
    return title


# Take the first row of every distinct episode title *before* titles are
# collapsed to series names, so each episode is counted once per series.
unique_episode_rows = ~seriesDf.title.duplicated()
topWatchedByEpisodes = pd.DataFrame(seriesDf.loc[unique_episode_rows])
topWatchedByEpisodes['title'] = topWatchedByEpisodes['title'].map(get_clean_series_name)
# The full series frame keeps every play; only its titles are collapsed.
seriesDf['title'] = seriesDf['title'].map(get_clean_series_name)
topWatchedByEpisodes

In [None]:
# Count plays per series title, then keep one row per series ranked by plays.
topWatchedSeriesByPlays = pd.DataFrame(seriesDf)
series_play_counts = topWatchedSeriesByPlays.title.value_counts()
topWatchedSeriesByPlays.loc[:, 'plays'] = topWatchedSeriesByPlays.title.map(series_play_counts)
one_row_per_series = ~topWatchedSeriesByPlays.title.duplicated()
topWatchedSeriesByPlays = topWatchedSeriesByPlays.loc[one_row_per_series]
topWatchedSeriesByPlays = topWatchedSeriesByPlays.sort_values('plays', ascending=False)
topWatchedSeriesByPlays

### Top series by watched episodes

In [None]:
# Each remaining row is one distinct episode, so the per-title row count is the
# number of different episodes watched for that series.
episodes_per_series = topWatchedByEpisodes.title.value_counts()
topWatchedByEpisodes.loc[:, 'plays'] = topWatchedByEpisodes.title.map(episodes_per_series)
first_row_of_series = ~topWatchedByEpisodes.title.duplicated()
topWatchedByEpisodes = topWatchedByEpisodes.loc[first_row_of_series]
topWatchedByEpisodes = topWatchedByEpisodes.sort_values('plays', ascending=False)
topWatchedByEpisodes

### Top series by watched time

In [None]:
# Total seconds watched per series title
topWatchedSeries = seriesDf.groupby('title')['duration_secs'].sum()
topWatchedSeries

In [None]:
# Turn the per-series totals into a frame ranked by watch time, with the
# duration also expressed in minutes and hours.
series_totals = {
    'title': topWatchedSeries.index,
    'duration_secs': topWatchedSeries.values,
}
topWatchedSeriesDf = pd.DataFrame(series_totals)
topWatchedSeriesDf = topWatchedSeriesDf.sort_values('duration_secs', ascending=False)
topWatchedSeriesDf['duration_mins'] = topWatchedSeriesDf['duration_secs'] / 60
topWatchedSeriesDf['duration_hours'] = topWatchedSeriesDf['duration_mins'] / 60
topWatchedSeriesDf

In [None]:
def graph_top_shows(shows, qty, title="", ylabel="", width=15, height=6):
    """Show a bar chart with one bar per entry of *shows*, of height *qty*.

    *title* and *ylabel* label the chart; *width* and *height* are passed to
    matplotlib as the figure size.
    """
    bar_colors = ['#C0392B', '#D35400', '#E67E22', '#F39C12', '#F1C40F']

    sns.set_style('darkgrid')
    plt.figure(figsize=(width, height))
    sns.set_context('notebook', font_scale=1, rc={"grid.linewidth": 2})

    plt.bar(shows, qty, color=bar_colors)
    plt.title(title)
    plt.ylabel(ylabel)
    # Label every bar with its show name, tilted slightly.
    plt.xticks(shows, rotation=3)

    plt.show()
    plt.clf()


In [None]:
# Bar chart of the first five series ranked by distinct episodes watched.
graph_top_shows(
    shows=topWatchedByEpisodes.head().title,
    qty=topWatchedByEpisodes.head().plays,
    title='Top Series por Episodios',
    ylabel='Episodios',
)

In [None]:
# Bar chart of the first five series ranked by hours watched.
graph_top_shows(
    shows=topWatchedSeriesDf.head().title,
    qty=topWatchedSeriesDf.head().duration_hours,
    title='Top Series por Horas',
    ylabel='Horas',
)

In [None]:
# Bar chart of the first five non-series titles ranked by number of plays.
graph_top_shows(
    shows=nonSeriesDf.head().title,
    qty=nonSeriesDf.head().plays,
    title='Top Películas',
    ylabel='Veces Vista',
)

## Summary Image

## Watch events by weekday and hour of day

In [None]:
# Parse the start timestamps as UTC.
df['start'] = pd.to_datetime(df['start'], utc=True)

# Convert to Mexico City time through the index, then restore `start` as a
# regular column (this also replaces the row labels with a fresh 0..n-1 range).
df = df.set_index('start')
df.index = df.index.tz_convert('America/Mexico_City')
df = df.reset_index()

# Weekday as an ordered categorical over 0..6 (pandas numbers Monday as 0),
# and the local hour of day.
df['day'] = pd.Categorical(df['start'].dt.weekday, categories=list(range(7)), ordered=True)
df['hour'] = df['start'].dt.hour
df.head()

In [None]:
# Number of watch events per weekday, ordered by weekday rather than by frequency
day_counts = df['day'].value_counts()
df_per_day = day_counts.sort_index()

print(df_per_day)

In [None]:
# Make the hour an ordered categorical over 0..23
df['hour'] = pd.Categorical(df['hour'], categories=list(range(24)), ordered=True)

# Number of watch events per hour, ordered by hour rather than by frequency
hour_counts = df['hour'].value_counts()
df_per_hour = hour_counts.sort_index()

print(df_per_hour)

In [None]:
# Display the current frame, which now includes the `day` and `hour` columns.
df

### Most common watch time

In [None]:
# Sum the per-row `count` markers for every (weekday, hour) pair and pivot the
# hours into columns: rows are weekdays, columns are hours.
starts_per_slot = df.groupby(['day', 'hour'])['count'].sum()
out = starts_per_slot.unstack()
# Grand total over the whole matrix.
out.sum().sum()

In [None]:
# Heatmap of watch events per weekday (rows) and hour of day (columns).
weekday_initials = ['L', 'M', 'M', 'J', 'V', 'S', 'D']

plt.figure(figsize=(16, 8))
plt.title('Shows iniciados por día de la semana y hora', fontsize=20)
ax = sns.heatmap(
    out,
    linewidths=1,
    square=True,
    yticklabels=weekday_initials,
    cmap='flare',
    annot=True,
    fmt='g',
)
plt.xlabel('Hora', fontsize=15)
plt.ylabel('Día de la semana', fontsize=15)
# Flip the vertical axis so the first weekday is drawn at the bottom.
ax.invert_yaxis()

## Summary Image

In [None]:
# Render the summary card: the statistics computed earlier in the notebook are
# drawn in white text onto the template image at fixed pixel positions.
im = Image.open('/content/drive/MyDrive/my-netflix-data/img/template.jpeg')

sns.set_context('notebook', font_scale=1, rc={"grid.linewidth": 0})
plt.axes([0.0, 0.0, 1, 2])
plt.axis('off')

draw = ImageDraw.Draw(im)

font_type = {
    'medium': '/content/drive/MyDrive/my-netflix-data/fonts/Montserrat-Medium.ttf',
    'bold': '/content/drive/MyDrive/my-netflix-data/fonts/Montserrat-Bold.ttf',
}

# Vertical distance between consecutive lines of a wrapped title.
TITLE_LINE_HEIGHT = 50


def _load_font(weight, size):
    """Load the font file registered under *weight* in `font_type` at *size*."""
    return ImageFont.truetype(font_type[weight], size)


def _draw_text(x, y, text, font):
    """Draw *text* in white with its anchor at (*x*, *y*)."""
    draw.text((x, y), text, font=font, fill='white')


def _draw_wrapped_title(text, x, y, wrap_width, max_lines, font):
    """Draw *text* wrapped to *wrap_width* characters, at most *max_lines* lines.

    Returns the extra vertical offset (0 or TITLE_LINE_HEIGHT) that anything
    placed below the title needs when the title wrapped onto a second line.
    A title that wraps to no lines at all draws nothing.
    """
    wrapped = textwrap.wrap(text, width=wrap_width)
    for row, line in enumerate(wrapped[:max_lines]):
        _draw_text(x, y + row * TITLE_LINE_HEIGHT, line, font)
    return TITLE_LINE_HEIGHT if len(wrapped) > 1 else 0


# Total hours and hours per week
headline_font = _load_font('bold', 65)
_draw_text(120, 530, f'{int(total_time[0]):,}', headline_font)
_draw_text(680, 630, f'{watch_hours_per_week:.1f}', headline_font)

# Account age
account_age_x = 345
_draw_text(account_age_x, 1360, f'{account_years} años {account_months} meses', _load_font('bold', 55))
_draw_text(account_age_x, 1430, f'Desde {account_start}', _load_font('medium', 35))

# All three titles below share the same face and size.
title_font = _load_font('bold', 45)

# Top movie: title on up to two lines, play count right below it
top_movie = nonSeriesDf.head(1)
top_movie_title = top_movie.title.values[0]
top_movie_x = 425
movie_shift = _draw_wrapped_title(top_movie_title, top_movie_x, 330, 22, 2, title_font)
_draw_text(
    top_movie_x,
    385 + movie_shift,
    f'{top_movie.plays.values[0]} reproducciones',
    _load_font('medium', 32),
)

# Top series (by watch time): title on up to two lines, then three detail lines
top_series = topWatchedSeriesDf.head(1)
top_series_title = top_series.title.values[0]
top_series_by_plays = topWatchedSeriesByPlays.loc[topWatchedSeriesByPlays.title == top_series_title]
top_series_by_episodes = topWatchedByEpisodes.loc[topWatchedByEpisodes.title == top_series_title]
top_series_x = 55
series_shift = _draw_wrapped_title(top_series_title, top_series_x, 920, 18, 2, title_font)
detail_font = _load_font('medium', 35)
series_details = [
    f'{top_series.duration_hours.values[0]:.0f} Horas',
    f'{top_series_by_plays.plays.values[0]} reproducciones',
    f'{top_series_by_episodes.plays.values[0]} episodios',
]
for row, detail in enumerate(series_details):
    # Detail lines sit 45 px apart, starting at y=980 for a one-line title.
    _draw_text(top_series_x, 980 + series_shift + 45 * row, detail, detail_font)

# First content: title on up to three lines
first_show_x = 435
_draw_wrapped_title(first_show, first_show_x, 1110, 21, 3, title_font)

imgplot = plt.imshow(im, interpolation='nearest', aspect='auto')