### Libraries

In [172]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import calendar


### Insert Clean Export

In [81]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [36]:
df = pd.read_csv('data/clean-and-merged_movies.csv')

In [37]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  244 non-null    int64  
 1   Date        244 non-null    object 
 2   Title_net   244 non-null    object 
 3   Title_imdb  243 non-null    object 
 4   genre       243 non-null    object 
 5   rating      243 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 11.6+ KB


In [38]:
df.genre = df.genre.str.replace(' ','')

In [39]:
df[['genre1', 'genre2', 'genre3']] = df['genre'].str.split(',', expand=True)

In [53]:
df

Unnamed: 0.1,Unnamed: 0,Date,Title_net,Title_imdb,genre,rating,genre1,genre2,genre3
0,0,2023-01-29,The Godfather,,,,,,
1,3,2023-01-29,Arrested Development,Arrested Development,Comedy,8.7,Comedy,,
2,7,2023-01-28,Wednesday,Wednesday,"Comedy,Crime,Fantasy",8.2,Comedy,Crime,Fantasy
3,9,2023-01-14,The Pale Blue Eye,The Pale Blue Eye,"Crime,Horror,Mystery",6.6,Crime,Horror,Mystery
4,10,2023-01-08,1899,1899,"Drama,Mystery",7.4,Drama,Mystery,
...,...,...,...,...,...,...,...,...,...
239,809,2019-03-22,The Dirt,The Dirt,"Biography,Comedy,Drama",7.0,Biography,Comedy,Drama
240,810,2019-03-18,Triple Frontier,Triple Frontier,"Action,Thriller",6.4,Action,Thriller,
241,816,2019-04-03,Serenity,Serenity,"Drama,Mystery,Thriller",5.4,Drama,Mystery,Thriller
242,820,2019-02-27,Velvet Buzzsaw,Velvet Buzzsaw,"Horror,Mystery,Thriller",5.7,Horror,Mystery,Thriller


## Q1: Which genre(s) am I most likely to watch on Netflix?

In [213]:
# Reshape the data frame using melt
df_genre = df.melt(id_vars=['Title_imdb', 'genre'], value_vars=['genre1', 'genre2', 'genre3'], var_name='Genre-order', value_name='Genre-count')

# Assign weights to categories
weights = {'genre1': 1, 'genre2': 0.5, 'genre3': 0.3}
df_genre['Weighted-counts'] = df_genre['Genre-order'].map(weights)

# Group the melted data frame by title and category
df_genre = df_genre.groupby(['Genre-count'])['Weighted-counts'].sum().reset_index()

df_genre.sample(5)

Unnamed: 0,Genre-count,Weighted-counts
20,Thriller,10.4
16,Romance,10.3
0,Action,73.8
18,Short,0.5
21,Western,0.3


In [214]:
fig = px.line_polar(df_genre, r='Weighted-counts', theta='Genre-count', line_close=True,
                    title='Which genre(s) am I most likely to watch the most on Netflix?')


In [216]:
fig.update_layout(title_x=0.5)
fig.show()

## Q2: Which months I have Netflix & Chilled the most recently?

In [170]:
#months
df['Month'] = pd.DatetimeIndex(df['Date']).month
df

Unnamed: 0.1,Unnamed: 0,Date,Title_net,Title_imdb,genre,rating,genre1,genre2,genre3,Month
0,0,2023-01-29,The Godfather,,,,,,,1
1,3,2023-01-29,Arrested Development,Arrested Development,Comedy,8.7,Comedy,,,1
2,7,2023-01-28,Wednesday,Wednesday,"Comedy,Crime,Fantasy",8.2,Comedy,Crime,Fantasy,1
3,9,2023-01-14,The Pale Blue Eye,The Pale Blue Eye,"Crime,Horror,Mystery",6.6,Crime,Horror,Mystery,1
4,10,2023-01-08,1899,1899,"Drama,Mystery",7.4,Drama,Mystery,,1
...,...,...,...,...,...,...,...,...,...,...
239,809,2019-03-22,The Dirt,The Dirt,"Biography,Comedy,Drama",7.0,Biography,Comedy,Drama,3
240,810,2019-03-18,Triple Frontier,Triple Frontier,"Action,Thriller",6.4,Action,Thriller,,3
241,816,2019-04-03,Serenity,Serenity,"Drama,Mystery,Thriller",5.4,Drama,Mystery,Thriller,4
242,820,2019-02-27,Velvet Buzzsaw,Velvet Buzzsaw,"Horror,Mystery,Thriller",5.7,Horror,Mystery,Thriller,2


In [173]:
df['Month'] = df['Month'].apply(lambda x: calendar.month_abbr[x])

In [179]:
#transform counts in dataframe
df_seasons = df['Month'].value_counts()
df_seasons = pd.DataFrame(df_seasons)
df_seasons = df_seasons.reset_index()
df_seasons.rename(columns={'index': 'Month',
'Month': 'counts'},
inplace=True, errors='raise')

In [180]:
df_seasons

Unnamed: 0,Month,counts
0,Mar,29
1,Dec,26
2,Apr,25
3,Jan,23
4,Nov,23
5,Sep,23
6,Jul,23
7,Jun,17
8,May,17
9,Aug,15


In [182]:
#sort by month name
month_dict = {'Jan':1,'Feb':2,'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
df_seasons = df_seasons.sort_values('Month', key = lambda x : x.apply (lambda x : month_dict[x]))
df_seasons.reset_index(drop=True, inplace=True)

In [183]:
df_seasons

Unnamed: 0,Month,counts
0,Jan,23
1,Feb,10
2,Mar,29
3,Apr,25
4,May,17
5,Jun,17
6,Jul,23
7,Aug,15
8,Sep,23
9,Oct,13


In [184]:
seasons = ['Winter','Winter','Spring', 'Spring', 'Spring', 'Summer', 'Summer', 'Summer', 'Autumn', 'Autumn', 'Autumn', 'Winter']

In [187]:
df_seasons = df_seasons.assign(Season=seasons)
df_seasons

Unnamed: 0,Month,counts,Season
0,Jan,23,Winter
1,Feb,10,Winter
2,Mar,29,Spring
3,Apr,25,Spring
4,May,17,Spring
5,Jun,17,Summer
6,Jul,23,Summer
7,Aug,15,Summer
8,Sep,23,Autumn
9,Oct,13,Autumn


In [212]:
fig = px.area(df_seasons, x='Month', y="counts", pattern_shape="Season", color="Season", line_shape='spline', pattern_shape_sequence=[".", "|", "x", "/"])
fig.show()

## Q3: How do the seasons affect my choices?