# Exploratory Data Analysis
In this notebook, we will do some general exploration of the respective datasets to get a feel for what would be the best way to build the predictive model. Woof woof!

## Project Setup

In [1]:
# Importing the necessary libraries
import pandas as pd

In [2]:
# Loading the raw dataset
df_raw = pd.read_csv('../data/raw.csv')

## Raw Dataset Exploration

In [3]:
# Viewing the first few rows of the dataset
df_raw.head()

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number,Notes
0,Zoolander 2,Movie,7.0,Yes,10,The very first flickin!
1,Dope,Movie,8.5,Yes,11,
2,The Big Short,Movie,8.0,Yes,12,Gary had to read Caelan's notes since Caelan h...
3,Deadpool,Movie,10.0,Yes,13,
4,Vinyl,TV Show,7.5,Yes,15,


In [4]:
# Dropping the "Notes" column as it is just used for fun and will not be applicable to the model
df_raw.drop(columns = ['Notes'], inplace = True)

In [5]:
# Keeping only reviews from the main show
df_raw = df_raw[df_raw['Episode Number'].str.startswith('M') == False]

In [6]:
# Changing datatype of "Episode Number" from object to int
df_raw['Episode Number'] = df_raw['Episode Number'].astype(int)

## The Two Banger

In [17]:
# Getting all the reviews between the first and second banger
two_banger = df_raw[df_raw['Episode Number'] > 69]

In [18]:
# Viewing general info about the two banger
two_banger.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89 entries, 107 to 195
Data columns (total 5 columns):
Name              89 non-null object
Category          89 non-null object
Rating            88 non-null float64
Flickable         89 non-null object
Episode Number    89 non-null int32
dtypes: float64(1), int32(1), object(3)
memory usage: 2.8+ KB


In [19]:
# Viewing the dispersion of category types
two_banger['Category'].value_counts()

Movie      50
Food       14
Other      13
TV Show     8
Person      4
Name: Category, dtype: int64

In [20]:
# Viewing all people rated in this timeframe
two_banger[two_banger['Category'] == 'Person']

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
108,Neil DeGrasse Tyson,Person,9.5,Yes,72
109,Bill Nye,Person,7.0,Yes,72
152,Michael Biehn,Person,10.0,Yes,100
163,Caelan's little bro,Person,10.0,Yes,114


In [16]:
# Viewing perfect 10s
two_banger[two_banger['Rating'] == 10]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
115,Pizza,Food,10.0,Yes,75
118,Summer vacation,Other,10.0,Yes,75
119,Sending caelan free Boston baked beans,Other,10.0,Yes,81
132,Rick and Marty (Season 1),TV Show,10.0,Yes,86
150,Bartenders on the Carolla cruise,Other,10.0,Yes,100
152,Michael Biehn,Person,10.0,Yes,100
163,Caelan's little bro,Person,10.0,Yes,114


In [21]:
# Viewing perfect 0s
two_banger[two_banger['Rating'] == 0]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
111,Sonic's,Food,0.0,No,73
144,Whatchamacallit,Food,0.0,No,96


In [25]:
# Viewing low scores
two_banger[two_banger['Rating'] <= 3]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
111,Sonic's,Food,0.0,No,73
130,Life,Movie,3.0,No,86
134,Death Note,Movie,0.5,No,87
141,Reese's Pieces,Food,3.0,No,96
142,Reese's Peanut Butter Cup,Food,3.0,Yes,96
143,Twix,Food,2.0,No,96
144,Whatchamacallit,Food,0.0,No,96
153,Mixed berry e-cigarette,Other,3.0,No,100
160,The Cloverfield Paradox,Movie,2.0,No,109
173,Kidney bean,Food,2.0,No,126


## The Three Banger

In [17]:
# Getting all the reviews between the second and third bangers
three_banger = df_raw[(df_raw['Episode Number'] >= (69*2)) & (df_raw['Episode Number'] < (69*3))]

In [18]:
three_banger

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
194,The Shape of Water,Movie,7.5,Yes,138
195,Hold the Dark,Movie,5.0,Yes,139
196,A Star is Born,Movie,9.0,Yes,140
197,Jack Ryan (Season 1),TV Show,8.0,Yes,142
198,You Were Never Really Here,Movie,5.0,No,143
199,Outlaw King,Movie,7.5,Yes,146
200,Candy cane,Food,6.0,Yes,147
201,Bodyguard,TV Show,7.5,Yes,147
202,The Christmas Chronicles,Movie,8.0,Yes,149
203,Bohemian Rhapsody,Movie,8.0,Yes,150


In [19]:
# Viewing general info about the three banger
three_banger.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47 entries, 194 to 240
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            47 non-null     object 
 1   Category        47 non-null     object 
 2   Rating          47 non-null     float64
 3   Flickable       47 non-null     object 
 4   Episode Number  47 non-null     int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 2.2+ KB


In [20]:
# Viewing the dispersion of category types
three_banger['Category'].value_counts()

Movie      26
Other      11
TV Show     5
Person      2
Food        2
Movie       1
Name: Category, dtype: int64

In [21]:
# Viewing all people rated in this timeframe
three_banger[three_banger['Category'] == 'Person']

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
211,Donovan,Person,9.0,Yes,160
224,Isla (Caelan's first daughter),Person,9.0,Yes,177


In [22]:
# Viewing perfect 10s
three_banger[three_banger['Rating'] == 10]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
209,The experience of watching Alita battle Angel ...,Other,10.0,Yes,157


In [23]:
# Viewing perfect 0s
three_banger[three_banger['Rating'] == 0]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
226,Having pneumonia,Other,0.0,No,182


In [26]:
# Viewing low scores
three_banger[three_banger['Rating'] <= 3]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
216,The Highwaymen,Movie,3.0,No,165
217,The Silence,Movie,0.9,No,166
226,Having pneumonia,Other,0.0,No,182


In [30]:
# Viewing high scores
three_banger[three_banger['Rating'] >= 9.5]

Unnamed: 0,Name,Category,Rating,Flickable,Episode Number
204,Bandersnatch,Movie,9.6,Yes,151
209,The experience of watching Alita battle Angel ...,Other,10.0,Yes,157
212,Laxi (band),Other,9.6,Yes,160
218,Avengers: Endgame,Movie,9.7,Yes,169
230,Joker,Movie,9.6,Yes,192
