In [134]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [135]:
# Read the two raw CSV inputs from the shared data directory:
# the Netflix stock price history and the daily box-office figures.
netflix_stock_data = pd.read_csv(filepath_or_buffer='../data/netflix_stock_data.csv')
movies_data = pd.read_csv(filepath_or_buffer='../data/movies_data.csv')

In [136]:
# Inspect column names, dtypes and non-null counts of the raw stock data.
netflix_stock_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2661 entries, 0 to 2660
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2661 non-null   int64  
 1   Date        2661 non-null   object 
 2   Adj Close   2661 non-null   float64
 3   Close       2661 non-null   float64
 4   High        2661 non-null   float64
 5   Low         2661 non-null   float64
 6   Open        2661 non-null   float64
 7   Volume      2661 non-null   int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 166.4+ KB


In [137]:
# Preview the first five rows (head() defaults to n=5).
netflix_stock_data.head()

Unnamed: 0.1,Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,0,2014-01-02 00:00:00+00:00,51.831429,51.831429,52.511429,51.542858,52.401428,12325600
1,1,2014-01-03 00:00:00+00:00,51.871429,51.871429,52.495712,51.842857,52.0,10817100
2,2,2014-01-06 00:00:00+00:00,51.367142,51.367142,52.044285,50.475716,51.889999,15501500
3,3,2014-01-07 00:00:00+00:00,48.5,48.5,49.69857,48.152859,49.684284,36167600
4,4,2014-01-08 00:00:00+00:00,48.712856,48.712856,49.425713,48.074287,48.104286,20001100


In [138]:
# Drop the leftover 'Unnamed: 0' column that read_csv produced from the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone would otherwise raise a KeyError.
netflix_stock_data = netflix_stock_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [139]:
# Convert the string 'Date' column into pandas datetimes.
# NOTE(review): the sample rows carry a '+00:00' offset, so the parsed column is
# presumably timezone-aware (UTC) — confirm before joining with tz-naive dates.
netflix_stock_data['Date'] = netflix_stock_data['Date'].pipe(pd.to_datetime)

In [140]:
# Count missing values per column (isna is the canonical name for isnull).
netflix_stock_data.isna().sum()

Date         0
Adj Close    0
Close        0
High         0
Low          0
Open         0
Volume       0
dtype: int64

In [141]:
# Switch the stock columns to snake_case names; the result is rebound to the
# same variable rather than mutating the frame in place.
netflix_stock_data = netflix_stock_data.rename(
    columns={
        'Adj Close': 'adjusted_close',
        'Close': 'close_price',
        'High': 'high_price',
        'Low': 'low_price',
        'Open': 'open_price',
        'Volume': 'volume',
    }
)

In [142]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
netflix_stock_data.describe()

Unnamed: 0,adjusted_close,close_price,high_price,low_price,open_price,volume
count,2661.0,2661.0,2661.0,2661.0,2661.0,2661.0
mean,292.409917,292.409917,296.725978,287.886233,292.379733,10135790.0
std,178.22311,178.22311,180.54163,175.920308,178.319066,9067623.0
min,44.887142,44.887142,45.842857,42.785713,44.605713,1144000.0
25%,116.239998,116.239998,117.879997,113.699997,116.209999,4717100.0
50%,298.839996,298.839996,305.209991,293.279999,299.5,7433900.0
75%,424.450012,424.450012,428.850006,415.980011,424.200012,12595800.0
max,691.690002,691.690002,700.98999,686.090027,692.349976,133387500.0


In [143]:
# Preview the first five rows of the raw box-office data (head() defaults to n=5).
movies_data.head()

Unnamed: 0,Year,Date,Day,Day #,Top 10 Gross,%± YD,%± LW,Releases,#1 Release,Gross
0,2014,Jan 1\nNew Year's Day,Wednesday,1,"$48,419,707",+35%,-27.9%,49,Frozen,"$8,718,939"
1,2014,Jan 2,Thursday,2,"$25,361,378",-47.6%,-56.6%,49,Frozen,"$5,304,617"
2,2014,Jan 3,Friday,3,"$42,939,384",+69.3%,-26.8%,45,Paranormal Activity: The Marked Ones,"$8,722,144"
3,2014,Jan 4,Saturday,4,"$49,402,611",+15.1%,-19.7%,45,Frozen,"$8,037,475"
4,2014,Jan 5,Sunday,5,"$26,723,321",-45.9%,-44.1%,45,Frozen,"$4,785,996"


In [144]:
# Find rows whose 'Date' text carries more than a plain date: an embedded
# newline, or the words 'Day' / 'New' (holiday and event annotations).
special_entries = movies_data.loc[
    movies_data['Date'].str.contains(r'\n|Day|New', na=False)
]
print(special_entries.loc[:, ['Date']].head(20))

                                        Date
0                      Jan 1\nNew Year's Day
19                           Jan 20\nMLK Day
37                     Feb 7\nSochi Olympics
38                     Feb 8\nSochi Olympics
39                     Feb 9\nSochi Olympics
40                    Feb 10\nSochi Olympics
41                    Feb 11\nSochi Olympics
42                    Feb 12\nSochi Olympics
43                    Feb 13\nSochi Olympics
44                    Feb 14\nSochi Olympics
45                    Feb 15\nSochi Olympics
46                    Feb 16\nSochi Olympics
47   Feb 17\nPresidents' Day\nSochi Olympics
48                    Feb 18\nSochi Olympics
49                    Feb 19\nSochi Olympics
50                    Feb 20\nSochi Olympics
51                    Feb 21\nSochi Olympics
52                    Feb 22\nSochi Olympics
53                    Feb 23\nSochi Olympics
109                    Apr 20\nEaster Sunday


In [145]:
# Split 'Date' at the first newline: the leading part (e.g. "Jan 1") stays in
# 'Date', everything after it (holiday / event labels) goes to a new 'Event' column.
movies_data[['Date', 'Event']] = movies_data['Date'].str.split('\n', n=1, expand=True)
# Append the row's year to the month/day text and parse it into a real datetime.
movies_data['Date'] = pd.to_datetime(
    movies_data['Date'].str.cat(movies_data['Year'].astype(str), sep=' '),
    errors='raise',
)

In [146]:
# Re-check dtypes and non-null counts after the Date/Event split.
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Year          3951 non-null   int64         
 1   Date          3951 non-null   datetime64[ns]
 2   Day           3951 non-null   object        
 3   Day #         3951 non-null   int64         
 4   Top 10 Gross  3951 non-null   object        
 5   %± YD         3951 non-null   object        
 6   %± LW         3951 non-null   object        
 7   Releases      3951 non-null   int64         
 8   #1 Release    3951 non-null   object        
 9   Gross         3951 non-null   object        
 10  Event         708 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(7)
memory usage: 339.7+ KB


In [148]:
# Convert the currency-formatted text columns (e.g. "$8,718,939") to integers.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
for gross_column in ('Gross', 'Top 10 Gross'):
    movies_data[gross_column] = (
        movies_data[gross_column]
        .replace(r'[\$,]', '', regex=True)  # strip "$" and thousands separators
        .astype(int)
    )

# Make the unit explicit in the column names; reassign instead of inplace=True.
movies_data = movies_data.rename(
    columns={'Gross': 'Gross (Dollars)', 'Top 10 Gross': 'Top 10 Gross (Dollars)'}
)

In [149]:
# Preview the first ten rows of the cleaned box-office data.
movies_data.iloc[:10]

Unnamed: 0,Year,Date,Day,Day #,Top 10 Gross (Dollars),%± YD,%± LW,Releases,#1 Release,Gross (Dollars),Event
0,2014,2014-01-01,Wednesday,1,48419707,+35%,-27.9%,49,Frozen,8718939,New Year's Day
1,2014,2014-01-02,Thursday,2,25361378,-47.6%,-56.6%,49,Frozen,5304617,
2,2014,2014-01-03,Friday,3,42939384,+69.3%,-26.8%,45,Paranormal Activity: The Marked Ones,8722144,
3,2014,2014-01-04,Saturday,4,49402611,+15.1%,-19.7%,45,Frozen,8037475,
4,2014,2014-01-05,Sunday,5,26723321,-45.9%,-44.1%,45,Frozen,4785996,
5,2014,2014-01-06,Monday,6,9984206,-62.6%,-74.1%,45,Frozen,1728610,
6,2014,2014-01-07,Tuesday,7,12471587,+24.9%,-65.2%,45,The Wolf of Wall Street,1955396,
7,2014,2014-01-08,Wednesday,8,9494589,-23.9%,-80.4%,45,The Wolf of Wall Street,1643640,
8,2014,2014-01-09,Thursday,9,8724720,-8.1%,-65.6%,45,The Wolf of Wall Street,1469075,
9,2014,2014-01-10,Friday,10,36124969,+314.1%,-15.9%,43,Lone Survivor,14403750,
