## EDA notebook

In [1]:
import pandas as pd
import sqlite3

import data_preparation as dp

# Flat-file sources: the movie budget table and the TMDB movie listing.
df_p, df_g = (
    pd.read_csv(path)
    for path in ('../data/tn.movie_budgets.csv.gz', '../data/tmdb.movies.csv.gz')
)

# Full movie_basics table from the SQLite database. `con` is left open on
# purpose: dp.runtime(con) reuses it in the Runtime data section.
con = sqlite3.connect("../data/im.db")
df_r = pd.read_sql(sql="""SELECT * FROM movie_basics;""", con=con)

In [2]:
# Year cutoff passed to dp.profit() in the Profit Data section.
# NOTE(review): presumably the earliest release year to keep — confirm in data_preparation.
set_year = 2000

In [3]:
# Schema check of the raw budget table: the budget and gross columns arrive as
# object dtype, so they cannot be used in arithmetic until converted.
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [4]:
# Schema check of the raw TMDB table; genre_ids and release_date are object columns.
df_g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


In [5]:
# Schema check of movie_basics; original_title, runtime_minutes and genres
# all have fewer non-null values than rows, i.e. they contain missing data.
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


## Profit Data

The budget and worldwide_gross amounts are converted to integers. \
Profit = (worldwide_gross - budget) / 1000000, so profit is expressed in millions.

In [6]:
# Row count of the raw budget table before any filtering.
len(df_p['movie'])

5782

In [7]:
# Parse the textual release_date column into timestamps.
df_p['date'] = pd.to_datetime(df_p['release_date'])

# `datetime_is_numeric` is a pandas 1.x keyword. pandas 2.0 removed it
# (datetimes are always summarised numerically there) and raises TypeError
# when it is passed. Try the old spelling first and fall back to the plain
# call, so the cell yields the same count/mean/min/quantiles/max summary on
# either major version.
try:
    date_summary = df_p['date'].describe(datetime_is_numeric=True)
except TypeError:
    date_summary = df_p['date'].describe()
date_summary

count                             5782
mean     2004-07-06 05:20:31.546177792
min                1915-02-08 00:00:00
25%                2000-04-22 18:00:00
50%                2007-03-02 00:00:00
75%                2012-12-25 00:00:00
max                2020-12-31 00:00:00
Name: date, dtype: object

In [8]:
# Replace the raw budget table with the prepared one from data_preparation.profit().
# NOTE(review): df_p is overwritten here and the result no longer has a
# release_date column, so the previous cell cannot be re-run without reloading the CSV.
df_p = dp.profit(df_p, set_year)

In [9]:
# Confirm the prepared frame: numeric budget / w_gross / profit columns are present.
# The index is neither reset nor sorted (the printed label range is not monotonic).
df_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4045 entries, 0 to 2
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   movie              4045 non-null   object        
 1   production_budget  4045 non-null   object        
 2   worldwide_gross    4045 non-null   object        
 3   date               4045 non-null   datetime64[ns]
 4   year               4045 non-null   int64         
 5   budget             4045 non-null   float64       
 6   w_gross            4045 non-null   float64       
 7   profit             4045 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(3)
memory usage: 284.4+ KB


In [10]:
# Distribution of the derived profit column.
df_p['profit'].describe()

count    4045.000000
mean       68.241753
std       158.370638
min      -200.240000
25%        -1.970000
50%        12.130000
75%        70.940000
max      2351.350000
Name: profit, dtype: float64

In [11]:
# Distribution of the derived budget column.
df_p['budget'].describe()

count    4045.000000
mean       36.885946
std        46.410398
min         0.000000
25%         7.000000
50%        20.000000
75%        48.000000
max       425.000000
Name: budget, dtype: float64

## Genre Data

In [12]:
# Row count of the raw TMDB table, for comparison with the count after dp.genre().
len(df_g)

26517

In [13]:
# Each genre_ids entry is a str (see output), not a Python list.
type(df_g['genre_ids'][0])

str

In [14]:
# Replace the raw TMDB table with the output of data_preparation.genre();
# the result keeps only genre_ids, release_date and title.
df_g = dp.genre(df_g)

In [15]:
# Confirm the reduced genre table (columns and remaining row count).
df_g.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25494 entries, 0 to 26516
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genre_ids     25494 non-null  object
 1   release_date  25494 non-null  object
 2   title         25494 non-null  object
dtypes: object(3)
memory usage: 796.7+ KB


## Runtime data

In [16]:
# Row count of movie_basics as loaded by the SELECT * query.
len(df_r)

146144

In [17]:
# Runtime distribution. The maximum in the output is far above the 75th
# percentile, so extreme outliers are present.
df_r['runtime_minutes'].describe()

count    114405.000000
mean         86.187247
std         166.360590
min           1.000000
25%          70.000000
50%          87.000000
75%          99.000000
max       51420.000000
Name: runtime_minutes, dtype: float64

In [18]:
# Rebuild df_r through data_preparation.runtime(), which takes the open SQLite
# connection rather than the frame loaded earlier; the earlier df_r is discarded.
df_r = dp.runtime(con)

In [19]:
# Confirm the runtime table: four columns, and runtime_minutes has no missing values.
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114405 entries, 0 to 114404
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         114405 non-null  object 
 1   primary_title    114405 non-null  object 
 2   start_year       114405 non-null  int64  
 3   runtime_minutes  114405 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.5+ MB


## Merge
Movie titles are lower-cased, and spaces and punctuation are removed, before merging.

In [20]:
# Budget-table titles before normalisation (mixed case, spaces, punctuation).
df_p['movie'].head()

0                                   Avatar
6                   Avengers: Infinity War
5     Star Wars Ep. VII: The Force Awakens
33                          Jurassic World
66                               Furious 7
Name: movie, dtype: object

In [21]:
# TMDB titles before normalisation.
df_g['title'].head()

0    Harry Potter and the Deathly Hallows: Part 1
1                        How to Train Your Dragon
2                                      Iron Man 2
3                                       Toy Story
4                                       Inception
Name: title, dtype: object

In [22]:
# movie_basics titles before normalisation.
df_r['primary_title'].head()

0                          Sunghursh
1    One Day Before the Rainy Season
2         The Other Side of the Wind
3           The Wandering Soap Opera
4                        A Thin Life
Name: primary_title, dtype: object

In [23]:
# Normalise the title column of every table with the same helper so the
# strings can serve as a shared merge key.
df_p['movie'] = dp.movie_name_clean(df_p['movie'])
df_g['title'] = dp.movie_name_clean(df_g['title'])
df_r['primary_title'] = dp.movie_name_clean(df_r['primary_title'])

In [24]:
# Budget-table titles after normalisation.
df_p['movie'].head()

0                           avatar
6              avengersinfinitywar
5     starwarsepviitheforceawakens
33                   jurassicworld
66                        furious7
Name: movie, dtype: object

In [25]:
# TMDB titles after normalisation.
df_g['title'].head()

0    harrypotterandthedeathlyhallowspart1
1                    howtotrainyourdragon
2                                ironman2
3                                toystory
4                               inception
Name: title, dtype: object

In [26]:
# movie_basics titles after normalisation.
df_r['primary_title'].head()

0                     sunghursh
1    onedaybeforetherainyseason
2         theothersideofthewind
3         thewanderingsoapopera
4                     athinlife
Name: primary_title, dtype: object

In [27]:
# Inner-join the profit table to each auxiliary table on the cleaned titles.
# NOTE(review): the key is the title alone, so distinct films that share a
# cleaned title would be paired with each other — confirm whether the release
# year should be part of the key.
df_genre = pd.merge(df_p, df_g, left_on='movie', right_on='title', how='inner')
df_runtime = pd.merge(df_p, df_r, left_on='movie', right_on='primary_title', how='inner')

In [28]:
# Result of the profit/genre inner merge: row count and combined columns.
df_genre.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1948 entries, 0 to 1947
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   movie              1948 non-null   object        
 1   production_budget  1948 non-null   object        
 2   worldwide_gross    1948 non-null   object        
 3   date               1948 non-null   datetime64[ns]
 4   year               1948 non-null   int64         
 5   budget             1948 non-null   float64       
 6   w_gross            1948 non-null   float64       
 7   profit             1948 non-null   float64       
 8   genre_ids          1948 non-null   object        
 9   release_date       1948 non-null   object        
 10  title              1948 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 182.6+ KB


In [29]:
# Per-genre summary of the inner-merged frame: genre count plus Median / Q25 / Q75.
# NOTE(review): the quantile columns are presumably profit — confirm in dp.genre_count.
Genre = dp.genre_count(df_genre)
Genre

Unnamed: 0,Genre_name,Genre_count,Median,Q25,Q75
15,TV_Movie,5,-1.92,-4.18,134.82
18,Western,24,-0.825,-8.1875,58.2275
5,Documentary,71,0.89,-1.165,22.855
17,War,52,5.025,-9.875,87.29
9,History,77,7.64,-7.63,36.12
11,Music,55,8.15,-2.315,59.69
6,Drama,911,8.27,-2.82,47.77
4,Crime,248,10.815,-4.0825,57.1425
13,Romance,243,15.97,-0.65,71.305
16,Thriller,513,16.96,-2.02,72.09


In [30]:
# Result of the profit/runtime inner merge: row count and combined columns.
df_runtime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2800 entries, 0 to 2799
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   movie              2800 non-null   object        
 1   production_budget  2800 non-null   object        
 2   worldwide_gross    2800 non-null   object        
 3   date               2800 non-null   datetime64[ns]
 4   year               2800 non-null   int64         
 5   budget             2800 non-null   float64       
 6   w_gross            2800 non-null   float64       
 7   profit             2800 non-null   float64       
 8   movie_id           2800 non-null   object        
 9   primary_title      2800 non-null   object        
 10  start_year         2800 non-null   int64         
 11  runtime_minutes    2800 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 284.4+ KB


In [31]:
# Median / Q25 / Q75 per runtime bucket ("Less than 60" through "over 120").
# NOTE(review): presumably quantiles of profit — confirm in dp.runtime_range.
runtime = dp.runtime_range(df_runtime)
runtime

Unnamed: 0_level_0,Median,Q25,Q75
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Less than 60,14.13,-1.57,59.4275
60 to 80,10.38,-3.915,52.0
80 to 100,5.225,-2.67,47.6625
100 to 120,20.425,-1.2925,78.8475
over 120,39.78,0.315,182.205
