## Load in the Data

In [1]:
# importing pandas (DataFrame library) and re (regular expressions)
import pandas as pd
import re

In [2]:
# Load the gzipped movie-budgets CSV into a DataFrame
budgets_csv = './zippedData/tn.movie_budgets.csv.gz'
tn_df = pd.read_csv(budgets_csv)

## Data Manipulation/Understanding

In [3]:
# Preview the first five records of the loaded table
tn_df.head(5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [4]:
# Summarise the frame: one line per column with its non-null count and dtype,
# used here to spot missing values and columns stored as strings (object)
tn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [5]:
# Remove fully duplicated rows, if any. The result is reassigned rather than
# mutated with inplace=True so the cell reads as an explicit transformation
# and can be chained or re-run safely.
tn_df = tn_df.drop_duplicates()

In [6]:
# Convert the three money columns from strings such as "$425,000,000" to integers.
# '$' and ',' are removed with one vectorised regex replace instead of per-row
# lambdas repeated for every column. Casting to str first keeps the cell safe to
# re-run once the columns are already numeric (the earlier version raised
# AttributeError on a second run, because ints have no .strip). The explicit
# int64 keeps the result independent of the platform's default integer width;
# worldwide grosses above 2**31 appear in the data.
money_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
for column in money_columns:
    tn_df[column] = (
        tn_df[column]
        .astype(str)
        .str.replace(r'[$,]', '', regex=True)
        .astype('int64')
    )

In [7]:
# Normalise movie titles: lowercase first, then trim surrounding whitespace
lowered_titles = tn_df['movie'].str.lower()
tn_df['movie'] = lowered_titles.str.strip()

In [8]:
# Derive 'Year' (int) and 'month' (three-letter abbreviation) from release_date
# strings shaped like "Dec 18, 2009", then keep only releases from 2010 onward
# and renumber the rows from zero.
pattern = r', (\d{4})'
pattern2 = r'([A-Z][a-z]{2}) \d{1,2}, \d{4}'
release_dates = tn_df['release_date']
tn_df['Year'] = release_dates.str.extract(pattern, expand=False).astype(int)
tn_df['month'] = release_dates.str.extract(pattern2, expand=False)
tn_df = tn_df.loc[tn_df['Year'] >= 2010].reset_index(drop=True)

In [9]:
# Tally how many movies fall in each release year and each release month
year_counts = tn_df['Year'].value_counts()
month_counts = tn_df['month'].value_counts()
print('Number of movies each year:\n', year_counts)
print('Number of movies each month:\n', month_counts)

Number of movies each year:
 2015    338
2010    274
2014    255
2011    254
2013    238
2012    235
2016    219
2017    168
2018    143
2019     67
2020      3
Name: Year, dtype: int64
Number of movies each month:
 Dec    282
Oct    204
Mar    198
Sep    191
Apr    186
Aug    175
Nov    175
Jun    164
Feb    164
Jul    159
May    154
Jan    142
Name: month, dtype: int64


## Exporting to csv

In [10]:
# Write the cleaned table to disk, leaving out the row index
output_csv = 'tn_df.csv'
tn_df.to_csv(output_csv, index=False)