In [1]:
import numpy as np
import pandas as pd

In [2]:
movie_df = pd.read_csv('movie.csv')
movie_df

Unnamed: 0,Name,Released year,Genre,Director,Runtime,Score,Status,Language,Budget,Revenue
0,Fast X,2023,"['Action', 'Crime', 'Thriller']",Dan Mazeau,2h 22m,72.0,Released,English,"$340,000,000.00","$704,709,660.00"
1,Trolls Band Together,2023,"['Animation', 'Family', 'Music', 'Fantasy', 'C...",Thomas Dam,1h 32m,72.0,Released,English,"$95,000,000.00","$173,800,000.00"
2,Robot Apocalypse,2021,"['Science Fiction', 'Action']",Marcus Friedlander,1h 27m,21.0,Released,English,-,-
3,Five Nights at Freddy's,2023,"['Horror', 'Mystery']",Emma Tammi,1h 50m,78.0,Released,English,"$20,000,000.00","$286,700,000.00"
4,Oppenheimer,2023,"['Drama', 'History']",Christopher Nolan,3h 1m,81.0,Released,English,"$100,000,000.00","$951,000,000.00"
...,...,...,...,...,...,...,...,...,...,...
4995,Secret in Their Eyes,2015,"['Thriller', 'Mystery', 'Drama', 'Crime']",Billy Ray,1h 51m,64.0,Released,English,"$19,500,000.00","$34,854,990.00"
4996,The Vatican Tapes,2015,"['Thriller', 'Horror']",Mark Neveldine,1h 31m,53.0,Released,English,"$13,000,000.00","$1,784,763.00"
4997,Song to Song,2017,"['Romance', 'Drama', 'Music']",Terrence Malick,2h 9m,55.0,Released,English,"$10,000,000.00","$1,710,528.00"
4998,Divine Intervention,2023,['Comedy'],Pedro Pablo Ibarra,1h 40m,82.0,Released,Spanish; Castilian,-,-


In [3]:
n_rows=movie_df.shape[0]
n_cols=movie_df.shape[1]
n_rows, n_cols

(5000, 10)

#TODO: GIVE OBSERVATION ABOUT MEANING OF EACH ROWS, COLUMNS

After finding meaning of each rows, each columns in the data, we check whether data have duplicate rows

In [4]:
duplicated_rows = movie_df[movie_df.duplicated()]
len(duplicated_rows)

0

There is no duplicate row in data. We notice that there are many missing data (NaN) in the CSV file. We need to find how many percentage of missing data there are.

In [5]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           5000 non-null   object 
 1   Released year  5000 non-null   int64  
 2   Genre          5000 non-null   object 
 3   Director       5000 non-null   object 
 4   Runtime        5000 non-null   object 
 5   Score          5000 non-null   float64
 6   Status         5000 non-null   object 
 7   Language       5000 non-null   object 
 8   Budget         5000 non-null   object 
 9   Revenue        5000 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 390.8+ KB


We can see that missing value is described as '[]'or '-' in data. Now we will caculate missing percent of each column

In [6]:
movie_df.replace(['[]', '-',' -'], np.nan, inplace=True)
missing_values = movie_df.isnull().sum()
# Calculate missing ratio for each column
missing_ratio = (missing_values / len(movie_df)) * 100
missing_ratio

Name              0.00
Released year     0.00
Genre             0.12
Director          0.10
Runtime           0.86
Score             0.00
Status            0.00
Language          0.00
Budget           34.50
Revenue          32.30
dtype: float64

From observing data and the result, we can see that there are no columns have >50% missing values, so we don't need to drop any column

## CATERGORICAL COLUMNS

We can see that `Released year` is a categorical column, so we need to change it to `str` instead of `int64`

In [7]:
movie_df['Released year']=movie_df['Released year'].astype('str')
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           5000 non-null   object 
 1   Released year  5000 non-null   object 
 2   Genre          4994 non-null   object 
 3   Director       4995 non-null   object 
 4   Runtime        4957 non-null   object 
 5   Score          5000 non-null   float64
 6   Status         5000 non-null   object 
 7   Language       5000 non-null   object 
 8   Budget         3275 non-null   object 
 9   Revenue        3385 non-null   object 
dtypes: float64(1), object(9)
memory usage: 390.8+ KB


We need to count how many value are there in `Genre`, `Status`, `Language` 

In [8]:
selected_cols_profile={}
movie_df['Genre'] = movie_df['Genre'].str.replace('[', '').str.replace(']', '')
count_genre = movie_df['Genre'].str.split(', ').explode().value_counts().to_dict()
count_status = movie_df['Status'].value_counts().to_dict()
count_language = movie_df['Language'].str.split('; ').explode().value_counts().to_dict()
selected_cols_profile['Genre'] = {"num_diff_vals": len(count_genre), "distribution": count_genre}
selected_cols_profile['Status'] = {"num_diff_vals": len(count_status), "distribution": count_status}
selected_cols_profile['Language'] = {"num_diff_vals": len(count_language), "distribution": count_language}
selected_cols_profile=pd.DataFrame(data=selected_cols_profile,index=["num_diff_vals", "distribution"])
selected_cols_profile

  movie_df['Genre'] = movie_df['Genre'].str.replace('[', '').str.replace(']', '')


Unnamed: 0,Genre,Status,Language
num_diff_vals,19,3,46
distribution,"{''Drama'': 1691, ''Action'': 1637, ''Comedy''...","{'Released': 4988, 'Post Production': 7, 'In P...","{'English': 4052, 'Japanese': 308, 'Spanish': ..."


Value of these columns seem normal

## NUMERIC COLUMNS

In [9]:
def convert_runtime_to_float(runtime):
    try:
        # Split hours and minutes
        hours, minutes = map(int, runtime.replace('h', '').replace('m', '').split())
        return hours + round(minutes / 60,2)
    except:
        return None

# Apply the conversion function to the 'Runtime' column
movie_df['Runtime'] = movie_df['Runtime'].apply(convert_runtime_to_float)
movie_df.rename(columns={'Runtime': 'Runtime (h)'}, inplace=True)

Convert numeric columns `Budget` and `Revenue` from `object` to `float64`

In [11]:
movie_df['Budget'] = movie_df['Budget'].replace('[\$,]', '', regex=True).astype(float)
movie_df['Revenue'] = movie_df['Revenue'].replace('[\$,]', '', regex=True).astype(float)

movie_df.rename(columns={'Budget': 'Budget ($)'}, inplace=True)
movie_df.rename(columns={'Revenue': 'Revenue ($)'}, inplace=True)

In [12]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           5000 non-null   object 
 1   Released year  5000 non-null   object 
 2   Genre          4994 non-null   object 
 3   Director       4995 non-null   object 
 4   Runtime (h)    4729 non-null   float64
 5   Score          5000 non-null   float64
 6   Status         5000 non-null   object 
 7   Language       5000 non-null   object 
 8   Budget ($)     3275 non-null   float64
 9   Revenue ($)    3385 non-null   float64
dtypes: float64(4), object(6)
memory usage: 390.8+ KB


### Caculate quantile of numeric cols

In [14]:
numeric_cols=['Runtime (h)','Score','Budget ($)','Revenue ($)']
numeric_col_profile = movie_df[numeric_cols].quantile([0, 0.25, 0.5, 0.75, 1])
numeric_col_profile.index = ['min', '25%', '50%', '75%', 'max']
numeric_col_profile

Unnamed: 0,Runtime (h),Score,Budget ($),Revenue ($)
min,1.02,0.0,1.0,1.0
25%,1.55,61.0,10739200.0,21502980.0
50%,1.73,67.0,30000000.0,75850620.0
75%,1.97,73.0,65000000.0,186800000.0
max,5.55,100.0,460000000.0,2923706000.0


In [15]:
movie_df[movie_df['Budget ($)']==1] ## check thử cho vui

Unnamed: 0,Name,Released year,Genre,Director,Runtime (h),Score,Status,Language,Budget ($),Revenue ($)
3065,Down,2019,"'Horror', 'Drama', 'Thriller'",Daniel Stamm,1.37,68.0,Released,English,1.0,
3511,All About Lily Chou-Chou,2001,"'Drama', 'Crime'",Shunji Iwai,2.43,73.0,Released,Japanese,1.0,
