In [60]:
import pandas as pd
import numpy as np

In [85]:
# Read the raw movie dataset; '\n' is passed explicitly as the line terminator.
csv_path = 'mymoviedb.csv'
df = pd.read_csv(csv_path, lineterminator='\n')

# Report column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


In [86]:
# Peek at the first five raw Genre strings.
df['Genre'].head(n=5)

0    Action, Adventure, Science Fiction
1              Crime, Mystery, Thriller
2                              Thriller
3    Animation, Comedy, Family, Fantasy
4      Action, Adventure, Thriller, War
Name: Genre, dtype: object

In [87]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated(keep='first').sum()

np.int64(0)

Basic Statistics

In [88]:
# Median of each numeric measure.
numeric_cols = ['Popularity', 'Vote_Count', 'Vote_Average']
df[numeric_cols].median()

Popularity       21.199
Vote_Count      444.000
Vote_Average      6.500
dtype: float64

In [89]:
# Summary statistics restricted to the numeric columns.
df.describe(include=[np.number])


Unnamed: 0,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0
mean,40.326088,1392.805536,6.439534
std,108.873998,2611.206907,1.129759
min,13.354,0.0,0.0
25%,16.1285,146.0,5.9
50%,21.199,444.0,6.5
75%,35.1915,1376.0,7.1
max,5083.954,31077.0,10.0


In [None]:
# ----------------------------------------------------Summary-------------------------------------------------------
# we have a dataframe consisting of 9827 rows and 9 columns.
# our dataset looks fairly tidy, with no NaNs and no duplicated rows.
# Release_Date column needs to be cast to datetime so that only the year can be extracted.
# Overview, Original_Language, and Poster_Url won't be very useful during analysis.
# there are noticeable outliers in the Popularity column.
# Vote_Average is better categorised for proper analysis.
# Genre column has comma-separated values and white space that need to be handled and cast into categories.

Typecasting Object to DateTime datatype

In [90]:
# Parse the raw date strings into pandas datetimes and confirm the resulting dtype.
release_dates = pd.to_datetime(df['Release_Date'])
print(release_dates.dtype)

# Keep only the calendar year; the column name is left unchanged in this cell.
df['Release_Date'] = release_dates.dt.year

# Fixed seed so the displayed sample is the same on every run.
df.sample(10, random_state=42)




datetime64[ns]


Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
33,2022,The Hunting,When a mysterious animal attack leaves a mutil...,761.127,10,5.8,en,Horror,https://image.tmdb.org/t/p/original/kvhrltQIRp...
216,2018,My Hero Academia: Two Heroes,All Might and Deku accept an invitation to go ...,174.756,743,8.0,ja,"Animation, Action, Adventure, Fantasy",https://image.tmdb.org/t/p/original/hC4nTxdhXq...
3529,1959,Rio Bravo,The sheriff of a small town in southwest Texas...,26.799,827,7.9,en,Western,https://image.tmdb.org/t/p/original/5hFxzqpucu...
4689,1967,Casino Royale,Sir James Bond is called back out of retiremen...,21.978,534,5.4,en,"Adventure, Action, Comedy",https://image.tmdb.org/t/p/original/9H942P5Wyw...
9586,2012,Populaire,Rose Pamphyle lives with her widowed father an...,13.547,456,6.6,fr,"Comedy, Romance",https://image.tmdb.org/t/p/original/jx65eLATUd...
3238,2019,Night Hunter,A Minnesota police officer crosses paths with ...,28.511,595,6.5,en,"Drama, Thriller",https://image.tmdb.org/t/p/original/tAMIXRTHRI...
6854,2011,Confessions of a Brazilian Call Girl,"Rachel is a girl, adopted by an upper middle c...",16.918,338,6.3,pt,Drama,https://image.tmdb.org/t/p/original/9htMnu5Zg3...
663,1993,Looking Back at it All: The Dragon Ball Z Year...,This movie is very different from the previous...,87.333,23,7.0,ja,"Action, Animation, Comedy",https://image.tmdb.org/t/p/original/sOtvIOc65g...
6721,2008,Drillbit Taylor,Three kids hire a low-budget bodyguard to prot...,17.142,808,5.7,en,Comedy,https://image.tmdb.org/t/p/original/shqO696kFn...
7544,2014,The Invisible Boy,"Michele is thirteen year old, shy, unpopular a...",15.885,873,6.2,it,"Comedy, Adventure, Science Fiction, Fantasy",https://image.tmdb.org/t/p/original/rmuU1ExeIZ...


Renaming the Column

In [91]:
# Rename Release_Date to Release_Year.
# Reassigning the result (instead of inplace=True) keeps the step an explicit
# df -> df transformation.
df = df.rename(columns={'Release_Date': 'Release_Year'})
df.head()


Unnamed: 0,Release_Year,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


Dropping Columns

In [92]:
# Remove the columns that are not needed for the analysis.
# `columns=` names the axis explicitly, and the result is reassigned rather
# than mutated in place.
df = df.drop(columns=['Overview', 'Poster_Url', 'Original_Language'])
df.head()


Unnamed: 0,Release_Year,Title,Popularity,Vote_Count,Vote_Average,Genre
0,2021,Spider-Man: No Way Home,5083.954,8940,8.3,"Action, Adventure, Science Fiction"
1,2022,The Batman,3827.658,1151,8.1,"Crime, Mystery, Thriller"
2,2022,No Exit,2618.087,122,6.3,Thriller
3,2021,Encanto,2402.201,5076,7.7,"Animation, Comedy, Family, Fantasy"
4,2021,The King's Man,1895.511,1793,7.0,"Action, Adventure, Thriller, War"


In [93]:
# Numeric summary of the trimmed frame (quartiles requested explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Release_Year,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0,9827.0
mean,2006.203623,40.326088,1392.805536,6.439534
std,15.685554,108.873998,2611.206907,1.129759
min,1902.0,13.354,0.0,0.0
25%,2000.0,16.1285,146.0,5.9
50%,2011.0,21.199,444.0,6.5
75%,2017.0,35.1915,1376.0,7.1
max,2024.0,5083.954,31077.0,10.0


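Creating a new column called 'Rating' using a lambda function, which classifies movies into four groups based on where their Vote_Average falls relative to the quartiles of that column: not popular (at or below the 25th percentile), below average (above that, up to the median), average (above the median, up to the 75th percentile), and popular (above the 75th percentile). This categorization provides a quick and intuitive understanding of each movie's rating, making it easier to assess overall popularity at a glance.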


In [94]:
# Quartile cut points of Vote_Average (25th, 50th and 75th percentiles).
q1, q2, q3 = df['Vote_Average'].quantile([0.25, 0.50, 0.75])

# (upper bound, label) pairs in ascending order; the first bound that a vote
# does not exceed decides its label.
rating_bands = [
    (q1, 'not popular'),
    (q2, 'below average'),
    (q3, 'average'),
]

# Anything above the 75th percentile falls through to 'popular'.
df['Rating'] = df['Vote_Average'].apply(
    lambda vote: next(
        (label for upper, label in rating_bands if vote <= upper),
        'popular',
    )
)



In [95]:
# Number of movies in each Rating category, most frequent first.
df['Rating'].value_counts(sort=True, ascending=False)

Rating
not popular      2567
popular          2450
average          2412
below average    2398
Name: count, dtype: int64

Dropping Null/NaN Values

In [96]:
# Discard every row that has a missing value in any column.
# The result is reassigned rather than mutated in place.
df = df.dropna()

# Confirm, column by column, that no nulls remain.
df.isna().sum()

Release_Year    0
Title           0
Popularity      0
Vote_Count      0
Vote_Average    0
Genre           0
Rating          0
dtype: int64

Now that the data is cleaned, we will split each Genre string into a list and then
explode the dataframe so that each row holds exactly one genre per movie

In [97]:
# Split each comma-separated Genre string into a list, then emit one row per
# (movie, genre) pair with a fresh 0..n-1 index.
df = (
    df.assign(Genre=df['Genre'].str.split(','))
      .explode('Genre')
      .reset_index(drop=True)
)

# Splitting on ',' alone and stripping afterwards tolerates inconsistent
# spacing around the commas, so no genre keeps stray leading/trailing blanks.
df['Genre'] = df['Genre'].str.strip()
df.head()

Unnamed: 0,Release_Year,Title,Popularity,Vote_Count,Vote_Average,Genre,Rating
0,2021,Spider-Man: No Way Home,5083.954,8940,8.3,Action,popular
1,2021,Spider-Man: No Way Home,5083.954,8940,8.3,Adventure,popular
2,2021,Spider-Man: No Way Home,5083.954,8940,8.3,Science Fiction,popular
3,2022,The Batman,3827.658,1151,8.1,Crime,popular
4,2022,The Batman,3827.658,1151,8.1,Mystery,popular


Exporting the Cleaned Data, so we can get insights

In [98]:
# Write the cleaned frame to disk without the row index.
output_path = 'cleaned_data.csv'
df.to_csv(output_path, index=False)