## Set up libraries, dataset path and movie genres

In [287]:
import pandas as pd
import os
# Relative path of the folder that holds the per-genre dataset files
dataset_path = "../dataset"

# Genre keywords used throughout the notebook (one dataframe is built per keyword)
movie_genres = [
    'action',
    'adventure',
    'animation',
    'biography',
    'crime',
    'family',
    'fantasy',
    'film-noir',
    'history',
    'horror',
    'mystery',
    'romance',
    'scifi',
    'sports',
    'thriller',
    'war',
]

## Read each file and store dataframes separately

In [288]:
# Load every dataset file into one dataframe per genre.
# A file is assigned to a genre when the genre keyword occurs in its file name.
frames_by_genre = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of
# genres spread over several files (and the seeded sampling done later) reproducible.
for file in sorted(os.listdir(dataset_path)):
    for genre in movie_genres:
        if genre in file:
            # Read the file separately for every matching genre so that no two
            # genres ever share the same dataframe object.
            frames_by_genre.setdefault(genre, []).append(
                pd.read_csv(os.path.join(dataset_path, file))
            )

# Define a dictionary to store dataframes for each genre.
# Concatenate once per genre instead of growing a dataframe inside the loop.
movies_by_genre = {
    genre: frames[0] if len(frames) == 1 else pd.concat(frames)
    for genre, frames in frames_by_genre.items()
}

# Print the number of rows and columns in each genre dataframe without any data quality checks
print("genre, number of rows, number of columns")
for genre, movies_df in movies_by_genre.items():
    print(genre, movies_df.shape)

genre, number of rows, number of columns
action (52452, 14)
adventure (25664, 14)
animation (8419, 14)
biography (8289, 14)
crime (35852, 14)
family (17095, 14)
fantasy (17163, 14)
film-noir (986, 14)
history (8996, 14)
horror (36682, 14)
mystery (18960, 14)
romance (52617, 14)
scifi (16557, 14)
sports (5292, 14)
thriller (53365, 14)
war (9911, 14)


## Perform data quality checks

In [289]:
# Dimensionality reduction
# For each dataframe in the dictionary, rename the columns and select only the columns we need.
column_renames = {'gross(in $)': 'gross', 'movie_name': 'movie'}
selected_columns = ['movie', 'year', 'runtime', 'rating', 'director', 'star', 'votes', 'gross']

for genre, movies_df in movies_by_genre.items():
    # rename() without inplace leaves the loaded frame untouched, and .copy() makes the
    # selection an independent frame so that later column assignments on it do not
    # trigger SettingWithCopyWarning.
    movies_by_genre[genre] = movies_df.rename(columns=column_renames)[selected_columns].copy()

# print the columns selected for each genre
for genre, movies_df in movies_by_genre.items():
    print(genre, movies_df.columns.values)

action ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
adventure ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
animation ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
biography ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
crime ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
family ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
fantasy ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
film-noir ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
history ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
horror ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
mystery ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
romance ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'votes' 'gross']
scifi ['movie' 'year' 'runtime' 'rating' 'director' 'star' 'v

## The study of trends over the years, specifically the popularity of genres
For this case study we consider the columns that can be used to visualize trends in the movie industry over the years: movie, year, runtime, rating, director, and star. We also keep the gross column for further analysis.

In [290]:
# Keep only the rows whose 'year' value is present (one filtered dataframe per genre)
popularity_of_genres = {
    genre: movies_df.dropna(subset=['year'])
    for genre, movies_df in movies_by_genre.items()
}

# Report, per genre, how many rows were dropped because 'year' was missing
print("null values for each genre")
print("----------------------------------")
for genre, with_year_df in popularity_of_genres.items():
    dropped_rows = len(movies_by_genre[genre]) - len(with_year_df)
    print(genre, dropped_rows)



null values for each genre
----------------------------------
action 8259
adventure 3689
animation 1369
biography 2005
crime 3563
family 1631
fantasy 2814
film-noir 0
history 1295
horror 7591
mystery 2004
romance 3437
scifi 4018
sports 575
thriller 10137
war 861


## Noise analysis

In [291]:
def _is_valid_year(year_text):
    """Return a boolean Series: True where the value is a 4-digit year in 1950-2023.

    A plain string comparison would also accept short fragments such as '196' or '2'
    ('196' sorts between '1950' and '2023'), so the value must first consist of exactly
    four digits. For four-digit strings, lexicographic order equals numeric order.
    """
    four_digits = year_text.str.fullmatch(r'[0-9]{4}', na=False)
    return four_digits & (year_text >= '1950') & (year_text <= '2023')


# try to normalize the year column values to a 4 digit year format,
# then remove values that are not in the years 1950-2023
for genre, movies_df in movies_by_genre.items():
    year_text = movies_df['year'].astype(str).str[:4]
    # assign() builds a new frame instead of writing into a possible slice of the loaded data
    movies_by_genre[genre] = movies_df.assign(year=year_text)[_is_valid_year(year_text)]

# NOTE(review): popularity_of_genres was built before this cell and is the dictionary that
# gets sampled afterwards, so this filtering does not reach the sample - confirm intent.

# check if there are noises in the year column (expected: 0 for every genre)
for genre, movies_df in movies_by_genre.items():
    noisy_rows = int((~_is_valid_year(movies_df['year'])).sum())
    print(genre, noisy_rows)



action 0
adventure 0
animation 0
biography 0
crime 0
family 0
fantasy 0
film-noir 0
history 0
horror 0
mystery 0
romance 0
scifi 0
sports 0
thriller 0
war 0


In [292]:
# Draw a 5% random sample with a fixed seed from every genre dataframe
popularity_of_genres_sample = {
    genre: genre_df.sample(frac=0.05, random_state=1)
    for genre, genre_df in popularity_of_genres.items()
}

# Show how many movies ended up in each genre's sample
for genre, sampled_df in popularity_of_genres_sample.items():
    print(genre, len(sampled_df))

action 2210
adventure 1099
animation 352
biography 314
crime 1614
family 773
fantasy 717
film-noir 49
history 385
horror 1455
mystery 848
romance 2459
scifi 627
sports 236
thriller 2161
war 452


## Feature Generation

In [295]:
# Tag every sampled dataframe with the genre it was drawn from
for genre, sampled_df in popularity_of_genres_sample.items():
    sampled_df['genre'] = genre

# Stack the per-genre samples into a single dataframe (original row labels are kept)
sampled_frames = [popularity_of_genres_sample[genre] for genre in popularity_of_genres_sample]
all_movie_genres_sample = pd.concat(sampled_frames)

all_movie_genres_sample

Unnamed: 0,movie,year,runtime,rating,director,star,votes,gross,genre
0,Black Panther: Wakanda Forever,2022,161 min,6.9,Ryan Coogler,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...",204835.0,,action
1,Avatar: The Way of Water,2022,192 min,7.8,James Cameron,"Sam Worthington, \nZoe Saldana, \nSigourney We...",295119.0,,action
2,Plane,2023,107 min,6.5,Jean-François Richet,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...",26220.0,,action
3,Everything Everywhere All at Once,2022,139 min,8.0,"Dan Kwan, \nDaniel Scheinert","Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...",327858.0,,action
4,Fast X,2023,,,Louis Leterrier,"Vin Diesel, \nJordana Brewster, \nTyrese Gibso...",,,action
...,...,...,...,...,...,...,...,...,...
9901,Opération Maillot,2015,,,Okacha Touita,"Martin Pautard, \nMohamed Seghir Bendaoud, \nM...",,,war
9903,Pepeng Hapon,1966,,,"Jun Aristorenas, \nBerting Labra, \nGina Alonz...",,,,war
9904,Zhongqing yi hao,1970,,,Che-Fu Liang,"Shan Kwan, \nHsiao Yen Chang, \nChiang Cheng, ...",,,war
9909,Pandora Palace,2019,,,Evan Kascinde,"Muhammad Al Abasiri, \nBrai Andujar, \nDon Bla...",,,war
