# SciFi Movie Analysis

### Importing modules

In [134]:
# importing modules
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
import requests
import os

### Saving Datasets to user computer and loading it dynamically

In [135]:
# saving to paths
genre_data = kagglehub.dataset_download("rajugc/imdb-movies-dataset-based-on-genre")
stream_data = kagglehub.dataset_download("ruchi798/movies-on-netflix-prime-video-hulu-and-disney")
print("Path to dataset files:", genre_data)
print("Path to dataset files:", stream_data)


Path to dataset files: C:\Users\drvan\.cache\kagglehub\datasets\rajugc\imdb-movies-dataset-based-on-genre\versions\3
Path to dataset files: C:\Users\drvan\.cache\kagglehub\datasets\ruchi798\movies-on-netflix-prime-video-hulu-and-disney\versions\5


In [158]:
def load_csv_files(directory):
    csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    
    dataframes = {}
    
    for file in csv_files:
        file_name = os.path.splitext(file)[0]  # Get the file name without the extension
        var_name = file_name + '_data'  # Create the variable name
        file_path = os.path.join(directory, file)
        
        # Read the CSV file into a DataFrame and assign it to the dictionary
        dataframes[var_name] = pd.read_csv(file_path)
    
    return dataframes

# Usage

data = load_csv_files(genre_data)

# Print the entire dictionary to verify the output
for key, dataframe in data.items():
    print(f"{key}:\n{dataframe.head()}\n")

action_data:
    movie_id                         movie_name  year certificate  runtime  \
0  tt9114286     Black Panther: Wakanda Forever  2022       PG-13  161 min   
1  tt1630029           Avatar: The Way of Water  2022       PG-13  192 min   
2  tt5884796                              Plane  2023           R  107 min   
3  tt6710474  Everything Everywhere All at Once  2022           R  139 min   
4  tt5433140                             Fast X  2023         NaN      NaN   

                        genre  rating  \
0    Action, Adventure, Drama     6.9   
1  Action, Adventure, Fantasy     7.8   
2            Action, Thriller     6.5   
3   Action, Adventure, Comedy     8.0   
4      Action, Crime, Mystery     NaN   

                                         description  \
0  The people of Wakanda fight to protect their h...   
1  Jake Sully lives with his newfound family form...   
2  A pilot finds himself caught in a war zone aft...   
3  A middle-aged Chinese immigrant is swept up 

In [165]:
def select_genre(dataframes):
    # Ask the user to select a genre
    genre = input("Please select a genre (e.g., scifi, action, romance): ").lower()
    var_name = genre + '_data'
    
    # Check if the genre exists in the dataframes dictionary
    if var_name in dataframes:
        print(f"You selected {genre}. Preparing the data for cleaning...\n")
        selected_data = dataframes[var_name]
        
        # Here you can add your data cleaning steps
        # For demonstration, let's just print the first few rows
        print(selected_data.head())
        
        # Return the selected DataFrame for further processing
        return selected_data
    else:
        print(f"Sorry, the genre '{genre}' is not available.")
        return None

In [166]:
select_genre(dataframe)

Sorry, the genre 'scifi' is not available.


In [155]:
def load_csv_files(directory):
    csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    
    dataframes = {}
    
    for file in csv_files:
        file_name = os.path.splitext(file)[0]
        var_name = file_name + '_data'
        file_path = os.path.join(directory, file)
        dataframes[var_name] = pd.read_csv(file_path)
     
    return dataframes
   

In [156]:
data = load_csv_files(genre_data)

# Print the entire dictionary
print(data)

# If you want to print each dataframe separately
for key, dataframe in data.items():
    print(f"{key}:\n{dataframe}\n")

{'action_data':          movie_id                         movie_name  year certificate  \
0       tt9114286     Black Panther: Wakanda Forever  2022       PG-13   
1       tt1630029           Avatar: The Way of Water  2022       PG-13   
2       tt5884796                              Plane  2023           R   
3       tt6710474  Everything Everywhere All at Once  2022           R   
4       tt5433140                             Fast X  2023         NaN   
...           ...                                ...   ...         ...   
52447  tt26743984                          Suicide 2  1999         NaN   
52448  tt26744184                              Chaos  2023         NaN   
52449  tt26744484                      Revolver Lily   NaN         NaN   
52450  tt26744765                           The Wing   NaN         NaN   
52451  tt26745202                       Vey Dharuvey   NaN         NaN   

       runtime                       genre  rating  \
0      161 min    Action, Adventure, Dram

In [None]:
# Loading datasets from dynamic path
scifi_file = os.path.join(genre_data, "scifi.csv")
stream_file = os.path.join(stream_data, "MoviesOnStreamingPlatforms.csv")
if os.path.exists(scifi_file):
    scifi_df = pd.read_csv(scifi_file)
    print("Sci-Fi dataset loaded successfully!")
else:
    print(f"Error: {scifi_file} not found!")
if os.path.exists(stream_file):
    stream_df = pd.read_csv(stream_file)
    print("Stream, dataset loaded successfully!")
else:
    print(f"Error: {stream_file} not found!")

Sci-Fi dataset loaded successfully!
Stream, dataset loaded successfully!


In [129]:
def list_files(directory):
    try:
        files = os.listdir(directory)
        return files
    except FileNotFoundError:
        return f"The directory {directory} was not found."
    except Exception as e:
        return str(e)

file_list = list_files(genre_data)
print("Files in directory:", file_list)

Files in directory: ['action.csv', 'adventure.csv', 'animation.csv', 'biography.csv', 'crime.csv', 'family.csv', 'fantasy.csv', 'film-noir.csv', 'history.csv', 'horror.csv', 'mystery.csv', 'romance.csv', 'scifi.csv', 'sports.csv', 'thriller.csv', 'war.csv']


### Cleaning the Movie Lists to prepare merging.

In [83]:
#cleaning scifi df
scifi_df.columns = (
    scifi_df.columns
    .str.upper()
    .str.strip()
    .str.replace('_',' ')
)
scifi_cleaned.columns = scifi_cleaned.columns.str.strip().str.upper()
scifi_cleaned = scifi_df.drop(
    ['MOVIE ID', 'RUNTIME', 'DIRECTOR', 'DIRECTOR ID', 'STAR', 'STAR ID', 'VOTES'], axis='columns'
    )
scifi_cleaned = scifi_cleaned.dropna()

scifi_cleaned['YEAR'] = pd.to_numeric(scifi_cleaned['YEAR'], errors='coerce')
scifi_cleaned['CERTIFICATE'] = scifi_cleaned['CERTIFICATE'].astype('category')

#scifi_cleaned.info()
scifi_cleaned.head()

Unnamed: 0,MOVIE NAME,YEAR,CERTIFICATE,GENRE,RATING,DESCRIPTION,GROSS(IN $)
6,Black Panther,2018,PG-13,"Action, Adventure, Sci-Fi",7.3,"T'Challa, heir to the hidden but advanced king...",700059566.0
7,Avatar,2009,PG-13,"Action, Adventure, Fantasy",7.9,A paraplegic Marine dispatched to the moon Pan...,760507625.0
11,Interstellar,2014,PG-13,"Adventure, Drama, Sci-Fi",8.6,A team of explorers travel through a wormhole ...,188020017.0
13,The Hunger Games,2012,PG-13,"Action, Adventure, Sci-Fi",7.2,Katniss Everdeen voluntarily takes her younger...,408010692.0
14,Dune,2021,PG-13,"Action, Adventure, Drama",8.0,A noble family becomes embroiled in a war for ...,108327830.0


### Cleaning Streaming Data to prepare for merge.

In [84]:
# Cleaning of stream data
stream_df.columns = (
    stream_df.columns
    .str.upper()
    .str.strip()
    .str.replace('_',' ')
)

stream_cleaned = stream_df.drop(
    ['UNNAMED: 0', 'ID', 'YEAR', 'AGE', 'TYPE'], axis ='columns')

stream_cleaned[['NETFLIX', 'HULU', 'PRIME VIDEO', 'DISNEY+']] = stream_cleaned[['NETFLIX', 'HULU', 'PRIME VIDEO', 'DISNEY+']].astype(bool)
stream_cleaned = stream_cleaned.rename(columns={'TITLE': 'MOVIE NAME'})
stream_cleaned = stream_cleaned.dropna()
#stream_cleaned.info()
stream_cleaned.columns = stream_cleaned.columns.str.strip().str.upper()
stream_cleaned.head()

Unnamed: 0,MOVIE NAME,ROTTEN TOMATOES,NETFLIX,HULU,PRIME VIDEO,DISNEY+
0,The Irishman,98/100,True,False,False,False
1,Dangal,97/100,True,False,False,False
2,David Attenborough: A Life on Our Planet,95/100,True,False,False,False
3,Lagaan: Once Upon a Time in India,94/100,True,False,False,False
4,Roma,94/100,True,False,False,False


### Sorted columns by rating

In [85]:
#scifi_cleaned
scifi_cleaned_sorted = scifi_cleaned.sort_values(by=['RATING'], ascending=[False])
scifi_cleaned_sorted.head(10)


Unnamed: 0,MOVIE NAME,YEAR,CERTIFICATE,GENRE,RATING,DESCRIPTION,GROSS(IN $)
18,Inception,2010,PG-13,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,292576195.0
25,The Matrix,1999,R,"Action, Sci-Fi",8.7,When a beautiful stranger leads computer hacke...,171479930.0
149,Star Wars: Episode V - The Empire Strikes Back,1980,PG,"Action, Adventure, Fantasy",8.7,After the Rebels are overpowered by the Empire...,290475067.0
71,Terminator 2: Judgment Day,1991,R,"Action, Sci-Fi",8.6,"A cyborg, identical to the one who failed to k...",204843350.0
11,Interstellar,2014,PG-13,"Adventure, Drama, Sci-Fi",8.6,A team of explorers travel through a wormhole ...,188020017.0
51,Star Wars,1977,PG,"Action, Adventure, Fantasy",8.6,Luke Skywalker joins forces with a Jedi Knight...,322740140.0
47,Alien,1979,R,"Horror, Sci-Fi",8.5,The crew of a commercial spacecraft encounter ...,78900000.0
34,Back to the Future,1985,PG,"Adventure, Comedy, Sci-Fi",8.5,"Marty McFly, a 17-year-old high school student...",210609762.0
35,The Prestige,2006,PG-13,"Drama, Mystery, Sci-Fi",8.5,"After a tragic accident, two stage magicians i...",53089891.0
55,Avengers: Infinity War,2018,PG-13,"Action, Adventure, Sci-Fi",8.4,The Avengers and their allies must be willing ...,678815482.0


In [86]:
scifi_stream_df = scifi_cleaned.merge(
    stream_cleaned[['MOVIE NAME', 'NETFLIX', 'HULU', 'PRIME VIDEO', 'DISNEY+']],
    on='MOVIE NAME',
    how='left'
)
#print(scifi_stream_df.head())
#print(scifi_stream_df.info())
scifi_stream_df


Unnamed: 0,MOVIE NAME,YEAR,CERTIFICATE,GENRE,RATING,DESCRIPTION,GROSS(IN $),NETFLIX,HULU,PRIME VIDEO,DISNEY+
0,Black Panther,2018,PG-13,"Action, Adventure, Sci-Fi",7.3,"T'Challa, heir to the hidden but advanced king...",700059566.0,False,False,False,True
1,Avatar,2009,PG-13,"Action, Adventure, Fantasy",7.9,A paraplegic Marine dispatched to the moon Pan...,760507625.0,False,False,False,True
2,Interstellar,2014,PG-13,"Adventure, Drama, Sci-Fi",8.6,A team of explorers travel through a wormhole ...,188020017.0,,,,
3,The Hunger Games,2012,PG-13,"Action, Adventure, Sci-Fi",7.2,Katniss Everdeen voluntarily takes her younger...,408010692.0,,,,
4,Dune,2021,PG-13,"Action, Adventure, Drama",8.0,A noble family becomes embroiled in a war for ...,108327830.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1186,Planetfall,2005,Not Rated,"Action, Sci-Fi, Western",2.7,A spaghetti western in science fiction clothin...,2710.0,,,,
1187,Chaos,2002,R,Sci-Fi,3.7,In the near future a young biker leaves his cr...,204274.0,,,,
1188,Jim,2010,Unrated,"Drama, Fantasy, Sci-Fi",5.5,A desperate man hires an advanced biotech firm...,13015.0,,,,
1189,The Strategist Anthology,2013,PG-13,"Action, Adventure, Sci-Fi",6.8,Isaac must stop a madman before he changes the...,1000.0,,,,


In [90]:
scifi_stream_df = scifi_stream_df.dropna()

scifi_stream_df_top_rated = scifi_stream_df.sort_values(by=['RATING'], ascending=[False])
scifi_stream_df_top_rated

Unnamed: 0,MOVIE NAME,YEAR,CERTIFICATE,GENRE,RATING,DESCRIPTION,GROSS(IN $),NETFLIX,HULU,PRIME VIDEO,DISNEY+
42,Terminator 2: Judgment Day,1991,R,"Action, Sci-Fi",8.6,"A cyborg, identical to the one who failed to k...",204843350.0,True,False,False,False
24,Alien,1979,R,"Horror, Sci-Fi",8.5,The crew of a commercial spacecraft encounter ...,78900000.0,False,False,True,False
30,Avengers: Infinity War,2018,PG-13,"Action, Adventure, Sci-Fi",8.4,The Avengers and their allies must be willing ...,678815482.0,False,False,False,True
7,Avengers: Endgame,2019,PG-13,"Action, Adventure, Drama",8.4,After the devastating events of Avengers: Infi...,858373000.0,False,False,False,True
115,WALL·E,2008,G,"Animation, Adventure, Family",8.4,"In the distant future, a small waste-collectin...",223808164.0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
713,Double Dragon,1994,PG-13,"Action, Adventure, Comedy",3.9,Two brothers have half of a powerful ancient C...,2341309.0,False,False,True,False
681,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",3.8,Two British Agents team up to stop Sir August ...,23322832.0,False,False,False,True
423,Left Behind,2014,PG-13,"Action, Fantasy, Sci-Fi",3.1,A small group of survivors is left behind afte...,13998282.0,False,False,True,False
526,Battlefield Earth,2000,PG-13,"Action, Adventure, Sci-Fi",2.5,"It's the year 3000 A.D., and the Earth is lost...",21471685.0,True,False,False,False


In [96]:
netflix_count = scifi_stream_df_top_rated[scifi_stream_df_top_rated['NETFLIX'] == True].shape[0]
hulu_count = scifi_stream_df_top_rated[scifi_stream_df_top_rated['HULU'] == True].shape[0]
print(f"NETFLIX ", netflix_count)
print(f"HULU ", hulu_count)

NETFLIX  55
HULU  69


In [126]:
def stream_count(df, column_name):
    count = df[df[column_name] == True].shape[0]
    print(f'Total available on {column_name}: {count}')
    return count

In [127]:
def stream_totals(df):
    stream_services = ['NETFLIX', 'HULU', 'PRIME VIDEO', 'DISNEY+']

    stream_counts = {}
    for service in stream_services:
        count = stream_count(df, service)
        stream_counts[service] = count
    return stream_counts

In [116]:
print(f'Total available on', stream_count(scifi_stream_df_top_rated, 'NETFLIX'), 'NETFLIX')


Total available on 55 NETFLIX


In [128]:
stream_totals(scifi_stream_df_top_rated)

Total available on NETFLIX: 55
Total available on HULU: 69
Total available on PRIME VIDEO: 76
Total available on DISNEY+: 62


{'NETFLIX': 55, 'HULU': 69, 'PRIME VIDEO': 76, 'DISNEY+': 62}