This notebook combines all the CSV files scraped by ScraPy spider 1 into a single list of all the movie IDs still to be scraped, ordered by box-office earnings.

In [2]:
# Mount Google Drive in this Colab runtime; the CSV paths used below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import glob

# Collect only the per-year spider outputs (movies_<year>.csv / movies_<y1>_<y2>.csv).
# The merged result is written back into this same folder at the end of the notebook,
# so a bare '*.csv' pattern would pick it up on a re-run and break the year parsing.
csv_files = glob.glob('/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_*.csv')
csv_files

['/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2023.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_1995_1996.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_1997_1998.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_1999_2000.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2001_2002.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2003_2004.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2005_2006.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2021_2022.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/movies_2017_2018.csv',
 '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Co

In [34]:
import pandas as pd

def extract_year(filename):
    """Return the year encoded at the end of a scraped CSV's name.

    The text after the final underscore, with '.csv' removed, is parsed as an
    integer, so 'movies_2023.csv' gives 2023 and 'movies_1995_1996.csv' gives
    1996 (the later year of a two-year file).

    Raises ValueError (from int()) when that trailing text is not a whole number.
    """
    trailing = filename.rsplit('_', 1)[-1]
    return int(trailing.replace('.csv', ''))

# Sort csv_files in descending order based on the year (newest file first)
csv_files.sort(key=extract_year, reverse=True)

# Read every file once (as pipe-delimited text), tag its rows with the file's year,
# and collect the frames. Concatenating a single time at the end avoids re-copying
# the growing frame on every iteration, which is what pd.concat inside the loop did.
frames = []
for file in csv_files:
    df = pd.read_csv(file, delimiter='|')
    year = extract_year(file)
    df['Year'] = year
    frames.append(df)

# Each file's own row index is kept (no ignore_index), exactly as before.
# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
merged_df = pd.concat(frames) if frames else pd.DataFrame()



In [35]:
# Check the merged frame's row count, columns, dtypes and non-null counts
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101588 entries, 0 to 3024
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Title        101588 non-null  object
 1   Gross        101588 non-null  object
 2   Details URL  101588 non-null  object
 3   Genres       99739 non-null   object
 4   Year         101588 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 4.7+ MB


In [36]:
# Preview the merged frame; rows are still in file order here (newest year first)
merged_df

Unnamed: 0,Title,Gross,Details URL,Genres,Year
0,R BnB,7460,/title/tt12338584/,"\nDrama, Romance, Thriller",2023
1,Trinket Box,2979,/title/tt21080296/,"\nDrama, Horror",2023
2,Mission: Impossible - Dead Reckoning Part One,78587,/title/tt9603212/,"\nAction, Adventure, Thriller",2023
3,Sound of Freedom,31260,/title/tt7599146/,"\nAction, Biography, Drama",2023
4,Oppenheimer,69435,/title/tt15398776/,"\nBiography, Drama, History",2023
...,...,...,...,...,...
3020,Space Jam,90463534,/title/tt0117705/,"\nAnimation, Adventure, Comedy",1996
3021,Waterworld,88246220,/title/tt0114898/,"\nAction, Adventure, Sci-Fi",1996
3022,Dangerous Minds,84919401,/title/tt0112792/,"\nBiography, Drama",1996
3023,Mr. Holland's Opus,82569971,/title/tt0113862/,"\nDrama, Music",1996


Next, I want to sort the whole table by gross revenue in descending order. However, some 2023 movies have not yet finished their box-office run, so their gross figures are still incomplete; I therefore want all 2023 movies placed at the top of the list.

In [37]:
# Strip thousands separators from 'Gross' and convert the column to numbers.
# Only string cells are edited: `.str.replace` turns non-string cells into NaN, so a
# column that mixes text with already-numeric values would silently lose those values,
# and re-running this cell after the conversion would fail on the `.str` accessor.
gross_no_commas = merged_df['Gross'].map(
    lambda value: value.replace(',', '') if isinstance(value, str) else value
)
merged_df['Gross'] = pd.to_numeric(gross_no_commas)

In [38]:
# Sort the DataFrame by 'Gross' in descending order (highest-grossing titles first)
merged_df = merged_df.sort_values(by='Gross', ascending=False)

In [39]:
# Split the sorted frame by 'Year': one boolean mask selects the 2023 rows,
# and its inverse selects every other year
is_2023 = merged_df['Year'] == 2023
df_2023 = merged_df[is_2023]
df_rest = merged_df[~is_2023]

In [40]:
# Put the 2023 rows first, followed by every other year.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat with the
# same two frames produces the same stacked result and keeps working on newer versions.
merged_df = pd.concat([df_2023, df_rest])

  merged_df = df_2023.append(df_rest)


In [41]:
# Keep a single row per movie, keyed on its 'Details URL'.
# keep='first' (the pandas default, spelled out here) retains whichever duplicate
# comes earliest in the frame's current row order.
merged_df = merged_df.drop_duplicates(subset=['Details URL'], keep='first')



In [42]:
# Preview the final list: 2023 rows first, then the remaining years by descending gross
merged_df

Unnamed: 0,Title,Gross,Details URL,Genres,Year
4387,Top Gun: Maverick,718732821,/title/tt1745960/,"\nAction, Drama",2023
4388,Avatar: The Way of Water,659682302,/title/tt1630029/,"\nAction, Adventure, Fantasy",2023
4389,Black Panther: Wakanda Forever,453721831,/title/tt9114286/,"\nAction, Adventure, Drama",2023
4390,Doctor Strange in the Multiverse of Madness,411331607,/title/tt9419884/,"\nAction, Adventure, Fantasy",2023
4391,Jurassic World Dominion,376851080,/title/tt8041270/,"\nAction, Adventure, Sci-Fi",2023
...,...,...,...,...,...
9240,Titus Androgynous,5,/title/tt11290772/,\nDrama,2020
9242,Hafiz,5,/title/tt12837942/,\nDrama,2020
8424,Aussig,5,/title/tt2022337/,\nComedy,2010
8970,Gena,5,/title/tt7304146/,"\nDrama, Thriller",2018


In [44]:
# Save the final frame as a new CSV, leaving out the DataFrame index
merged_output_path = '/content/drive/MyDrive/IMDB Project/Scraping/scraped_data/ScraPy_Code_1.1_data/merged_movies3.csv'
merged_df.to_csv(merged_output_path, index=False)