In [1]:
# import libraries 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import math
import sys
from pathlib import Path
import seaborn as sns

# Append <current working directory>/src to sys.path. Path().resolve() is the
# kernel's working directory, so this only points at the project's `src` folder
# when the notebook is started from the repository root.
sys.path.append(str(Path().resolve() / 'src'))
# NOTE(review): the imports below use the `src.` package prefix, which Python
# resolves from the directory that *contains* `src`, not from the entry appended
# above. Presumably the append is there for imports made inside the modules
# themselves -- confirm, or drop it.
# NOTE(review): star imports hide which names this notebook takes from each
# module; consider importing the needed functions explicitly.
from src.data.process_data import *
from src.data.clean_data import *

IMPORTANT: these scripts/functions assume you have the following files in the data/raw directory:
- From the CMU dataset: 
    - movie.metadata.tsv
    - plot_summaries.txt
- From the TMDB dataset: 
    - TMDB_movie_dataset_v11.csv

You must also create the `data/processed` folder before running the cells below.

Note: download the CMU dataset here: https://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
and the TMDB dataset here (use the Download button): https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies

In [21]:
# Run the cleaning script: starting from the raw files, it creates the clean data files.
# The log printed below shows the dataframe shape after each cleaning step,
# once per input dataset.
%run src/data/clean_data.py

sep ,
headers []
original df shape (1127777, 24)
after status (1102507, 24)
after release date (760743, 24)
after release year (760743, 25)
after duplicates (746388, 25)
after numeric columns (746387, 25)
after select columns (746387, 13)
sep 	
headers ['wikipedia_movie_id', 'freebase_ID', 'title', 'release_year', 'revenue', 'runtime', 'languages', 'countries', 'genres']
original df shape (81740, 9)
after status (81740, 9)
after release date (81740, 9)
after release year (44006, 9)
after duplicates (43915, 9)
after numeric columns (43915, 9)
after select columns (43915, 5)


In [27]:
# from clean data files, creates a dataframe with CMU + plots & TMDB movies 
%run src/data/process_data.py
df_combined = create_cmu_tmdb_dataset('data/processed/movies.csv','data/processed/plot_summaries.csv', 'data/processed/TMDB_clean.csv')

In [28]:
# Column overview of the combined dataframe: dtype and non-null count per column.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 430770 entries, 0 to 746386
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   release_year          430770 non-null  int64 
 1   summary               364351 non-null  object
 2   release_date          430770 non-null  object
 3   budget                430770 non-null  int64 
 4   original_language     430770 non-null  object
 5   overview              364351 non-null  object
 6   genres                430770 non-null  object
 7   production_companies  430770 non-null  object
 8   production_countries  430770 non-null  object
 9   spoken_languages      430770 non-null  object
 10  keywords              430770 non-null  object
 11  runtime               430770 non-null  int64 
 12  revenue               430770 non-null  int64 
 13  title                 430767 non-null  object
 14  dvd_era               430770 non-null  object
dtypes: int64(4), object(11

In [29]:
# Preview the first rows of the combined dataframe.
# NOTE(review): in the output below every `title` appears twice back-to-back
# (e.g. "Henry VHenry V"), and `runtime` values such as 221 and 274 look like
# two runtimes added together. Presumably the merge in create_cmu_tmdb_dataset
# aggregates the CMU and TMDB rows with a sum -- TODO confirm and fix there.
df_combined.head()

Unnamed: 0,release_year,summary,release_date,budget,original_language,overview,genres,production_companies,production_countries,spoken_languages,keywords,runtime,revenue,title,dvd_era
0,1987,A series of murders of rich young women throug...,1987-06-19,0,en,"In a wealthy and isolated desert community, a ...","[Horror, Thriller]",[Mrs. White's Productions],[United Kingdom],[English],"[based on novel or book, gas station, psychopa...",221,0,White Of The EyeWhite of the Eye,pre
1,1983,"Eva, an upper class housewife, becomes frustra...",1983-05-11,0,de,"Eva, an upper-class housewife, frustratedly le...",[Drama],[Dieter Geissler Filmproduktion],[Germany],[German],"[jealousy, eroticism, gigolo, longing, dominat...",212,0,A Woman in FlamesA Woman in Flames,pre
2,2002,"Every hundred years, the evil Morgana returns...",2002-04-12,0,en,"Every hundred years, the evil sorceress Morgan...","[Adventure, Family, Fantasy]","[Peakviewing Productions, Peakviewing Transatl...",[United Kingdom],"[French, English]",[morgana],172,0,The Sorcerer's ApprenticeThe Sorcerer's Appren...,during
3,1997,"Adam, a San Francisco-based artist who works a...",1997-04-04,0,en,Best friends Adam and Kevin have a lot in comm...,"[Comedy, Romance]","[Bandeira Entertainment, Miramax]",[],[English],[],183,0,Little cityLittle City,pre
4,1989,{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,1989-10-05,9000000,en,Gritty adaption of William Shakespeare's play ...,"[War, Drama, History]","[BBC Film, Renaissance Films, Samuel Goldwyn C...",[United Kingdom],[English],"[france, kingdom, theater play, based on true ...",274,20337800,Henry VHenry V,pre


# Production countries

In [30]:
def _flatten_countries(country_lists):
    """Concatenate the per-movie country lists of one group into a single flat list."""
    flat = []
    for countries in country_lists:
        flat.extend(countries)
    return flat


# One row per dvd_era, holding every production country listed by the movies
# of that era (repeats are kept).
(
    df_combined
    .groupby('dvd_era')['production_countries']
    .apply(_flatten_countries)
    .reset_index()
)

Unnamed: 0,dvd_era,production_countries
0,during,"[United Kingdom, United States of America, Aus..."
1,post,"[United States of America, India, Germany, Pol..."
2,pre,"[United Kingdom, Germany, United Kingdom, Indi..."
