# Loading data

In [1]:
import pandas as pd

# Load the raw movie table from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='top_movies.csv')

First look at the DataFrame

In [2]:
# (rows, columns) of the freshly loaded frame.
df.shape

(2000, 10)

In [3]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Movie Name       object
Release Year     object
Duration          int64
IMDB Rating     float64
Metascore       float64
Votes            object
Genre            object
Director         object
Cast             object
Gross            object
dtype: object

Standardize the column names.

In [4]:
# Normalise the column labels to snake_case: lower-case each label and
# replace spaces with underscores.
df.columns = df.columns.map(lambda label: label.lower().replace(' ', '_'))

Clean the values of the release year.

In [5]:
# List the distinct raw values to see which formats need cleaning.
df['release_year'].unique()

array(['1972', '1974', '1980', '1962', '1971', '1977', '1968', '1967',
       '1960', '1940', '1946', '1957', '1965', '1966', '1953', '1948',
       '1963', '1958', '1964', '1975', '1954', '1979', '1976', '1961',
       '1941', '1949', '1950', '1956', '1973', '1981', '1970',
       '1964–1968', '1939', '1955', '1945', '1978', '1959', '1944',
       '1969', '1951', '1982', '1931', '1952', '1926', '1925', '1934',
       '1936', '1927', '1921', '1932', '1933', '1937', '1988', '1986',
       '1983', '1987', '1985', '1991', '1984', '1989', '1990', 'I 1987',
       '1992', 'I 1990', '1996', '1995', '1993', '1994', '1997', 'I 1995',
       'I 1996', '1995–2016', 'I 1999', '2001', '1998 Video', '1999',
       '2000', '1998', 'I 1998', 'I 2000', '1998 TV Movie',
       '1999 TV Movie', '2001 TV Movie', '2004', '2003', '2002', 'I 2003',
       'I 2002', 'I 2004', '2005', '2006', '2005 Video', 'I 2005',
       'II 2004', '2006 TV Movie', 'I 2006', '2007', '2008', 'I 2007',
       'I 2008', 'II 20

In [6]:
import re

def extract_year(value):
    """Return the first four-digit year (19xx or 20xx) found in ``value``.

    Raw release-year cells are not always a bare year: they can carry a
    prefix ('I 1987'), a suffix ('1998 TV Movie') or a range ('1964–1968').
    Only the first standalone year is kept.

    Parameters
    ----------
    value : object
        The raw cell. Strings are searched directly. Missing values
        (None, NaN, pd.NA) yield None. Any other scalar is converted with
        ``str()`` before searching.

    Returns
    -------
    str or None
        The matched year as a string, or None when no year is present.
    """
    if not isinstance(value, str):
        # re.search raises TypeError on non-strings; a missing cell must not
        # abort the whole .apply(), so it is mapped to None instead.
        if pd.isna(value):
            return None
        value = str(value)
    # \b on both sides keeps digits embedded in a longer number from matching.
    match = re.search(r'\b(?:19|20)\d{2}\b', value)
    return match.group(0) if match else None

# Extract the year from every raw 'release_year' value, then convert the
# result to a numeric column; anything that cannot be parsed becomes NaN.
df['clean_year'] = pd.to_numeric(
    df['release_year'].apply(extract_year),
    errors='coerce',
)

Clean the genre column.

In [7]:
# Separar los géneros en la columna 'genre' por comas
df['genres'] = df['genre'].str.split(',')
# Explode la columna de géneros para tener una fila por cada género
df_exploded = df.explode('genres')
#Quitamos los espacios y las minúsculas
df_exploded['genres'] = df_exploded['genres'].str.strip().str.lower()
df.head(10)

Unnamed: 0,movie_name,release_year,duration,imdb_rating,metascore,votes,genre,director,cast,gross,clean_year,genres
0,The Godfather,1972,175,9.2,100.0,2002655,"Crime, Drama",Francis Ford Coppola,Marlon Brando,$134.97M,1972,"[Crime, Drama]"
1,The Godfather Part II,1974,202,9.0,90.0,1358608,"Crime, Drama",Francis Ford Coppola,Al Pacino,$57.30M,1974,"[Crime, Drama]"
2,Ordinary People,1980,124,7.7,86.0,56476,Drama,Robert Redford,Donald Sutherland,$54.80M,1980,[Drama]
3,Lawrence of Arabia,1962,218,8.3,100.0,313044,"Adventure, Biography, Drama",David Lean,Peter O'Toole,$44.82M,1962,"[Adventure, Biography, Drama]"
4,Straw Dogs,1971,113,7.4,73.0,64331,"Crime, Drama, Thriller",Sam Peckinpah,Dustin Hoffman,,1971,"[Crime, Drama, Thriller]"
5,Close Encounters of the Third Kind,1977,138,7.6,90.0,216050,"Drama, Sci-Fi",Steven Spielberg,Richard Dreyfuss,$132.09M,1977,"[Drama, Sci-Fi]"
6,Once Upon a Time in the West,1968,166,8.5,82.0,348110,Western,Sergio Leone,Henry Fonda,$5.32M,1968,[Western]
7,The Dirty Dozen,1967,150,7.7,73.0,78858,"Action, Adventure, War",Robert Aldrich,Lee Marvin,$45.30M,1967,"[Action, Adventure, War]"
8,Rosemary's Baby,1968,137,8.0,96.0,234034,"Drama, Horror",Roman Polanski,Mia Farrow,,1968,"[Drama, Horror]"
9,Cabaret,1972,124,7.8,80.0,59119,"Drama, Music, Musical",Bob Fosse,Liza Minnelli,$42.77M,1972,"[Drama, Music, Musical]"


In [8]:
# Distinct genre labels after stripping and lower-casing.
df_exploded['genres'].unique()

array(['crime', 'drama', 'adventure', 'biography', 'thriller', 'sci-fi',
       'western', 'action', 'war', 'horror', 'music', 'musical',
       'mystery', 'romance', 'film-noir', 'comedy', 'family', 'fantasy',
       'sport', 'animation', 'history', 'documentary'], dtype=object)

In [9]:
# Discard the raw 'release_year' and 'genre' columns (their cleaned versions
# live in 'clean_year' and 'genres') together with 'duration' and 'votes'.
df = df.drop(labels=['release_year', 'genre', 'duration', 'votes'], axis='columns')

In [10]:
# Give the cleaned numeric year the canonical 'release_year' name (the raw
# column of that name was dropped in the previous cell). Reassigning the
# result, rather than using inplace=True, matches how the drop is written.
df = df.rename(columns={'clean_year': 'release_year'})


In [24]:
# Spot-check ten random rows of the cleaned frame. The fixed random_state
# makes the displayed sample identical on every Restart & Run All.
df.sample(10, random_state=0)

Unnamed: 0,movie_name,imdb_rating,metascore,director,cast,gross,release_year,genres
293,Beetlejuice,7.5,71.0,Tim Burton,Alec Baldwin,$73.71M,1988,"[Comedy, Fantasy]"
1592,Meet the Robinsons,6.8,61.0,Stephen J. Anderson,Daniel Hansen,$97.82M,2007,"[Animation, Adventure, Comedy]"
1773,Eden Lake,6.7,65.0,James Watkins,Kelly Reilly,$0.01M,2008,"[Horror, Thriller]"
1822,Transformers: Revenge of the Fallen,6.0,35.0,Michael Bay,Shia LaBeouf,$402.11M,2009,"[Action, Adventure, Sci-Fi]"
377,Once Upon a Time in America,8.3,75.0,Sergio Leone,Robert De Niro,$5.32M,1984,"[Crime, Drama]"
1904,Ghosts of Girlfriends Past,5.8,34.0,Mark Waters,Matthew McConaughey,$55.25M,2009,"[Comedy, Fantasy, Romance]"
1584,The Holiday,6.9,52.0,Nancy Meyers,Kate Winslet,$63.22M,2006,"[Comedy, Romance]"
121,Goldfinger,7.7,87.0,Guy Hamilton,Sean Connery,$51.08M,1964,"[Action, Adventure, Thriller]"
669,Philadelphia,7.7,66.0,Jonathan Demme,Tom Hanks,$77.32M,1993,[Drama]
979,Coyote Ugly,5.7,27.0,David McNally,Piper Perabo,$60.79M,2000,"[Comedy, Drama, Music]"


In [12]:
# Persist the cleaned table without the positional index.
# NOTE(review): 'genres' holds Python lists, which CSV stores as their string
# representation — confirm that downstream readers parse it back as intended.
df.to_csv(path_or_buf='top_movies_cleaned.csv', index=False)

In [25]:
# Load the second dataset (movies with their cast lists) from CSV.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column, which
# suggests the file was saved together with its index — consider
# index_col=0 if that column is not needed.
df2 = pd.read_csv(filepath_or_buffer='top_movies_cast.csv')

In [26]:
# Preview the first ten rows of the second dataset.
df2.head(10)

Unnamed: 0.1,Unnamed: 0,Moive Name,Rating,Votes,Meta Score,Genre,PG Rating,Year,Duration,Cast,Director
0,0,Leave the World Behind,6.5,90000.0,67.0,"Drama, Mystery, Thriller",R,2023,2h 18m,"Julia Roberts, Mahershala Ali, Ethan Hawke, My...",Sam Esmail
1,1,Wonka,7.4,24000.0,66.0,"Adventure, Comedy, Family",PG,2023,1h 56m,"Timothée Chalamet, Gustave Die, Murray McArthu...",Paul King
2,2,Poor Things,8.5,6700.0,86.0,"Comedy, Drama, Romance",R,2023,2h 21m,"Emma Stone, Mark Ruffalo, Willem Dafoe, Ramy Y...",Yorgos Lanthimos
3,3,Killers of the Flower Moon,7.8,128000.0,89.0,"Crime, Drama, History",R,2023,3h 26m,"Leonardo DiCaprio, Robert De Niro, Lily Gladst...",Martin Scorsese
4,4,May December,7.0,21000.0,85.0,"Comedy, Drama",R,2023,1h 57m,"Natalie Portman, Chris Tenzis, Charles Melton,...",Todd Haynes
5,5,The Hunger Games: The Ballad of Songbirds & S...,7.1,56000.0,54.0,"Action, Adventure, Drama",PG-13,2023,2h 37m,"Rachel Zegler, Tom Blyth, Viola Davis, Dexter ...",Francis Lawrence
6,6,Napoleon,6.6,66000.0,64.0,"Action, Adventure, Biography",R,2023,2h 38m,"Joaquin Phoenix, Vanessa Kirby, Tahar Rahim, R...",Ridley Scott
7,7,Oppenheimer,8.4,553000.0,89.0,"Biography, Drama, History",R,2023,3h,"Cillian Murphy, Emily Blunt, Matt Damon, Rober...",Christopher Nolan
8,8,Love Actually,7.6,517000.0,55.0,"Comedy, Drama, Romance",R,2003,2h 15m,"Hugh Grant, Martine McCutcheon, Liam Neeson, L...",Richard Curtis
9,9,Candy Cane Lane,5.6,13000.0,47.0,"Comedy, Family, Fantasy",PG,2023,1h 57m,"Eddie Murphy, Tracee Ellis Ross, Jillian Bell,...",Reginald Hudlin


In [27]:
# List the column labels of the second dataset.
df2.columns

Index(['Unnamed: 0', 'Moive Name', 'Rating', 'Votes', 'Meta Score', 'Genre',
       'PG Rating', 'Year', 'Duration', 'Cast', 'Director'],
      dtype='object')