In [1]:
import pandas as pd
import numpy as np

# Actividad 1: Limpieza de Datos
Este notebook realiza la limpieza del dataset IMDB 5000 Movie Dataset como parte del proceso de minería de datos.

## Objetivos:
- Eliminar duplicados
- Detectar y tratar valores nulos
- Corregir tipos de datos
- Preparar el archivo limpio para análisis posteriores


In [8]:
# Carga del dataset
df = pd.read_csv('../movie_metadata.csv')  # Ajusta ruta si lo mueves
df.head()


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [9]:
# Información básica
df.info()

# Verificar duplicados
print("Duplicados:", df.duplicated().sum())

# Valores nulos por columna
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      14
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

## Tratamiento de valores nulos
- Se rellenará la columna `duration` con su mediana.
- Se eliminarán columnas con demasiados valores nulos o que no aportan al análisis.


In [10]:
# Eliminar columnas poco útiles o con muchos nulos
df.drop(columns=['color', 'aspect_ratio', 'plot_keywords'], inplace=True)

# Eliminar filas con datos críticos faltantes
df.dropna(subset=['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name'], inplace=True)

# Rellenar columnas numéricas con mediana
num_cols_mediana = [
    'num_critic_for_reviews', 'duration', 'director_facebook_likes',
    'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes',
    'facenumber_in_poster', 'num_user_for_reviews'
]
for col in num_cols_mediana:
    df[col].fillna(df[col].median(), inplace=True)

# Rellenar columnas categóricas con moda
cat_cols_moda = ['language', 'country', 'content_rating']
for col in cat_cols_moda:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Convertir título año a datetime
df['title_year'] = pd.to_datetime(df['title_year'], format='%Y', errors='coerce')

# Verificar resultado
print(df.isnull().sum())


director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                        776
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                 0
budget                       391
title_year                     2
actor_2_facebook_likes         0
imdb_score                     0
movie_facebook_likes           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [11]:
# Verificamos resultados finales
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
Index: 4919 entries, 0 to 5042
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   director_name              4919 non-null   object        
 1   num_critic_for_reviews     4919 non-null   float64       
 2   duration                   4919 non-null   float64       
 3   director_facebook_likes    4919 non-null   float64       
 4   actor_3_facebook_likes     4919 non-null   float64       
 5   actor_2_name               4919 non-null   object        
 6   actor_1_facebook_likes     4919 non-null   float64       
 7   gross                      4143 non-null   float64       
 8   genres                     4919 non-null   object        
 9   actor_1_name               4919 non-null   object        
 10  movie_title                4919 non-null   object        
 11  num_voted_users            4919 non-null   int64         
 12  cast_total_

director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                        776
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                 0
budget                       391
title_year                     2
actor_2_facebook_likes         0
imdb_score                     0
movie_facebook_likes           0
dtype: int64

In [None]:
# Guardar el dataset limpio
df.to_csv('dataset_limpio.csv', index=False)
print("Dataset limpio guardado como 'dataset_limpio.csv'")


✅ Dataset limpio guardado como 'dataset_limpio.csv'
