In [59]:
# Librerías
import sqlite3  # Módulo para conectar con la base de datos
import pandas as pd  # Módulo para trabajar con dataframes

# Open the SQLite database file and keep a cursor for raw SQL statements
conn = sqlite3.connect("data/db_movies")
cur = conn.cursor()

# Ask the schema catalogue (sqlite_master) for the name of every table
table_names = cur.execute(
    "select name from sqlite_master where type='table'"
).fetchall()
print(table_names)

[('ratings',), ('movies',), ('movies_clean',)]


# Preprocesado

In [60]:
# Load the whole `movies` table into a DataFrame and print its structure
movies = pd.read_sql_query("SELECT * FROM movies", conn)
movies.info()

# Per-column count of missing values
print("---------------------------------NULOS---------------------")
null_counts = movies.isnull().sum()
print(null_counts)

# Show every row whose title also appears in another row (keep=False marks all
# members of each duplicate group), sorted so matching titles sit together
print("------------------------------DUPLICADOS---------------------")
shared_title = movies.duplicated(subset=["title"], keep=False)
movies.loc[shared_title].sort_values(by="title")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
---------------------------------NULOS---------------------
movieId    0
title      0
genres     0
dtype: int64
------------------------------DUPLICADOS---------------------


Unnamed: 0,movieId,title,genres
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
650,838,Emma (1996),Comedy|Drama|Romance
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
9135,147002,Eros (2004),Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi


Vemos que hay 5 películas con títulos duplicados, pero sus registros se diferencian en los géneros. Procedemos a buscar en páginas web de películas para verificar los géneros de cada una de ellas y así eliminar los duplicados.

In [61]:
import funciones as fn

# Run the SQL script through the shared cursor
# NOTE(review): presumably this (re)builds the movies_clean table — confirm in preprocessing.sql
fn.ejecutar_sql("preprocessing.sql", cur)

# Query the schema catalogue again; the resulting list is the cell's displayed value
cur.execute("select name from sqlite_master where type='table' ").fetchall()

[('ratings',), ('movies',), ('movies_clean',)]

In [62]:
# Reload `movies`, this time from the movies_clean table, and print its structure
movies = pd.read_sql_query("SELECT * FROM movies_clean", conn)
movies.info()

# Per-column count of missing values
print("---------------------------------NULOS---------------------")
missing_per_column = movies.isnull().sum()
print(missing_per_column)

# Rows whose title is shared with at least one other row, ordered by title;
# an empty frame here means no duplicated titles remain
print("------------------------------DUPLICADOS---------------------")
repeated_title = movies.duplicated(subset=["title"], keep=False)
movies.loc[repeated_title].sort_values(by="title")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9737 entries, 0 to 9736
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9737 non-null   int64 
 1   title    9737 non-null   object
 2   genres   9737 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.3+ KB
---------------------------------NULOS---------------------
movieId    0
title      0
genres     0
dtype: int64
------------------------------DUPLICADOS---------------------


Unnamed: 0,movieId,title,genres


# Unión

In [63]:
# Join movies with their ratings in SQL, using movieId as the key.
# The join reads from movies_clean (the deduplicated table validated earlier in this
# notebook) instead of the raw movies table, so the duplicated titles that were
# cleaned out do not re-enter the merged frame.
# NOTE(review): because this is an INNER JOIN, ratings whose movieId no longer exists
# in movies_clean are dropped — confirm whether preprocessing.sql remaps those ratings
# to the surviving movieId if they must be kept.
query = """
SELECT
    movies_clean.movieId,
    movies_clean.title,
    movies_clean.genres,
    ratings.rating,
    ratings.userId
FROM movies_clean
INNER JOIN ratings ON movies_clean.movieId = ratings.movieId
"""
df = pd.read_sql_query(query, conn)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   title    100836 non-null  object 
 2   genres   100836 non-null  object 
 3   rating   100836 non-null  float64
 4   userId   100836 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 3.8+ MB
