In [300]:
%load_ext autoreload
%autoreload 2

# Librerías para tratamiento de datos

import pandas as pd
pd.set_option('display.max_columns', None) # Parámetro que modifica la visualización de los DFs
import numpy as np
import re

# Librería para el acceso a variables y funciones
import sys
sys.path.append("../")
from src import soporte_funciones as sf #Archivo .py donde encontraremos todas nuestras funciones
from src import soporte_variables as sv

# Librería para acceder a funcionalidades del sistema operativo
import os

# Librerías para trabajar con distintos formatos de archivos
import pickle
import json

# Librería para ignorar avisos
import warnings
warnings.filterwarnings("ignore") # Ignora TODOS los avisos

# BeautifulSoup y Selenium
from selenium import webdriver  
from webdriver_manager.chrome import ChromeDriverManager  
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException 
import time
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from time import sleep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Consulta de tipos de contenido

In [14]:

# Ask the MoviesDatabase API (served through RapidAPI) which title types exist.
url = "https://moviesdatabase.p.rapidapi.com/titles/utils/titleTypes"

# The API key is read from the RAPIDAPI_KEY environment variable so that it is
# never stored in the notebook. A missing variable raises KeyError right away.
# NOTE(review): the key that used to be embedded here should be rotated.
headers = {
    "x-rapidapi-key": os.environ["RAPIDAPI_KEY"],
    "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of printing an error payload.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

print(response.json())

{'results': [None, 'movie', 'musicVideo', 'podcastEpisode', 'podcastSeries', 'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie', 'tvPilot', 'tvSeries', 'tvShort', 'tvSpecial', 'video', 'videoGame']}


## Consulta de títulos en API

Hago consulta a la API

In [295]:
# Query the movies API through the project helper. The recorded run took about
# 20 minutes, so the result is pickled further down.
# NOTE(review): the meaning of the argument `1` is not visible in this notebook;
# confirm it in src/soporte_funciones.py.
resultados_peliculas = sf.consulta_peliculas(1)

Procesando géneros: 100%|██████████| 8/8 [20:11<00:00, 151.44s/it]


Compruebo longitud de resultados (tuplas). Nos quedamos con los datos de 1990.

In [296]:
# Number of records returned by the API query.
len(resultados_peliculas)

596

Almaceno resultados

In [330]:
# Persist the raw API results so the slow query does not have to be repeated.
# open() does not create missing directories, so make sure 'datos' exists first.
os.makedirs('datos', exist_ok=True)
with open('datos/resultados_peliculas.pkl', 'wb') as f:
    pickle.dump(resultados_peliculas, f)

Creo lista con IDs

In [313]:
# The first field of every API record is its title ID; collect them in order.
lista_ids = [fila[0] for fila in resultados_peliculas]

Confirmo que coinciden

In [315]:
# Should equal len(resultados_peliculas): one ID per API record.
len(lista_ids)

596

## Enriquecimiento de datos con scraping de IMDb

Extraemos la información que buscamos.

In [325]:
# Scrape IMDb for every collected ID through the project helper. The recorded
# run took about 53 minutes for 596 IDs.
# NOTE(review): the structure of the return value is defined in
# src/soporte_funciones.py; later cells treat it as a sequence of 6-field records.
sopas = sf.scrap_imdb(lista_ids)

Cookies aceptadas


100%|██████████| 596/596 [53:23<00:00,  5.37s/it]


Scraping finalizado


No todos los IDs extraídos existen en IMDB, resultando en menos coincidencias.

In [327]:
# Number of records the scraping step returned.
len(sopas)

367

Guardamos el conjunto.

In [328]:
# Persist the scraping results so the slow scrape does not have to be repeated.
# open() does not create missing directories, so make sure 'datos' exists first.
os.makedirs('datos', exist_ok=True)
with open('datos/resultados_scraping.pkl', 'wb') as f:
    pickle.dump(sopas, f)

## Preparamos los datos para la inserción en la BBDD

In [340]:
# Tabulate the scraped records and label their six positional fields.
columnas_scraping = ["id", "calificacion", "director", "guion", "argumento", "duracion"]
df1 = pd.DataFrame(sopas).set_axis(columnas_scraping, axis=1)
df1.head()

Unnamed: 0,id,calificacion,director,guion,argumento,duracion
0,tt0059325,6.5,Jürgen Böttcher,Jürgen BöttcherKlaus Poche,"DDR film from the mid-60s: Li and Al, not long...",1h 40m
1,tt0059900,6.5,Egon Günther,Egon GüntherHelga Schütz,Adam receives a flashlight with special powers...,1h 18m
2,tt0065188,6.5,Zdenek Tyc,Jaromir KacerJirí SoukupZdenek Tyc,,1h 20m
3,tt0068494,5.0,Arch Oboler,Arch Oboler,A young college student and a former G.I. on h...,R
4,tt0075259,5.7,Lindsay Shonteff,Len Deighton,"A British spy is framed, so he must evade the ...",1h 43m
...,...,...,...,...,...,...
362,tt0100705,5.0,Gregory Dark,Gregory DarkJohn Powers,John Miller is an politician wanting to become...,R
363,tt0099216,5.8,Lincoln Kupchak,Lincoln Kupchak,Woman is stalked by her demented boyfriend on ...,3m
364,tt0151585,6.9,Álex de la Iglesia,Jorge GuerricaechevarríaÁlex de la Iglesia,A man comes into a bar in which there's only o...,12m
365,tt1341308,6.4,Fred Gallo,Tracy BaroneElisa Bell,,24m


In [357]:
# Tabulate the API records and label their six positional fields.
columnas_api = ["id", "nombre", "tipo", "anio", "mes", "genero"]
df2 = pd.DataFrame(resultados_peliculas).set_axis(columnas_api, axis=1)
df2.head()

Unnamed: 0,id,nombre,tipo,anio,mes,genero
0,tt0059325,Jahrgang 45,Movie,1990,10,Drama
1,tt0059900,"Wenn du groß bist, lieber Adam",Movie,1990,10,Drama
2,tt0065188,"Vojtech, receny sirotek",Movie,1990,10,Drama
3,tt0068494,Domo Arigato,Movie,1990,8,Drama
4,tt0075259,Spy Story,Movie,1990,3,Drama


Mapeo para crear los IDs del tipo de contenido (Movie → 1, Short → 2) y conversión de la duración a minutos.

In [441]:
def _duracion_a_minutos(duracion):
    """Convert a runtime string such as "1h 40m" or "12m" to whole minutes.

    Returns 0 for anything that is not a runtime: missing values, or strings
    such as "R" that appear in the scraped `duracion` column.
    """
    if not isinstance(duracion, str):
        return 0
    coincidencia = re.fullmatch(r"(?:(\d+)h)?\s*(?:(\d+)m)?", duracion.strip())
    if coincidencia is None:
        return 0
    # An absent hours or minutes part counts as zero.
    horas, minutos = coincidencia.groups(default="0")
    return int(horas) * 60 + int(minutos)


# Inner-join API rows (left) with scraped rows (right) on "id", the only column
# the two frames share.
df_merged = pd.merge(left=df2, right=df1, on="id")

# The cell that builds df_caracteristicas reads a `minutos` column, so it is
# created here; without it a fresh "Restart & Run All" fails with KeyError.
df_merged["minutos"] = df_merged["duracion"].map(_duracion_a_minutos)

# Surrogate key for the content type; any other type maps to NaN.
df_merged["id_tipo"] = df_merged["tipo"].map({"Movie": 1, "Short": 2})
df_merged.head()

Unnamed: 0,id,nombre,tipo,anio,mes,genero,calificacion,director,guion,argumento,duracion,minutos,id_tipo
0,tt0059325,Jahrgang 45,Movie,1990,10,Drama,6.5,Jürgen Böttcher,Jürgen BöttcherKlaus Poche,"DDR film from the mid-60s: Li and Al, not long...",1h 40m,100,1
1,tt0059325,Jahrgang 45,Movie,1990,10,Drama,6.5,Jürgen Böttcher,Jürgen BöttcherKlaus Poche,"DDR film from the mid-60s: Li and Al, not long...",1h 40m,100,1
2,tt0059900,"Wenn du groß bist, lieber Adam",Movie,1990,10,Drama,6.5,Egon Günther,Egon GüntherHelga Schütz,Adam receives a flashlight with special powers...,1h 18m,78,1
3,tt0059900,"Wenn du groß bist, lieber Adam",Movie,1990,10,Drama,6.5,Egon Günther,Egon GüntherHelga Schütz,Adam receives a flashlight with special powers...,1h 18m,78,1
4,tt0065188,"Vojtech, receny sirotek",Movie,1990,10,Drama,6.5,Zdenek Tyc,Jaromir KacerJirí SoukupZdenek Tyc,,1h 20m,80,1


In [442]:
# Remove fully identical rows produced by the merge.
df_merged = df_merged.drop_duplicates()

In [443]:
# Replace every missing value, in every column, with the string "0".
df_merged = df_merged.fillna("0")

In [444]:
# Directors lookup table: one row per distinct director name, keyed by its
# position in order of first appearance.
directores_unicos = df_merged["director"].unique()
df_dict = pd.DataFrame({
    "id_director": range(len(directores_unicos)),
    "nombre": directores_unicos,
})
df_dict.head()

Unnamed: 0,id_director,nombre
0,0,Jürgen Böttcher
1,1,Egon Günther
2,2,Zdenek Tyc
3,3,Arch Oboler
4,4,Lindsay Shonteff


In [445]:
# Content table: attach each title's director key, then keep and rename only
# the columns the table needs. Both frames have a "nombre" column, so the
# title's name comes out of the merge as "nombre_x".
df_contenido = (
    df_merged
    .merge(df_dict, left_on="director", right_on="nombre")
    .loc[:, ["id", "nombre_x", "id_tipo", "id_director"]]
    .rename(columns={"id": "id_contenido", "nombre_x": "nombre"})
)
df_contenido.head()

Unnamed: 0,id_contenido,nombre,id_tipo,id_director
0,tt0059325,Jahrgang 45,1,0
1,tt0059900,"Wenn du groß bist, lieber Adam",1,1
2,tt0065188,"Vojtech, receny sirotek",1,2
3,tt0068494,Domo Arigato,1,3
4,tt0075259,Spy Story,1,4


In [446]:
# Export the content table. to_csv also writes the DataFrame index as an
# unnamed first column by default.
# NOTE(review): pass index=False if the database loader does not expect it.
df_contenido.to_csv("datos/contenido.csv")

In [447]:
# Characteristics table: the per-title attributes, selected from the merged frame.
columnas_caracteristicas = ["id", "guion", "anio", "mes", "minutos", "calificacion", "argumento"]
df_caracteristicas = df_merged.loc[:, columnas_caracteristicas]
df_caracteristicas.head()

Unnamed: 0,id,guion,anio,mes,minutos,calificacion,argumento
0,tt0059325,Jürgen BöttcherKlaus Poche,1990,10,100,6.5,"DDR film from the mid-60s: Li and Al, not long..."
2,tt0059900,Egon GüntherHelga Schütz,1990,10,78,6.5,Adam receives a flashlight with special powers...
4,tt0065188,Jaromir KacerJirí SoukupZdenek Tyc,1990,10,80,6.5,
5,tt0068494,Arch Oboler,1990,8,0,5.0,A young college student and a former G.I. on h...
6,tt0075259,Len Deighton,1990,3,103,5.7,"A British spy is framed, so he must evade the ..."


In [448]:
# Export the characteristics table. The index is written too, and it is
# non-contiguous here because duplicate rows were dropped without resetting it.
# NOTE(review): pass index=False if the database loader does not expect it.
df_caracteristicas.to_csv("datos/caracteristicas.csv")

In [449]:
# Content-type lookup table, using the same codes as the id_tipo mapping.
df_tipo = pd.DataFrame({
    "id_tipo": [1, 2],
    "nombre": ["Movie", "Short"],
})
df_tipo

Unnamed: 0,id_tipo,nombre
0,1,Movie
1,2,Short


In [450]:
# Export the content-type lookup table. The DataFrame index is written as an
# unnamed first column by default.
# NOTE(review): pass index=False if the database loader does not expect it.
df_tipo.to_csv("datos/tipo.csv")

In [451]:
# Preview the directors lookup table before exporting it.
df_dict.head()

Unnamed: 0,id_director,nombre
0,0,Jürgen Böttcher
1,1,Egon Günther
2,2,Zdenek Tyc
3,3,Arch Oboler
4,4,Lindsay Shonteff


In [452]:
# Export the directors lookup table. The DataFrame index is written as an
# unnamed first column by default, which duplicates id_director here.
# NOTE(review): pass index=False if the database loader does not expect it.
df_dict.to_csv("datos/directores.csv")