In [None]:
!pip install ipython-sql

In [1]:
import os
import pandas as pd
import numpy as np
import ast

from tqdm import tqdm
from sqlalchemy import create_engine

from pprint import pprint


In [2]:
# create engine
# The connection string can be overridden through the MOCOMMENDER_DB_URI
# environment variable so real credentials never have to be committed; the
# fallback keeps the previous local-development default.
CONNECTION_URI = os.environ.get(
    "MOCOMMENDER_DB_URI",
    "postgresql://postgres:postgres@localhost:5432/mocommender",
)
engine = create_engine(CONNECTION_URI)

# Fail fast if the database is unreachable, but release the connection
# immediately instead of leaving it open for the lifetime of the kernel.
with engine.connect():
    pass

<sqlalchemy.engine.base.Connection at 0x7fc84cd65810>

In [62]:
# Load the `sql` IPython extension (provides the %sql / %%sql magics)
%load_ext sql
# Hand the connection string stored in CONNECTION_URI to the %sql magic
%sql $CONNECTION_URI

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [65]:
%%sql

UsageError: %%sql is a cell magic, but the cell body is empty. Did you mean the line magic %sql (single %)?


---

In [3]:
# Directory (relative to the working directory) that holds the exported files
EXPORT_DIR = "./export"

In [63]:
# Read every entry found in EXPORT_DIR as parquet; each resulting DataFrame
# is stored in `data` under the entry's name exactly as os.listdir reports it.
exported_data = os.listdir(EXPORT_DIR)
data = {
    entry: pd.read_parquet(EXPORT_DIR + '/' + entry)
    for entry in exported_data
}

In [39]:
# For each loaded frame, ask pandas for the schema (via get_schema) it would
# use on `engine`, with the table named after the frame's key in `data`.
schemas = {
    entry: pd.io.sql.get_schema(data[entry], name=entry, con=engine)
    for entry in exported_data
}

# METADATA

In [79]:
# Movie metadata without the 'original_language' column; drop() returns a
# new frame, so the one stored in `data` is not modified.
metadata = data['metadata'].drop(columns=['original_language'])
metadata

Unnamed: 0,id,imdb_id,overview,popularity,release_date,runtime,title,vote_average,vote_count
0,3,tt0092149,"An episode in the life of Nikander, a garbage ...",2.292110,10-16-1986,76.0,Shadows in Paradise,7.1,35
1,5,tt0113101,It's Ted the Bellhop's first night on the job....,9.026586,12-09-1995,98.0,Four Rooms,6.5,539
2,6,tt0107286,"While racing to a boxing match, Frank, Mike, J...",5.538671,10-15-1993,110.0,Judgment Night,6.4,79
3,12,tt0266543,"Nemo, an adventurous young clownfish, is unexp...",25.497794,05-30-2003,100.0,Finding Nemo,7.6,6292
4,13,tt0109830,A man with a low IQ has accomplished great thi...,48.307194,07-06-1994,142.0,Forrest Gump,8.2,8147
...,...,...,...,...,...,...,...,...,...
28996,460870,tt2083695,Professor Jim Al-Khalili tells the electrifyin...,0.007123,10-06-2011,180.0,Shock and Awe: The Story of Electricity,0.0,0
28997,461297,tt6212210,In this family-friendly action reboot of the 1...,1.248353,08-01-2017,97.0,Cop and a Half: New Recruit,6.0,1
28998,461615,tt4717402,Angelino is just one of thousands of deadbeats...,0.249393,06-13-2017,90.0,Mutafukaz,7.5,2
28999,461805,tt6840134,"Academy Award-winning filmmaker, Oliver Stone ...",0.642527,06-12-2017,240.0,The Putin Interviews,8.1,8


In [80]:
# Insert the rows of `metadata` into the "metadata" table through `engine`.
# if_exists="append" adds to an existing table; index=False leaves the
# DataFrame index out of the written columns.
metadata.to_sql(con=engine, name="metadata", if_exists="append", index=False)

1

# GENRE

In [64]:
# Work on a copy so the frame stored under data['genre'] stays untouched
genre = data['genre'].copy()

In [66]:
# Insert the rows of `genre` into the "genre" table (appending to an
# existing table, DataFrame index not written).
genre.to_sql(con=engine, name="genre", if_exists="append", index=False)

20

## MOVIE & GENRE

In [81]:
# One row per (movie, genre): explode the 'genres' lists, let pandas pick the
# best dtypes, then switch to the join-table column names.
movie_genre = (
    data['movie_genre']
    .explode('genres')
    .convert_dtypes()
    .rename(columns={"id": "metadata_id", "genres": "genre_id"})
)

In [82]:
# Insert the exploded movie/genre pairs into the "metadata_genre" table
# (appending to an existing table, DataFrame index not written).
movie_genre.to_sql(con=engine, name="metadata_genre", if_exists="append", index=False)

263

# CAST

In [67]:
# Work on a copy so the frame stored under data['cast'] stays untouched
cast = data['cast'].copy()

In [68]:
# Build the renamed frame from the untouched export in `data` instead of
# renaming `cast` in place. The in-place rename was not idempotent: on a
# second run the freshly created 'id' column (formerly 'credit_id') would be
# renamed to 'actor_id' again, leaving duplicate 'actor_id' columns and no
# 'id' -- which is why the cell used to need commenting out after one run.
cast = data['cast'].rename(columns={'id':'actor_id', 'credit_id': 'id', 'order': 'cast_order', 'character': 'cast_character'})
# NOTE: if_exists="append" still inserts the rows again on every execution.
cast.to_sql(con=engine, name="mcast", if_exists="append", index=False)

883

## MOVIE & CAST

In [83]:
def convert(x):
  """Split a bracketed, comma-separated string into a list of stripped items.

  Args:
    x: String whose first and last characters are delimiters, e.g. "[a, b]";
      both are dropped before the remainder is split on ",".

  Returns:
    The whitespace-stripped items in their original order. Blank items are
    skipped, so an empty list such as "[]" yields [] rather than [""].
  """
  # "".split(",") returns [""], so without the filter an empty list would
  # produce a bogus empty-string id once the column is exploded.
  stripped = (part.strip() for part in x[1:-1].split(','))
  return [item for item in stripped if item]

# Switch to the join-table column names, turn each 'cast_id' string into a
# list with convert(), then explode to one row per (movie, cast id).
movie_cast = data['movie_cast'].rename(columns={"id": "metadata_id", "cast": "cast_id"})
movie_cast = movie_cast.assign(
    cast_id=movie_cast['cast_id'].apply(convert)
).explode('cast_id')
movie_cast

Unnamed: 0,metadata_id,cast_id
0,8844,52fe44bfc3a36847f80a7c73
0,8844,52fe44bfc3a36847f80a7c99
0,8844,52fe44bfc3a36847f80a7c77
0,8844,52fe44c0c3a36847f80a7ce7
0,8844,52fe44bfc3a36847f80a7c9d
...,...,...
17322,67758,57ed0e90c3a3687dcf005d12
17322,67758,57ed0f43c3a3683a85005278
17322,67758,57ed0f4ec3a36839f700532b
17322,67758,57ed0f5ac3a36839f7005330


In [85]:
# Write the movie/cast pairs to the "metadata_cast" table. With
# if_exists="replace" an existing table is dropped and recreated from this
# frame; the DataFrame index is not written.
movie_cast.to_sql(con=engine, name="metadata_cast", if_exists="replace", index=False)

13

# CREW

In [69]:
# Work on a copy so the frame stored under data['crew'] stays untouched
crew = data['crew'].copy()

In [70]:
# Build the renamed frame from the untouched export in `data` instead of
# renaming `crew` in place. The in-place rename was not idempotent: on a
# second run the new 'id' column (formerly 'credit_id') would itself be
# renamed to 'employee_id', leaving duplicate columns and no 'id'.
crew = data['crew'].rename(columns={'id':'employee_id', 'credit_id': 'id' })
# NOTE: if_exists="append" still inserts the rows again on every execution.
crew.to_sql(con=engine, name="crew", if_exists="append", index=False)

746

## MOVIE & CREW

In [86]:
# Switch to the join-table column names and explode 'crew_id' so each
# (movie, crew id) pair gets its own row.
movie_crew = (
    data['movie_crew']
    .rename(columns={"id": "metadata_id", "crew": "crew_id"})
    .explode('crew_id')
)
movie_crew

Unnamed: 0,metadata_id,crew_id
0,8844,52fe44bfc3a36847f80a7cd1
0,8844,52fe44bfc3a36847f80a7c89
0,8844,52fe44bfc3a36847f80a7cdd
0,8844,52fe44bfc3a36847f80a7c7d
0,8844,52fe44bfc3a36847f80a7cd7
...,...,...
17624,67758,52fe4776c3a368484e0c8387
17624,67758,52fe4776c3a368484e0c838d
17624,67758,52fe4776c3a368484e0c8393
17624,67758,52fe4776c3a368484e0c8399


In [87]:
# Write the movie/crew pairs to the "metadata_crew" table, replacing any
# existing table of that name; the DataFrame index is not written.
movie_crew.to_sql(con=engine, name="metadata_crew", if_exists="replace", index=False)

982

# KEYWORD

In [None]:
# Reference (not a copy) to the frame loaded under the 'keyword' key
keyword = data['keyword']

In [72]:
# Insert the rows of `keyword` into the "keyword" table (appending to an
# existing table, DataFrame index not written).
keyword.to_sql(con=engine, name="keyword", if_exists="append", index=False)

828

## MOVIE & KEYWORD

In [88]:
# Switch to the join-table column names, explode 'keyword_id' to one row per
# (movie, keyword) pair, then let pandas pick the best dtypes.
movie_keyword = (
    data['movie_keyword']
    .rename(columns={"id": "metadata_id", "keywords": "keyword_id"})
    .explode('keyword_id')
    .convert_dtypes()
)

In [90]:
# Write the movie/keyword pairs to the "metadata_keyword" table, replacing
# any existing table of that name; the DataFrame index is not written.
movie_keyword.to_sql(con=engine, name="metadata_keyword", if_exists="replace", index=False)

516

# COMPANY

In [73]:
# Reference (not a copy) to the frame loaded under the 'company' key
company = data['company']

In [74]:
# Insert the rows of `company` into the "company" table (appending to an
# existing table, DataFrame index not written).
company.to_sql(con=engine, name="company", if_exists="append", index=False)

342

## MOVIE & COMPANY

In [91]:
# Switch to the join-table column names, explode 'company_id' to one row per
# (movie, company) pair, then let pandas pick the best dtypes.
movie_company = (
    data['movie_company']
    .rename(columns={"id": "metadata_id", "production_companies": "company_id"})
    .explode('company_id')
    .convert_dtypes()
)

In [92]:
# Write the movie/company pairs to the "metadata_company" table, replacing
# any existing table of that name; the DataFrame index is not written.
movie_company.to_sql(con=engine, name="metadata_company", if_exists="replace", index=False)

299

# LANGUAGE

In [75]:
# Reference (not a copy) to the frame loaded under the 'language' key
language = data['language']

In [76]:
# Insert the rows of `language` into the "language" table (appending to an
# existing table, DataFrame index not written).
language.to_sql(con=engine, name="language", if_exists="append", index=False)

122

## MOVIE & LANGUAGE

In [72]:
# Display the frame loaded under the 'movie_language' key
data['movie_language']

Unnamed: 0,id,spoken_languages
0,2,"[fi, de]"
1,3,"[en, fi, sv]"
2,5,[en]
3,6,[en]
4,11,[en]
...,...,...
28996,461634,[en]
28997,461805,"[en, ru]"
28998,462788,[en]
28999,463800,"[en, vi]"


In [93]:
# Switch to the join-table column names and explode 'language_id' so each
# (movie, language) pair gets its own row.
movie_language = (
    data['movie_language']
    .rename(columns={"id": "metadata_id", "spoken_languages": "language_id"})
    .explode('language_id')
)

In [94]:
# Write the movie/language pairs to the "metadata_language" table, replacing
# any existing table of that name; the DataFrame index is not written.
movie_language.to_sql(con=engine, name="metadata_language", if_exists="replace", index=False)

928

# COUNTRY

In [77]:
# Reference (not a copy) to the frame loaded under the 'country' key
country = data['country']
# Bare expression so the notebook renders the frame
country

Unnamed: 0,id,name
0,AE,United Arab Emirates
1,AF,Afghanistan
2,AL,Albania
3,AM,Armenia
4,AN,Netherlands Antilles
...,...,...
146,XC,Czechoslovakia
147,XG,East Germany
148,YU,Yugoslavia
149,ZA,South Africa


In [78]:
# Insert the rows of `country` into the "country" table (appending to an
# existing table, DataFrame index not written).
country.to_sql(con=engine, name="country", if_exists="append", index=False)

151

## MOVIE & COUNTRY

In [75]:
# Display the frame loaded under the 'movie_country' key
data['movie_country']

Unnamed: 0,id,production_countries
0,2,[FI]
1,3,[FI]
2,5,[US]
3,6,"[JP, US]"
4,11,[US]
...,...,...
28996,461634,[US]
28997,461805,[US]
28998,462788,[US]
28999,463800,[CA]


In [95]:
# Switch to the join-table column names and explode 'country_id' so each
# (movie, country) pair gets its own row.
movie_country = (
    data['movie_country']
    .rename(columns={"id": "metadata_id", "production_countries": "country_id"})
    .explode('country_id')
)

In [96]:
# Write the movie/country pairs to the "metadata_country" table, replacing
# any existing table of that name; the DataFrame index is not written.
movie_country.to_sql(con=engine, name="metadata_country", if_exists="replace", index=False)

29