# Converter CSV para SQL
Esse notebook lê os arquivos CSV e escreve um arquivo SQL que insere os dados no banco

In [2]:
import pandas as pd
import numpy as np

In [3]:
import pickle

In [4]:
from tqdm.auto import tqdm
tqdm.pandas()

In [5]:
!pip install git+https://github.com/cinemagoer/cinemagoer
import imdb
ia = imdb.IMDb()

Collecting git+https://github.com/cinemagoer/cinemagoer
  Cloning https://github.com/cinemagoer/cinemagoer to /tmp/pip-req-build-v0elckuz
  Running command git clone -q https://github.com/cinemagoer/cinemagoer /tmp/pip-req-build-v0elckuz


## 2. Ler arquivos e juntar

Vamos ler os 4 arquivos em CSV, juntar todos em um único dataframe e remover a coluna 'show_id', pois o id no nosso banco não será o mesmo id do arquivo original

In [6]:
# Read the four platform catalogues.
amazon = pd.read_csv('amazon_prime_titles.csv')
disney = pd.read_csv('disney_plus_titles.csv')
hulu = pd.read_csv('hulu_titles.csv')
netflix = pd.read_csv('netflix_titles.csv')

# Add one boolean flag column per platform.
amazon['amazon'] = True
disney['disney'] = True
netflix['netflix'] = True
hulu['hulu'] = True

# Keep a separate "date added" column for each platform.
amazon = amazon.rename(columns={'date_added': 'date_added_amazon'})
disney = disney.rename(columns={'date_added': 'date_added_disney'})
hulu = hulu.rename(columns={'date_added': 'date_added_hulu'})
netflix = netflix.rename(columns={'date_added': 'date_added_netflix'})

# Stack everything into a single frame (DataFrame.append was removed in pandas 2.0).
shows = pd.concat([amazon, disney, hulu, netflix], ignore_index=True)

# A row only carries the flag of the file it came from; the other platform flags are missing, so set them to False.
shows[['amazon','disney','hulu','netflix']] = shows[['amazon','disney','hulu','netflix']].fillna(False).astype(bool)

# Turn empty strings into NaN. replace() returns a new frame, so the result has to be assigned back.
shows = shows.replace('', np.nan)

# Drop 'show_id': the ids of the source files are not reused.
shows = shows.drop(columns=['show_id'])

In [None]:
# Display the column labels of the combined frame.
shows.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added_amazon',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'amazon', 'date_added_disney', 'disney', 'date_added_hulu', 'hulu',
       'date_added_netflix', 'netflix'],
      dtype='object')

In [None]:
# Display the number of non-missing values in each column.
shows.count()

type                  22998
title                 22998
director              14736
cast                  17677
country               11499
date_added_amazon       155
release_year          22998
rating                22134
duration              22516
listed_in             22998
description           22994
amazon                22998
date_added_disney      1447
disney                22998
date_added_hulu        3045
hulu                  22998
date_added_netflix     8797
netflix               22998
dtype: int64

## 3. Limpeza de dados 

In [7]:
def createSet(series):
    """Collect the distinct items found in a Series of ', '-separated strings.

    Missing values are ignored. Empty items and items whose last character
    is a comma are left out of the result.
    """
    return {
        item
        for joined in series.dropna().unique()
        for item in joined.split(', ')
        if item != '' and item[-1] != ','
    }

Conserta a duração de alguns shows que estão na coluna rating

In [8]:
# Some rows carry the duration ("... min" / "... Season") in the rating column.
# Build the mask once; na=False makes rows without a rating count as "no match".
misplacedDuration = shows['rating'].str.contains(r"min|Season", na=False)

# Move the value to the duration column and blank out the rating.
shows.loc[misplacedDuration, 'duration'] = shows.loc[misplacedDuration, 'rating']
shows.loc[misplacedDuration, 'rating'] = np.nan

Junta os shows duplicados

In [9]:
# Row labels of every (title, type, release_year) group that contains more than one row.
duplicatedShows = list(filter(
    lambda rowLabels: len(rowLabels) > 1,
    shows.groupby(['title','type','release_year']).groups.values(),
))

In [10]:
def mergeRow(series):
    """Merge the ', '-separated cells of `series` into one ', '-separated string of distinct items.

    The items are sorted so the merged string is the same on every run;
    joining the set directly would follow set iteration order, which for
    strings can differ between interpreter sessions.
    Returns an empty string when `series` holds no items.
    """
    return ', '.join(sorted(createSet(series)))

def dateAdded(dates):
    """Return the first non-missing value of `dates`, or NaN when every value is missing.

    An empty Series also yields NaN.
    """
    present = dates.dropna()
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan if present.empty else present.iloc[0]

# One list per output column; the key order is the column order of the merged rows.
mergedShows = {column: [] for column in (
    'type', 'title', 'release_year',
    'director', 'cast', 'country', 'listed_in',
    'amazon', 'disney', 'hulu', 'netflix',
    'date_added_amazon', 'date_added_disney', 'date_added_hulu', 'date_added_netflix',
    'duration', 'rating', 'description',
)}
duplicatedIds = []

i = 0
n = len(duplicatedShows)

for showsId in duplicatedShows:
    duplicatedIds.extend(showsId)
    duplicatedPair = shows.loc[showsId]

    # The grouping keys are equal across the group, so the first row is representative.
    for column in ('type', 'title', 'release_year'):
        mergedShows[column].append(duplicatedPair[column].iloc[0])

    # List-like text columns: union of the items of every row.
    for column in ('director', 'cast', 'country', 'listed_in'):
        mergedShows[column].append(mergeRow(duplicatedPair[column]))

    # A merged show is on a platform if any of its rows is.
    for column in ('amazon', 'disney', 'hulu', 'netflix'):
        mergedShows[column].append(duplicatedPair[column].any())

    # First known inclusion date per platform.
    for column in ('date_added_amazon', 'date_added_disney', 'date_added_hulu', 'date_added_netflix'):
        mergedShows[column].append(dateAdded(duplicatedPair[column]))

    # Most frequent value per column. The whole-frame mode is computed once per group
    # (it was previously recomputed for each of the three columns).
    groupModes = duplicatedPair.mode()
    for column in ('duration', 'rating', 'description'):
        mergedShows[column].append(groupModes[column].iloc[0])

    i += 1
    print('{:03.2f}%\r'.format(100*i/n),end='')

# Replace the duplicated rows with their merged versions (DataFrame.append was removed in pandas 2.0).
shows = pd.concat(
    [shows.drop(duplicatedIds), pd.DataFrame.from_dict(mergedShows)],
    ignore_index=True,
)



In [11]:
def replaceQuotes(setWithQuotes):
    """Return a list holding each string of `setWithQuotes` with every single quote doubled."""
    escaped = []
    for text in setWithQuotes:
        escaped.append(text.replace("'", "''"))
    return escaped

# One lookup frame per entity, with names escaped for SQL.
# The names are sorted first: the row order (and therefore the index, which is used as the id)
# would otherwise follow set iteration order, which can change from one kernel session to the next.
countries = pd.DataFrame(replaceQuotes(sorted(createSet(shows['country']))), columns=['Name'])
genre = pd.DataFrame(replaceQuotes(sorted(createSet(shows['listed_in']))), columns=['Name'])
actors = pd.DataFrame(replaceQuotes(sorted(createSet(shows['cast']))), columns=['Name'])
directors = pd.DataFrame(replaceQuotes(sorted(createSet(shows['director']))), columns=['Name'])

In [27]:
# Counters for manual progress reporting.
count = 0
total = len(actors) + len(directors)

# Cache of photo lookups keyed by person name, persisted in 'peoplePhoto.pickle'.
# NOTE(review): pickle.load can execute arbitrary code — only load a cache file you created yourself.
peoplePhoto = {}
try:
    with open('peoplePhoto.pickle','rb') as f:
        peoplePhoto = pickle.load(f)
except FileNotFoundError:
    # No cache file yet (first run): start with an empty cache instead of failing.
    pass

def getActorFileName(name):
    """Return the 'full-size headshot' value of the first IMDb search hit for `name`.

    Results (including failures, stored as NaN) are memoised in the
    module-level `peoplePhoto` dict, and the whole dict is written to
    'peoplePhoto.pickle' after every new lookup.

    Parameters:
        name: person name; also used as the cache key.

    Returns:
        The cached or freshly fetched headshot value, or NaN when the search
        returned nothing, the hit has no headshot, or the lookup raised.
    """
    if name in peoplePhoto:
        return peoplePhoto[name]

    try:
        # The names in this notebook have single quotes doubled for SQL; undo that for the search only.
        person = ia.search_person(name.replace("''", "'"))[0]
        photoBlob = person['full-size headshot']
    except Exception:
        # Best effort: any lookup failure means "no photo". Unlike a bare `except:`,
        # this lets KeyboardInterrupt and SystemExit propagate.
        photoBlob = np.nan

    peoplePhoto[name] = photoBlob
    # Persist after every lookup so an interrupted run keeps what was already fetched.
    with open('peoplePhoto.pickle','wb') as f:
        pickle.dump(peoplePhoto, f)

    return photoBlob
# Look up a photo for every actor and director; progress_apply shows a tqdm progress bar.
actors['photo'] = actors['Name'].progress_apply(getActorFileName)
directors['photo'] = directors['Name'].progress_apply(getActorFileName)

  0%|          | 0/62534 [00:00<?, ?it/s]

  0%|          | 0/10897 [00:00<?, ?it/s]

In [41]:
# Treat this URL as "no photo" (presumably a placeholder value — confirm against the fetched data).
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
actors = actors.replace('https://m.media-amazon.png', np.nan)
directors = directors.replace('https://m.media-amazon.png', np.nan)

In [42]:
# keep_default_na=False stops pandas from parsing literal strings such as "NA" as missing values;
# na_values=[''] keeps treating empty cells as NaN.
# NOTE(review): presumably Namibia's code is "NA" in countries.csv, which the default parsing would
# turn into NaN — confirm against the file.
countryCodeCSV = pd.read_csv('countries.csv', keep_default_na=False, na_values=[''])

# Map the long-form names of the CSV to the short names used in the show catalogues.
countryNameFixes = {
    'Holy See (Vatican City State)': 'Vatican City',
    'Iran, Islamic Republic of': 'Iran',
    'Tanzania, United Republic of': 'Tanzania',
    'Palestine, State of': 'Palestine',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Syrian Arab Republic': 'Syria',
    'Taiwan, Province of China': 'Taiwan',
    'Russian Federation': 'Russia',
    'Korea, Republic of': 'South Korea',
    'Viet Nam': 'Vietnam',
}
countryCodeCSV = countryCodeCSV.replace(countryNameFixes)

# Attach the CSV columns to each country name.
countries = countries.join(countryCodeCSV.set_index('Name'), on='Name')

### Criação das tabelas

In [43]:
# Column-oriented containers, one per database table: each key is a column and each list holds
# that column's values, row by row.

# Filled row by row while iterating over the shows.
Exhibit = {
  'id':[],
  'name':[],
  'release_year':[],
  'parental_rating':[],
  'description':[]
}

Country = {
  'country_code':[],
  'country_name':[],
  'id_exhibit':[]
}

Series = {
  'num_seasons':[],
  'id_exhibit':[]
}

Movie = {
  'runtime':[],
  'id_exhibit':[]
}

# Actor, Genre and Director are taken straight from their lookup frames: the frame index is the id
# that the relation tables (Acting, GenreCategorization, Directing) refer to. They were previously
# left empty, and nothing else fills them.
Actor = {
  'id':actors.index.tolist(),
  'name':actors['Name'].tolist(),
  'profile_photo':actors['photo'].tolist()
}

Genre = {
  'id':genre.index.tolist(),
  'name':genre['Name'].tolist()
}

Director = {
  'id':directors.index.tolist(),
  'name':directors['Name'].tolist(),
  'profile_photo':directors['photo'].tolist()
}

# Fixed platform ids.
Platform = {
  'id':[1,2,3,4],
  'name':['Amazon Prime', 'Netflix', 'Disney+', 'Hulu']
}

# Relation tables, filled row by row while iterating over the shows.
PlatformComposition = {
  'id_platform':[],
  'inclusion_date':[],
  'id_exhibit':[]
}

Directing = {
  'id_exhibit':[],
  'id_director':[]
}

GenreCategorization = {
  'id_exhibit':[],
  'id_genre':[]
}

Acting = {
  'id_exhibit':[],
  'id_actor':[]
}

In [44]:
def _sqlValueOrNone(value):
  """Prepare a cell for the table dicts: strings get single quotes doubled, missing values become None."""
  if isinstance(value, str):
    return value.replace("'", "''")
  return None if pd.isna(value) else value


def _splitItems(joined):
  """Split a ', '-separated cell into items, skipping empty items and items ending in a comma.

  Anything that is not a string (e.g. NaN) yields an empty list.
  """
  if not isinstance(joined, str):
    return []
  return [item for item in joined.split(', ') if item != '' and item[-1] != ',']


def _rowIdByName(frame, name):
  """Return the index label of the row of `frame` whose 'Name' matches `name`, or None if there is none.

  The lookup frames store names with single quotes doubled, so `name` is escaped
  the same way before comparing.
  """
  matches = frame.index[frame['Name'] == name.replace("'", "''")]
  return matches[0] if len(matches) else None


def populated_dict(show):
  """Append the data of one show to the module-level table dictionaries.

  Meant to be used with `shows.apply(..., axis=1)`: `show` is one row of
  `shows` as a Series. Only scalars are appended (ids, strings, numbers or
  None), and text is stored with single quotes doubled.

  Side effects: appends to Exhibit, Country, Series, Movie,
  PlatformComposition, Directing, GenreCategorization and Acting.
  Returns None.
  """
  # With apply(axis=1) the row label is show.name; show.index would be the column labels.
  exhibitId = show.name

  Exhibit['id'].append(exhibitId)
  Exhibit['name'].append(_sqlValueOrNone(show['title']))
  Exhibit['release_year'].append(_sqlValueOrNone(show['release_year']))
  Exhibit['parental_rating'].append(_sqlValueOrNone(show['rating']))
  Exhibit['description'].append(_sqlValueOrNone(show['description']))

  for country in _splitItems(show['country']):
    countryId = _rowIdByName(countries, country)
    if countryId is None:
      continue
    Country['country_code'].append(_sqlValueOrNone(countries.at[countryId, 'Code']))
    # Already escaped: it comes from the lookup frame.
    Country['country_name'].append(countries.at[countryId, 'Name'])
    Country['id_exhibit'].append(exhibitId)

  duration = show['duration']
  if isinstance(duration, str) and duration:
    # The leading number is the season count when 'Season' appears in the text, the runtime otherwise.
    durationValue = int(duration.split()[0])
    if 'Season' in duration:
      Series['num_seasons'].append(durationValue)
      Series['id_exhibit'].append(exhibitId)
    else:
      Movie['runtime'].append(durationValue)
      Movie['id_exhibit'].append(exhibitId)

  # Platform ids follow the Platform table: 1 Amazon Prime, 2 Netflix, 3 Disney+, 4 Hulu.
  for platformId, platform in ((1, 'amazon'), (2, 'netflix'), (3, 'disney'), (4, 'hulu')):
    if show[platform] == 1:
      PlatformComposition['id_platform'].append(platformId)
      PlatformComposition['inclusion_date'].append(_sqlValueOrNone(show['date_added_' + platform]))
      PlatformComposition['id_exhibit'].append(exhibitId)

  # Many-to-many relations: (target dict, id column, lookup frame, source column).
  relations = (
    (Directing, 'id_director', directors, 'director'),
    (GenreCategorization, 'id_genre', genre, 'listed_in'),
    (Acting, 'id_actor', actors, 'cast'),
  )
  for relation, idColumn, lookupFrame, sourceColumn in relations:
    for itemName in _splitItems(show[sourceColumn]):
      itemId = _rowIdByName(lookupFrame, itemName)
      if itemId is None:
        # No matching row in the lookup frame: skip the link rather than store a dangling id.
        continue
      relation[idColumn].append(itemId)
      relation['id_exhibit'].append(exhibitId)

# Call populated_dict once per row (axis=1), with a tqdm progress bar.
# populated_dict returns nothing, so the displayed Series of None is only a by-product.
# NOTE: the function appends to the module-level table dicts, so re-running this cell appends every row again.
shows.progress_apply(populated_dict, axis=1)

  0%|          | 0/22608 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
22603    None
22604    None
22605    None
22606    None
22607    None
Length: 22608, dtype: object

In [45]:
def insert_dict(data):
  """Render a dict of column lists as the tuple list of an SQL VALUES clause.

  Parameters:
    data: dict mapping column name to a list of values; row i is made of
      the i-th element of every list, in key order.

  Returns:
    A string such as "(1,'a'),(2,'b')", or "" when there are no rows.

  Strings are wrapped in single quotes but NOT escaped — they must already
  have their single quotes doubled. None and NaN are rendered as NULL.
  """
  def literal(value):
    # NaN is the only float that differs from itself.
    if value is None or (isinstance(value, float) and value != value):
      return 'NULL'
    if isinstance(value, str):
      return f"'{value}'"
    return f"{value}"

  columns = list(data.values())
  if not columns:
    return ""
  # zip(*columns) walks the lists row by row and covers every row, including the last one.
  return ','.join(
    '(' + ','.join(literal(value) for value in row) + ')'
    for row in zip(*columns)
  )

In [None]:
def insert_df(data):
  """Render the rows of a DataFrame as the tuple list of an SQL VALUES clause.

  Parameters:
    data: DataFrame; every column is emitted in column order, the index is not.

  Returns:
    A string such as "(1,'a'),(2,'b')", or "" when the frame has no rows.

  Strings are wrapped in single quotes but NOT escaped — they must already
  have their single quotes doubled. Missing values are rendered as NULL.
  """
  def literal(value):
    isMissing = (
      value is None or value is pd.NA or value is pd.NaT
      or (isinstance(value, float) and value != value)  # NaN differs from itself
    )
    if isMissing:
      return 'NULL'
    if isinstance(value, str):
      return f"'{value}'"
    return f"{value}"

  # itertuples(index=False, name=None) yields one plain tuple of cell values per row.
  return ','.join(
    '(' + ','.join(literal(value) for value in row) + ')'
    for row in data.itertuples(index=False, name=None)
  )

In [46]:
# (table name, column dict) pairs, in the original write order.
# NOTE(review): the first table name is spelled 'Plataform' while the dict is `Platform` —
# left unchanged because the schema is not visible here; confirm the real table name.
dumpTables = [
    ('Plataform', Platform),
    ('Exhibit', Exhibit),
    ('Country', Country),
    ('Series', Series),
    ('Movie', Movie),
    ('Actor', Actor),
    ('Genre', Genre),
    ('Director', Director),
    ('PlatformComposition', PlatformComposition),
    ('Directing', Directing),
    ('GenreCategorization', GenreCategorization),
    ('Acting', Acting),
]

# Explicit UTF-8 so names with non-ASCII characters are written the same on every platform.
with open('dump.sql','w', encoding='utf-8') as f:
    for tableName, tableData in dumpTables:
        # Render the rows as SQL tuples; interpolating the dict itself would write a Python repr, not SQL.
        values = insert_dict(tableData)
        if not values:
            # "insert into T values ;" is not valid SQL, so skip tables without rows.
            continue
        f.write(f"insert into {tableName} values {values};\n")