# Converter CSV para SQL
Esse notebook lê os arquivos CSV e escreve um arquivo SQL que insere os dados no banco

In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
import re
import shutil

## 2. Ler arquivos e juntar

Vamos ler os 4 arquivos em CSV, juntar todos em um único dataframe e remover a coluna 'show_id', pois o id no nosso banco não será o mesmo id do arquivo original

In [80]:
# Read the four per-platform catalogues. For each one, rename the generic
# 'date_added' column to a platform-specific name (so the dates survive the
# merge side by side) and add a boolean flag column named after the platform.
amazon = (
    pd.read_csv('amazon_prime_titles.csv')
    .rename(columns={'date_added': 'date_added_amazon'})
    .assign(amazon=True)
)
disney = (
    pd.read_csv('disney_plus_titles.csv')
    .rename(columns={'date_added': 'date_added_disney'})
    .assign(disney=True)
)
hulu = (
    pd.read_csv('hulu_titles.csv')
    .rename(columns={'date_added': 'date_added_hulu'})
    .assign(hulu=True)
)
netflix = (
    pd.read_csv('netflix_titles.csv')
    .rename(columns={'date_added': 'date_added_netflix'})
    .assign(netflix=True)
)

# Stack everything into a single frame. DataFrame.append was removed in
# pandas 2.0; pd.concat yields the same row order and column order.
shows = pd.concat([amazon, disney, hulu, netflix], ignore_index=True)

# Rows coming from other platforms have a missing flag after the concat;
# comparing against True turns "missing" into False and gives real bool columns.
platformFlags = ['amazon', 'disney', 'hulu', 'netflix']
shows[platformFlags] = shows[platformFlags].eq(True)

# Drop the source 'show_id' column (it is not kept in the merged frame).
shows = shows.drop(columns=['show_id'])

In [18]:
# Display the column labels of the merged frame.
shows.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added_amazon',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'amazon', 'date_added_disney', 'disney', 'date_added_hulu', 'hulu',
       'date_added_netflix', 'netflix'],
      dtype='object')

In [19]:
# Display the number of non-null values in each column.
shows.count()

type                  22998
title                 22998
director              14736
cast                  17677
country               11499
date_added_amazon       155
release_year          22998
rating                22134
duration              22516
listed_in             22998
description           22994
amazon                22998
date_added_disney      1447
disney                22998
date_added_hulu        3045
hulu                  22998
date_added_netflix     8797
netflix               22998
dtype: int64

## 3. Limpeza de dados 

In [None]:
def createSet(series):
    """Collect the distinct items of a column that stores ', '-separated lists.

    Parameters
    ----------
    series : pandas.Series
        Column whose non-null values are strings such as ``'A, B, C'``.

    Returns
    -------
    set of str
        Every distinct item found across all non-null values. Empty
        fragments (e.g. coming from an empty string) are ignored.
    """
    seriesSet = set()
    # Iterate over the *input* column; the original iterated over the
    # still-empty result set and raised AttributeError on .dropna().
    for seriesString in series.dropna().unique():
        seriesSet.update(
            element for element in seriesString.split(', ') if element
        )
    return seriesSet

Corrige os shows cuja duração foi registrada, por engano, na coluna `rating`: o valor é movido para `duration` e o `rating` fica vazio.

In [81]:
# Rows whose 'rating' text contains "min" or "Season" are treated as holding a
# duration in the wrong column. Build the mask once; na=False makes missing
# ratings count as "no match".
misplacedDuration = shows['rating'].str.contains(r"min|Season", na=False)

# Move the value into 'duration', then blank the rating.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
shows.loc[misplacedDuration, 'duration'] = shows.loc[misplacedDuration, 'rating']
shows.loc[misplacedDuration, 'rating'] = np.nan

Junta os shows duplicados (mesmo título, tipo e ano de lançamento) em uma única linha

In [82]:
# Group rows by (title, type, release_year) and keep the index labels of every
# group that has more than one row.
groupedLabels = shows.groupby(['title','type','release_year']).groups
duplicatedShows = [labels for labels in groupedLabels.values() if len(labels) > 1]

In [83]:
def mergeRow(series):
    """Merge the ', '-separated lists of several rows into one string.

    Parameters
    ----------
    series : pandas.Series
        Values of one list-like column for a group of duplicated rows.

    Returns
    -------
    str or float
        The distinct items joined with ``', '`` in sorted order, so the
        result does not depend on set iteration order. NaN is returned when
        the group has no items, matching how missing values are stored
        elsewhere in the frame.
    """
    mergedItems = sorted(createSet(series))
    return ', '.join(mergedItems) if mergedItems else np.nan

def dateAdded(dates):
    """Return the first non-null value of ``dates``, or NaN if there is none.

    Parameters
    ----------
    dates : pandas.Series
        Values of one ``date_added_*`` column for a group of duplicated rows.

    Returns
    -------
    object or float
        The first non-null entry in row order; ``np.nan`` when every entry
        is missing or the series is empty.
    """
    knownDates = dates.dropna()
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan if knownDates.empty else knownDates.iloc[0]

def _firstMode(column):
    """Return the first mode of ``column`` (nulls ignored), or NaN if it has none."""
    modes = column.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Columns grouped by how their values are combined when duplicates are merged.
keyColumns = ['type', 'title', 'release_year']         # take the first row's value
listColumns = ['director', 'cast', 'country', 'listed_in']  # union of the lists
platformColumns = ['amazon', 'disney', 'hulu', 'netflix']    # True if any row is True
dateColumns = ['date_added_amazon', 'date_added_disney',
               'date_added_hulu', 'date_added_netflix']      # first non-null value
modeColumns = ['duration', 'rating', 'description']          # most frequent value

# One list per output column; each merged group appends one entry to every list.
mergedShows = {
    column: []
    for column in keyColumns + listColumns + platformColumns + dateColumns + modeColumns
}
duplicatedIds = []

n = len(duplicatedShows)

for i, showsId in enumerate(duplicatedShows, start=1):
    duplicatedIds.extend(showsId)
    duplicatedPair = shows.loc[showsId]

    for column in keyColumns:
        mergedShows[column].append(duplicatedPair[column].iloc[0])
    for column in listColumns:
        mergedShows[column].append(mergeRow(duplicatedPair[column]))
    for column in platformColumns:
        mergedShows[column].append(duplicatedPair[column].any())
    for column in dateColumns:
        mergedShows[column].append(dateAdded(duplicatedPair[column]))
    for column in modeColumns:
        # Per-column mode instead of recomputing DataFrame.mode() on the
        # whole group three times.
        mergedShows[column].append(_firstMode(duplicatedPair[column]))

    # Progress indicator, overwritten in place via the carriage return.
    print('{:03.2f}%\r'.format(100*i/n),end='')

# Replace the duplicated rows with their merged versions.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
shows = shows.drop(duplicatedIds)
if duplicatedIds:
    shows = pd.concat([shows, pd.DataFrame(mergedShows)], ignore_index=True)
else:
    # Nothing to merge: only renumber the index, as the original append did,
    # without concatenating an empty frame.
    shows = shows.reset_index(drop=True)

100.00%

In [47]:
# Distinct values found in each multi-valued column of the merged frame.
countries, genre, actors, directors = (
    createSet(shows[column])
    for column in ('country', 'listed_in', 'cast', 'director')
)

## 4. Coletar as imagens dos artistas
Para isso será feita uma raspagem do site 'biography.com'

In [26]:
# Download one actor's picture: build the profile URL from the name, pull an
# image URL out of the page HTML and save the image bytes to disk.
actorName = 'Nicolas Cage'
actorNameFormatted = actorName.lower().replace(' ', '-')
actorUrl = f'https://www.biography.com/actor/{actorNameFormatted}'

try:
    with requests.get(actorUrl, timeout=10) as actorReq:
        # Treat HTTP error pages (404, 5xx, ...) as "not found" instead of
        # scraping whatever images the error page contains.
        actorReq.raise_for_status()
        # The third <img src="..."> on the page is used.
        # NOTE(review): presumably that is the portrait in the current page
        # layout - confirm against the live site.
        imgUrl = re.findall('<img src="([^"]*)"', actorReq.text)[2]

    with requests.get(imgUrl, stream=True, timeout=10) as imgReq:
        imgReq.raise_for_status()
        # Have urllib3 undo any Content-Encoding (gzip/deflate) so the file
        # receives the decoded image bytes rather than the compressed stream.
        imgReq.raw.decode_content = True
        with open(f'{actorNameFormatted}.png','wb') as file:
            shutil.copyfileobj(imgReq.raw, file)
except (requests.RequestException, IndexError, OSError):
    # Network/HTTP failure, fewer than three <img> tags, or a file error.
    # Unlike a bare except, KeyboardInterrupt and programming errors propagate.
    print(f'{actorName} not found')

In [55]:
# Start the SQL dump with the platform rows. The encoding is explicit so the
# file does not depend on the platform default, and the statement ends with
# ';' so further statements can follow it in the same script.
with open('dump.sql','w', encoding='utf-8') as f:
    f.write("insert into Plataforma(nome) values ('Amazon Prime'), ('Netflix'), ('Disney+'), ('Hulu');\n")