<a href="https://colab.research.google.com/github/eduardotas/sparkProject/blob/main/spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Constants

In [13]:
# Folder where the downloaded .gz archives are stored.
zip_data_folder = "zip_data"
# Folder where the decompressed files are written (and later read by Spark).
uniziped_data_folder = "uniziped_data"

# Create Environment

In [18]:
import os
# Make sure both working folders exist; exist_ok keeps re-runs of this cell harmless.
for data_folder in (zip_data_folder, uniziped_data_folder):
    os.makedirs(data_folder, exist_ok=True)

# Download Data

In [20]:
import requests
import gzip
import shutil

In [30]:
# Download URLs: one .tsv.gz archive per dataset hosted on datasets.imdbws.com
urls = [
    f"https://datasets.imdbws.com/{dataset}.tsv.gz"
    for dataset in (
        "name.basics",  # Count 14292552
        "title.akas",
        "title.basics",
        "title.crew",
        "title.episode",
        "title.principals",
        "title.ratings",
    )
]

# Function to download a file
def download_file(url):
    """Download ``url`` into ``zip_data_folder`` and return the local path.

    The body is streamed to a temporary ``.part`` file and only renamed to its
    final name once the transfer finished, so an interrupted download never
    leaves a truncated archive under the final path.

    Args:
        url: Address of the file to download; the text after the last ``/``
            is used as the local file name.

    Returns:
        Path of the downloaded file, or ``None`` when the server did not
        answer with HTTP 200 (so a stale file from an earlier run is not
        mistaken for a fresh download).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the file cannot be written.
    """
    filename = url.split("/")[-1]  # File name is the last URL segment
    file_path = os.path.join(zip_data_folder, filename)
    partial_path = file_path + ".part"
    print(f"Baixando {filename}...")

    # (connect, read) timeouts keep a stalled connection from hanging forever;
    # the context manager releases the connection on every exit path.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Falha ao baixar {filename}. Código de status: {response.status_code}")
            return None

        try:
            with open(partial_path, "wb") as file:
                # 1 MiB chunks: far fewer write calls than 1 KiB for large archives.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        except BaseException:
            # Drop the incomplete file before propagating the error.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

        os.replace(partial_path, file_path)

    print(f"Download concluído: {filename}")
    return file_path

In [33]:
# Function to decompress a .gz file
def extract_gz(file_path):
    """Decompress a gzip archive into ``uniziped_data_folder``.

    The output file keeps the archive's base name with a trailing ``.gz``
    removed. If decompression fails, the partially written output is deleted
    so later cells never read a truncated file.

    Args:
        file_path: Path of the gzip-compressed file to read.

    Returns:
        Path of the decompressed file.

    Raises:
        ValueError: If the output path would be the input file itself.
        OSError: If the archive cannot be read (including invalid gzip data)
            or the output cannot be written.
    """
    archive_name = os.path.basename(file_path)
    # Strip only a trailing ".gz"; str.replace would rewrite ".gz" (or the
    # folder name) wherever it happens to appear inside the path.
    if archive_name.endswith(".gz"):
        archive_name = archive_name[: -len(".gz")]
    output_file = os.path.join(uniziped_data_folder, archive_name)

    # Opening the output with "wb" truncates it, so it must never be the input.
    if os.path.abspath(output_file) == os.path.abspath(file_path):
        raise ValueError(f"O arquivo de saída coincide com o de entrada: {file_path}")

    print(f"Descompactando {file_path} para {output_file}...")

    try:
        with gzip.open(file_path, "rb") as f_in:
            with open(output_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Remove the incomplete output before propagating the error.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise

    print(f"Descompactação concluída: {output_file}")
    return output_file

In [None]:
# Download and decompress every archive, remembering the ones that fail
failed_urls = []
for url in urls:
    gz_file = download_file(url)  # Download the archive
    if gz_file and os.path.exists(gz_file):
        extract_gz(gz_file)  # Decompress the archive
    else:
        failed_urls.append(url)

# Only report success when every archive was actually processed.
if failed_urls:
    print(f"Falha ao processar {len(failed_urls)} arquivo(s): {', '.join(failed_urls)}")
else:
    print("Todos os arquivos foram baixados e descompactados com sucesso!")

# Spark

In [35]:
from pyspark.sql import SparkSession

In [45]:
# Build the "Movies" SparkSession, or reuse the one already active in this kernel.
spark = (
    SparkSession.builder
    .appName("Movies")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [46]:

# Full paths of the regular files found in the decompressed-data folder.
files = [
    path
    for path in (
        os.path.join(uniziped_data_folder, name)
        for name in os.listdir(uniziped_data_folder)
    )
    if os.path.isfile(path)
]

# Load each tab-separated file with a header row and print its path and row count.
for f in files:
    df = spark.read.options(sep="\t", header=True, inferSchema=True).csv(f)
    print(f)
    print(df.count())

# Row counts recorded from a previous run:
#+---------------------------------------------------------+------------------+
#| File path                                               | Row count        |
#+---------------------------------------------------------+------------------+
#| uniziped_data/title.akas.tsv                            | 51,769,541       |
#| uniziped_data/title.ratings.tsv                         | 1,550,861        |
#| uniziped_data/title.crew.tsv                            | 11,554,731       |
#| uniziped_data/title.episode.tsv                         | 8,888,537        |
#| uniziped_data/title.basics.tsv                          | 11,552,092       |
#| uniziped_data/title.principals.tsv                      | 91,727,348       |
#| uniziped_data/name.basics.tsv                           | 14,292,552       |
#+---------------------------------------------------------+------------------+

uniziped_data/title.akas.tsv
51769541
uniziped_data/title.ratings.tsv
1550861
uniziped_data/title.crew.tsv
11554731
uniziped_data/title.episode.tsv
8888537
uniziped_data/title.basics.tsv
11552092
uniziped_data/title.principals.tsv
91727348
uniziped_data/name.basics.tsv
14292552
