In [1]:
import pandas as pd
import opendatasets as od
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
def get_and_load(url, streaming=True):
    od.download(url, data_dir='downloads', force=True)
    folder_name = url.split('/')[-1]
    
    dataset_name = folder_name.split('-')[1]
    df = pd.read_csv(f'downloads/{folder_name}/data.csv')
    print(f'{dataset_name}: {df.shape}')

    if streaming:
        # Verifique o formato dos dados
        df['availableCountries'] = df['availableCountries'].apply(lambda x: x.split(',') if isinstance(x, str) else x)

        # Remova espaços em branco
        df['availableCountries'] = df['availableCountries'].apply(lambda x: [country.strip() for country in x])

        # Aplique o MultiLabelBinarizer
        mlb = MultiLabelBinarizer()
        transformed_array = mlb.fit_transform(df['availableCountries'])
        df = df.join(pd.DataFrame(transformed_array, columns=mlb.classes_))

        # Verifique a presença de BR
        df['contains_BR'] = df['availableCountries'].apply(lambda x: 'BR' in x)

        # Adiciona a coluna streaming
        df[f'on_{dataset_name}'] = 1

    return df

In [3]:
df_apple = get_and_load('https://www.kaggle.com/datasets/octopusteam/full-apple-tv-dataset')
df_amazon = get_and_load('https://www.kaggle.com/datasets/octopusteam/full-amazon-prime-dataset')
df_netflix = get_and_load('https://www.kaggle.com/datasets/octopusteam/full-netflix-dataset')
df_hbo = get_and_load('https://www.kaggle.com/datasets/octopusteam/full-hbo-max-dataset')

Dataset URL: https://www.kaggle.com/datasets/octopusteam/full-apple-tv-dataset
Downloading full-apple-tv-dataset.zip to downloads/full-apple-tv-dataset


100%|██████████| 424k/424k [00:00<00:00, 1.09MB/s]



apple: (17766, 8)
Dataset URL: https://www.kaggle.com/datasets/octopusteam/full-amazon-prime-dataset
Downloading full-amazon-prime-dataset.zip to downloads/full-amazon-prime-dataset


100%|██████████| 1.92M/1.92M [00:00<00:00, 3.43MB/s]



amazon: (67762, 8)
Dataset URL: https://www.kaggle.com/datasets/octopusteam/full-netflix-dataset
Downloading full-netflix-dataset.zip to downloads/full-netflix-dataset


100%|██████████| 751k/751k [00:00<00:00, 2.51MB/s]



netflix: (20196, 8)
Dataset URL: https://www.kaggle.com/datasets/octopusteam/full-hbo-max-dataset
Downloading full-hbo-max-dataset.zip to downloads/full-hbo-max-dataset


100%|██████████| 169k/169k [00:00<00:00, 378kB/s]


hbo: (5771, 8)





In [4]:
df_amazon.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,AD,AE,...,US,UY,VA,VE,YE,ZA,ZM,ZW,contains_BR,on_amazon
0,Blondie,movie,"Comedy, Family",1938.0,tt0029927,6.9,886.0,"[US, ZA]",0,0,...,1,0,0,0,0,1,0,0,False,1
1,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,8790.0,[JP],0,0,...,0,0,0,0,0,0,0,0,False,1
2,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,112688.0,"[AT, DE]",0,0,...,0,0,0,0,0,0,0,0,False,1
3,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19322.0,[US],0,0,...,1,0,0,0,0,0,0,0,False,1
4,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2321918.0,"[AD, AT, CU, DE, FR, GF, IN, JP, MC, PF, SN]",1,0,...,0,0,0,0,0,0,0,0,False,1


In [None]:
df_apple.head()

In [None]:
df_hbo.head()

In [None]:
df_netflix.head()

In [5]:
dfs = pd.concat([df_apple, df_amazon, df_netflix, df_hbo])

In [None]:
dfs.title.nunique()

In [6]:
print(dfs.on_apple.value_counts())
print(dfs.on_amazon.value_counts())
print(dfs.on_hbo.value_counts())
print(dfs.on_netflix.value_counts())

on_apple
1.0    17766
Name: count, dtype: int64
on_amazon
1.0    67762
Name: count, dtype: int64
on_hbo
1.0    5771
Name: count, dtype: int64
on_netflix
1.0    20196
Name: count, dtype: int64


In [7]:
dfs.BR.value_counts()

BR
0    94686
1    16809
Name: count, dtype: int64

In [None]:
df_imdb = get_and_load('https://www.kaggle.com/datasets/octopusteam/full-imdb-dataset', streaming=False)

In [None]:
df_imdb.head()

In [None]:
df_imdb.id.nunique()

In [None]:
pd.merge(dfs, df_imdb, left_on='imdbId', right_on='id', how='inner')

In [None]:
dfs.shape

In [None]:
dfs.type.value_counts()

In [None]:
df_imdb.type.value_counts()

In [11]:
dfs[dfs.imdbId.isna()]

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,AE,AG,...,TZ,UY,VA,YE,ZM,ZW,on_amazon,ME,on_netflix,on_hbo
3133,Siste trikk,movie,,2024.0,,,,"[DE, GB]",0.0,0,...,,,,,,,,,,
3205,El Arte De La Guerra,movie,,2024.0,,,,[BR],0.0,0,...,,,,,,,,,,
3215,The Oracle,movie,"Comedy, Drama",2024.0,,,,[CA],0.0,0,...,,,,,,,,,,
3230,Clean Up Crew,movie,Documentary,2021.0,,,,"[GB, US]",0.0,0,...,,,,,,,,,,
3252,The Many Worlds of Quantum Mechanics,movie,Documentary,2019.0,,,,"[GB, US]",0.0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5757,,tv,,2009.0,,,,"[AR, BO, BR, CL, CO, CR, DO, EC, GT, HN, MX, N...",,0,...,,1.0,,,,,,0.0,,1.0
5758,,tv,,2024.0,,,,"[AG, AR, BB, BO, BR, BS, BZ, CL, CO, CR, DO, E...",,1,...,,1.0,,,,,,0.0,,1.0
5759,,tv,"Crime, Documentary",2024.0,,,,"[AR, ES, US]",,0,...,,0.0,,,,,,0.0,,1.0
5762,,tv,"Drama, Mystery",2012.0,,,,[NL],,0,...,,0.0,,,,,,0.0,,1.0
