In [None]:
import pandas as pd
import opendatasets as od
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
def _parse_countries(raw):
    """Convert one raw ``availableCountries`` cell into a clean list of codes."""
    if isinstance(raw, str):
        tokens = raw.split(',')
    elif isinstance(raw, (list, tuple, set)):
        tokens = raw
    else:
        # NaN / None (missing value in the CSV): no countries at all.
        return []
    # Strip surrounding blanks and drop empty tokens (e.g. from a trailing comma).
    return [token.strip() for token in tokens if token.strip()]


def get_and_load(url, streaming=True):
    """Download a dataset with ``opendatasets`` and load its ``data.csv``.

    The last path segment of ``url`` is used as the folder name under
    ``downloads/``; the second hyphen-separated token of that segment is used
    as the short dataset name (``full-netflix-dataset`` -> ``netflix``).

    Parameters
    ----------
    url : str
        Dataset URL. A trailing slash is tolerated.
    streaming : bool, default True
        When True, the ``availableCountries`` column is parsed into lists,
        one-hot encoded into one column per country code, and the columns
        ``contains_BR`` and ``on_<dataset_name>`` are added.

    Returns
    -------
    pandas.DataFrame
        The loaded data, with the extra columns described above when
        ``streaming`` is True.

    Raises
    ------
    KeyError
        If ``streaming`` is True and the CSV has no ``availableCountries``
        column.
    """
    # NOTE: force=True re-downloads the dataset on every call.
    od.download(url, data_dir='downloads', force=True)

    # Tolerate a trailing slash so the folder name is never the empty string.
    folder_name = url.rstrip('/').split('/')[-1]

    # Fall back to the whole slug when it has no hyphen instead of raising IndexError.
    slug_parts = folder_name.split('-')
    dataset_name = slug_parts[1] if len(slug_parts) > 1 else folder_name

    df = pd.read_csv(f'downloads/{folder_name}/data.csv')
    print(f'{dataset_name}: {df.shape}')

    if streaming:
        # Normalise the column: comma-separated string -> list of stripped codes.
        # Missing values become [] so the steps below never iterate over NaN.
        df['availableCountries'] = df['availableCountries'].apply(_parse_countries)

        # One-hot encode the country lists (one 0/1 column per country code).
        mlb = MultiLabelBinarizer()
        transformed_array = mlb.fit_transform(df['availableCountries'])
        # Reuse df's index so the join aligns row-for-row whatever the index is.
        country_flags = pd.DataFrame(
            transformed_array, columns=mlb.classes_, index=df.index
        )
        df = df.join(country_flags)

        # Flag titles available in Brazil.
        df['contains_BR'] = df['availableCountries'].apply(lambda x: 'BR' in x)

        # Mark every row as belonging to this streaming service.
        df[f'on_{dataset_name}'] = 1

    return df

In [None]:
# Load the four streaming catalogues, in the same order as the target names.
df_apple, df_amazon, df_netflix, df_hbo = (
    get_and_load(dataset_url)
    for dataset_url in (
        'https://www.kaggle.com/datasets/octopusteam/full-apple-tv-dataset',
        'https://www.kaggle.com/datasets/octopusteam/full-amazon-prime-dataset',
        'https://www.kaggle.com/datasets/octopusteam/full-netflix-dataset',
        'https://www.kaggle.com/datasets/octopusteam/full-hbo-max-dataset',
    )
)

In [None]:
# Preview the first five rows of the Amazon Prime frame.
df_amazon.head(n=5)

In [None]:
# Preview the first five rows of the Apple TV frame.
df_apple.head(n=5)

In [None]:
# Preview the first five rows of the HBO Max frame.
df_hbo.head(n=5)

In [None]:
# Preview the first five rows of the Netflix frame.
df_netflix.head(n=5)

In [None]:
# Stack the four streaming catalogues into a single frame.
# ignore_index=True gives every row a unique label: each input frame is
# numbered from 0, so a plain concat would repeat index labels.
dfs = pd.concat([df_apple, df_amazon, df_netflix, df_hbo], ignore_index=True)

# Each input frame only carries its own `on_<service>` flag, so after the
# concat the other services' flags are NaN. Make them explicit 0/1 integers.
on_columns = [col for col in dfs.columns if str(col).startswith('on_')]
dfs[on_columns] = dfs[on_columns].fillna(0).astype(int)

In [None]:
# Number of distinct titles across all streaming frames.
dfs['title'].nunique()

In [None]:
# Print the value distribution of each service flag, one after another.
print(
    *(
        dfs[flag_column].value_counts()
        for flag_column in ('on_apple', 'on_amazon', 'on_hbo', 'on_netflix')
    ),
    sep='\n',
)

In [None]:
# Distribution of the one-hot `BR` column.
dfs['BR'].value_counts()

In [None]:
# Load the IMDb dataset; streaming=False skips the country/on_* processing.
df_imdb = get_and_load(
    url='https://www.kaggle.com/datasets/octopusteam/full-imdb-dataset',
    streaming=False,
)

In [None]:
# Preview the first five rows of the IMDb frame.
df_imdb.head(n=5)

In [None]:
# Number of distinct values in the IMDb `id` column.
df_imdb['id'].nunique()

In [None]:
# Inner-join the streaming rows to IMDb on the IMDb identifier.
# The result is only displayed, not stored.
dfs.merge(df_imdb, how='inner', left_on='imdbId', right_on='id')

In [None]:
# (rows, columns) of the combined streaming frame; the merge above was not assigned,
# so this is still the un-merged frame.
dfs.shape

In [None]:
# Distribution of the `type` column in the combined streaming frame.
dfs['type'].value_counts()

In [None]:
# Distribution of the `type` column in the IMDb frame.
df_imdb['type'].value_counts()