
# TCC – Predição do Sucesso Musical (Pop, Spotify 2020–2024)

Notebook de referência para **coleta, processamento, modelagem e exportação de resultados** do TCC do MBA USP/ESALQ.


## 0. Ambiente e dependências

Execute esta célula para importar bibliotecas. Se necessário, descomente os comandos `pip`.


In [None]:
# Load necessary libraries

!pip install pandas numpy matplotlib scikit-learn tqdm python-dotenv
!pip install spotipy lyricsgenius librosa music21 xgboost --upgrade
!pip install beautifulsoup4 requests --quiet
!pip install spotipy --quiet

In [None]:
# Collecting Historical TOP 50 Streaming Data from Pro-Música Brasil

from pathlib import Path
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta
import time
import os

def fetch_pro_musica_data(period='04/2024'):
    """Fetch TOP 50 streaming data from Pro-Música Brasil with proper session handling"""
    
    # Create session for connection reuse
    session = requests.Session()
    
    url = f'https://pro-musicabr.org.br/home/top-50-streaming/?top50sPeriodo={period}'
    
    # Set headers exactly matching the curl command
    session.headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8,pt;q=0.7,fr;q=0.6',
        'Priority': 'u=0, i',
        'Referer': 'https://pro-musicabr.org.br/home/top-50-streaming/?top50sPeriodo=06/2024',
        'Sec-CH-UA': '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
    })
    
    
    # Add cookies to maintain session
    session.cookies.update({
        '_ga': 'GA1.1.1144301064.1757204721',
        '_ga_BGET1L6V9K': 'GS2.1.s1757204720$o1$g1$t1757204994$j25$l0$h209261057',
        'cookieconsent_status': 'dismiss'
    })
    
    try:
        # Add delay to be respectful to the server
        time.sleep(1)
        
        # Make the request with timeout and allow redirects
        response = session.get(url, timeout=30, allow_redirects=True)
        response.raise_for_status()

        print(f"Request URL: {response.url}")
        print(f"Response Status Code: {response.status_code}")
        print(f"Response Content Length: {len(response.content)} bytes")

        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Look for the streaming data table
        table = soup.find('table', class_='table table-top-streams')
        
        if table:
            # Extract data from table rows
            rows = []
            tbody = table.find('tbody')
            
            if tbody:
                for tr in tbody.find_all('tr'):
                    cells = tr.find_all('td')
                    if len(cells) >= 6:
                        # Extract position
                        position = cells[1].get_text(strip=True)
                        
                        # Extract title and artist from the combined cell
                        title_artist_cell = cells[3]
                        text_content = title_artist_cell.get_text(separator='|', strip=True)
                        
                        # Split by | to separate title and artist
                        parts = text_content.split('|')
                        if len(parts) >= 2:
                            title = parts[0].strip()
                            artist = parts[1].strip()
                        else:
                            title = parts[0].strip()
                            artist = ""
                        
                        # Extract record label and distributor
                        gravadora = cells[4].get_text(strip=True)
                        distribuidora = cells[5].get_text(strip=True)
                        
                        rows.append({
                            'posicao': position,
                            'titulo': title,
                            'artista': artist,
                            'gravadora': gravadora,
                            'distribuidora': distribuidora
                        })
            
            # Create DataFrame
            df_top50 = pd.DataFrame(rows)
            return df_top50
        else:
            print("Table with class 'table table-top-streams' not found")
            # Try alternative table parsing
            tables = soup.find_all('table')
            if tables:
                print(f"Found {len(tables)} tables, trying first one...")
                return pd.read_html(str(tables[0]))[0]
            return pd.DataFrame()
            
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error parsing data: {e}")
        return pd.DataFrame()
    finally:
        session.close()


def collect_historical_data(start_year=2022, start_month=1, end_year=2024, end_month=12):
    """
    Collect TOP 50 streaming data from Pro-Música Brasil for multiple periods
    """
    all_data = []
    failed_periods = []
    
    # Generate all periods from start to end
    current_year = start_year
    current_month = start_month
    
    while (current_year < end_year) or (current_year == end_year and current_month <= end_month):
        period = f"{current_month:02d}/{current_year}"
        print(f"\n=== Collecting data for {period} ===")
        
        try:
            # Fetch data for this period
            df_period = fetch_pro_musica_data(period)
            
            if not df_period.empty:
                # Add period information to the dataframe
                df_period['periodo'] = period
                df_period['ano'] = current_year
                df_period['mes'] = current_month
                all_data.append(df_period)
                print(f"✅ Successfully collected {len(df_period)} tracks for {period}")
            else:
                print(f"❌ No data found for {period}")
                failed_periods.append(period)
            
            # Be respectful to the server - wait between requests
            time.sleep(2)
            
        except Exception as e:
            print(f"❌ Error collecting data for {period}: {e}")
            failed_periods.append(period)
            time.sleep(5)  # Wait longer on error
        
        # Move to next month
        current_month += 1
        if current_month > 12:
            current_month = 1
            current_year += 1
    
    # Combine all data
    if all_data:
        df_combined = pd.concat(all_data, ignore_index=True)
        print(f"\n=== Collection Summary ===")
        print(f"Total periods processed: {len(all_data) + len(failed_periods)}")
        print(f"Successful collections: {len(all_data)}")
        print(f"Failed collections: {len(failed_periods)}")
        if failed_periods:
            print(f"Failed periods: {', '.join(failed_periods)}")
        print(f"Total tracks collected: {len(df_combined)}")
        return df_combined, failed_periods
    else:
        print("❌ No data was collected from any period")
        return pd.DataFrame(), failed_periods

def save_data_with_timestamp(df, base_filename="pro_musica_top50_historical"):
    """
    Save the dataframe with timestamp and create backup
    """
    if df.empty:
        print("No data to save")
        return None
    
    # Create data directory if it doesn't exist
    data_dir = Path('./data')
    data_dir.mkdir(exist_ok=True, parents=True)
    
    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_filename}_{timestamp}.csv"
    filepath = data_dir / filename
    
    # Save the data
    df.to_csv(filepath, index=False, encoding='utf-8')
    print(f"✅ Data saved to: {filepath}")
    
    # Also save a copy without timestamp for easy access
    latest_filepath = data_dir / f"{base_filename}_latest.csv"
    df.to_csv(latest_filepath, index=False, encoding='utf-8')
    print(f"✅ Latest copy saved to: {latest_filepath}")
    
    # Print summary statistics
    print(f"\n=== Data Summary ===")
    print(f"Total records: {len(df)}")
    print(f"Periods covered: {df['periodo'].nunique()}")
    print(f"Date range: {df['periodo'].min()} to {df['periodo'].max()}")
    print(f"Unique tracks: {df['titulo'].nunique()}")
    print(f"Unique artists: {df['artista'].nunique()}")
    
    return filepath

# Execute the data collection
print("Starting historical data collection from Pro-Música Brasil...")
print("This will collect TOP 50 data from 01/2022 to 12/2024")
print("Please be patient as this will take several minutes...")

# Collect the data
df_historical, failed_periods = collect_historical_data(
    start_year=2019, start_month=1,
    end_year=2024, end_month=12
)

# Save the data if we got any
if not df_historical.empty:
    filepath = save_data_with_timestamp(df_historical)
    
    # Display sample of the data
    print(f"\n=== Sample of collected data ===")
    print(df_historical.head(10))
    
    # Show data by period
    period_summary = df_historical.groupby('periodo').agg({
        'titulo': 'count',
        'artista': 'nunique'
    }).rename(columns={'titulo': 'tracks', 'artista': 'unique_artists'})
    print(f"\n=== Data by period ===")
    print(period_summary)
    
else:
    print("❌ No data was collected. Please check the website availability and try again.")

print("\n=== Collection Complete ===")

Ambiente carregado. HAS_XGB = True


In [None]:
# load song attributes from spotify API
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Authenticate with your credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET"
))

# Search for a specific track
results = sp.search(q='track:Seven Nation Army artist:The White Stripes', limit=1)
track_id = results['tracks']['items'][0]['id']

# Get the audio features for that track
features = sp.audio_features(track_id)
print(features)


In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, accuracy_score, precision_score, recall_score, average_precision_score

# Modelos adicionais (opcional)
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    from sklearn.ensemble import GradientBoostingClassifier
    HAS_XGB = False

SEED = 42
rng = np.random.default_rng(SEED)

# Diretório de exportação
EXPORT_DIR = Path('./exports')
EXPORT_DIR.mkdir(exist_ok=True, parents=True)

print('Ambiente carregado. HAS_XGB =', HAS_XGB)


Ambiente carregado. HAS_XGB = True



## 1. Configurações e credenciais (opcional – para coleta por API)

Preencha suas chaves em um arquivo `.env` ou defina variáveis de ambiente.


In [13]:

from dataclasses import dataclass

@dataclass
class Config:
    # Spotify
    SPOTIFY_CLIENT_ID: str = os.getenv('SPOTIFY_CLIENT_ID', '')
    SPOTIFY_CLIENT_SECRET: str = os.getenv('SPOTIFY_CLIENT_SECRET', '')
    # Genius
    GENIUS_TOKEN: str = os.getenv('GENIUS_TOKEN', '')
    # YouTube (opcional)
    YT_API_KEY: str = os.getenv('YOUTUBE_API_KEY', '')

cfg = Config()
cfg


Config(SPOTIFY_CLIENT_ID='', SPOTIFY_CLIENT_SECRET='', GENIUS_TOKEN='', YT_API_KEY='')


## 2. Ingestão de dados

Você pode **(A)** carregar CSVs locais prontos (recomendado para rapidez e reprodutibilidade), ou **(B)** usar APIs (Spotify/Genius/YouTube) para coletar os dados.

**Formato recomendado (CSV)**:  
- `tracks.csv`: `track_id, artist, title, release_date, year, streams_30d, genre`
- `audio_features.csv`: `track_id, danceability, energy, valence, tempo, key, mode`
- `lyrics.csv`: `track_id, lyrics`

> **Nota**: O Spotify Web API não fornece diretamente o *Top 200 Global* histórico. Você pode:  
> - Usar dumps/CSVs públicos de paradas (quando disponíveis), ou  
> - Utilizar *playlists* curadas como *Top 50 Global* em janelas temporais, ou  
> - Definir sucesso por quantis de `streams_30d` na sua própria base consolidada.


In [14]:

def load_csvs_or_raise(base_dir='data'):
    base = Path(base_dir)
    tracks_fp = base / 'tracks.csv'
    feats_fp = base / 'audio_features.csv'
    lyrics_fp = base / 'lyrics.csv'
    if not all(p.exists() for p in [tracks_fp, feats_fp, lyrics_fp]):
        raise FileNotFoundError('CSV(s) não encontrados. Use o caminho correto em base_dir ou gere dados sintéticos.')
    tracks = pd.read_csv(tracks_fp)
    feats = pd.read_csv(feats_fp)
    lyrics = pd.read_csv(lyrics_fp)
    return tracks, feats, lyrics

# Exemplo de merge
def build_dataset_from_csvs(tracks, feats, lyrics):
    df = tracks.merge(feats, on='track_id', how='inner').merge(lyrics, on='track_id', how='left')
    # Limpeza leve
    df['year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year
    df = df.dropna(subset=['streams_30d', 'danceability', 'energy', 'valence'])
    df = df[df['genre'].str.lower().str.contains('pop', na=False)]
    return df



### 2.B. Coleta por API (esqueleto)

Stubs com *placeholders* para você completar quando desejar. **Opcional**.


In [5]:

# --- Spotify (exemplo com spotipy) ---
# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials

# def spotify_client(cfg):
#     auth = SpotifyClientCredentials(client_id=cfg.SPOTIFY_CLIENT_ID, client_secret=cfg.SPOTIFY_CLIENT_SECRET)
#     return spotipy.Spotify(auth_manager=auth)

# def fetch_audio_features(sp, track_ids):
#     feats = []
#     for i in range(0, len(track_ids), 50):
#         chunk = track_ids[i:i+50]
#         feats.extend(sp.audio_features(chunk))
#     return pd.DataFrame(feats)

# --- Genius (lyrics) ---
# import lyricsgenius
# def fetch_lyrics(genius_token, artist, title):
#     genius = lyricsgenius.Genius(genius_token)
#     song = genius.search_song(title, artist)
#     return song.lyrics if song else None

# --- YouTube (opcional) ---
# from googleapiclient.discovery import build
# def fetch_youtube_stats(api_key, query):
#     yt = build('youtube', 'v3', developerKey=api_key)
#     # ...
#     return {}



## 3. Pré-processamento e features

Inclui limpeza, deduplicação, cálculo de sentimento/lexical e geração do **alvo de sucesso** (quantil superior).


In [6]:

import re

def lexical_diversity(text):
    if not isinstance(text, str) or not text.strip():
        return np.nan
    tokens = re.findall(r"\b\w+\b", text.lower())
    if not tokens:
        return np.nan
    return len(set(tokens)) / len(tokens)

def simple_sentiment(text):
    # Placeholder: pontuação simplificada de sentimento (substitua por VADER/BERT conforme desejar)
    if not isinstance(text, str) or not text.strip():
        return np.nan
    pos_words = {'love','good','happy','sun','party','dance','smile','together'}
    neg_words = {'sad','bad','cry','alone','pain','dark'}
    tokens = re.findall(r"\b\w+\b", text.lower())
    score = sum(1 for t in tokens if t in pos_words) - sum(1 for t in tokens if t in neg_words)
    return score / max(1, len(tokens))

def build_features(df, success_quantile=0.80):
    df = df.copy()
    # alvo binário de sucesso pelo quantil de streams_30d
    threshold = df['streams_30d'].quantile(success_quantile)
    df['success'] = (df['streams_30d'] >= threshold).astype(int)

    # features textuais
    df['lexical_diversity'] = df['lyrics'].apply(lexical_diversity)
    df['sentiment_score']   = df['lyrics'].apply(simple_sentiment)

    # harmonia/sonoridade (já disponíveis via Spotify audio_features)
    # mode: 1 = maior, 0 = menor
    df['mode_major'] = (df.get('mode', 1) == 1).astype(int)

    # Seleção final de colunas (ajuste conforme necessário)
    feature_cols = [
        'danceability','energy','valence','tempo','mode_major',
        'sentiment_score','lexical_diversity'
    ]
    df_model = df.dropna(subset=feature_cols + ['success']).copy()
    return df_model, feature_cols



## 4. Treino/validação e modelos

Modelos: **Regressão Logística (baseline)** e **XGBoost** (ou **GradientBoosting** se XGBoost indisponível).


In [7]:

def train_and_eval(df_model, feature_cols):
    X = df_model[feature_cols].values
    y = df_model['success'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    # Baseline
    lr = LogisticRegression(max_iter=200, random_state=SEED)
    lr.fit(X_train_s, y_train)
    p_lr = lr.predict_proba(X_test_s)[:,1]
    m_lr = {
        'name': 'Regressão Logística (baseline)',
        'auc': roc_auc_score(y_test, p_lr),
        'f1': f1_score(y_test, (p_lr>=0.5).astype(int)),
        'acc': accuracy_score(y_test, (p_lr>=0.5).astype(int)),
        'prec': precision_score(y_test, (p_lr>=0.5).astype(int)),
        'rec' : recall_score(y_test, (p_lr>=0.5).astype(int)),
        'pr_auc': average_precision_score(y_test, p_lr),
        'fpr_tpr': roc_curve(y_test, p_lr)[:2] + (roc_curve(y_test, p_lr)[2],),
        'probas': p_lr,
        'y_test': y_test,
    }

    # XGB ou GBT
    if HAS_XGB:
        mdl = XGBClassifier(
            n_estimators=300, learning_rate=0.05, subsample=0.9,
            max_depth=4, colsample_bytree=0.8, eval_metric='logloss',
            random_state=SEED
        )
    else:
        mdl = GradientBoostingClassifier(random_state=SEED)
    mdl.fit(X_train_s, y_train)
    p_m = mdl.predict_proba(X_test_s)[:,1] if hasattr(mdl, 'predict_proba') else mdl.decision_function(X_test_s)
    p_m = (p_m - p_m.min())/(p_m.max()-p_m.min() + 1e-9)  # normaliza se preciso

    m_g = {
        'name': 'XGBoost' if HAS_XGB else 'GradientBoosting',
        'auc': roc_auc_score(y_test, p_m),
        'f1': f1_score(y_test, (p_m>=0.5).astype(int)),
        'acc': accuracy_score(y_test, (p_m>=0.5).astype(int)),
        'prec': precision_score(y_test, (p_m>=0.5).astype(int)),
        'rec' : recall_score(y_test, (p_m>=0.5).astype(int)),
        'pr_auc': average_precision_score(y_test, p_m),
        'fpr_tpr': roc_curve(y_test, p_m)[:2] + (roc_curve(y_test, p_m)[2],),
        'probas': p_m,
        'y_test': y_test,
        'model_obj': mdl
    }

    return [m_lr, m_g]



## 5. Tabelas, correlações e figuras

Gera as **Tabelas 1–3** e a **Figura 1 (ROC)** nos formatos prontos para colar no DOCX.


In [8]:

def table1_characterization(df):
    g = df.groupby('year').agg(
        tracks=('track_id','nunique'),
        artists=('artist','nunique'),
        streams_mean=('streams_30d','mean'),
        streams_median=('streams_30d','median')
    ).reset_index().sort_values('year')
    g.loc['Total'] = [
        'Total',
        g['tracks'].sum(),
        g['artists'].sum(),
        g['streams_mean'].mean(),
        g['streams_median'].median()
    ]
    return g

def table2_spearman(df_model, feature_cols):
    # Spearman com pandas (sem p-valor)
    rho = df_model[feature_cols + ['success']].corr(method='spearman')['success'].drop('success')
    out = pd.DataFrame({'Variavel': rho.index, 'Spearman_rho': rho.values})
    out['Direcao'] = np.where(out['Spearman_rho']>=0, 'Positiva', 'Negativa')
    out['N'] = len(df_model)
    return out

def table3_metrics(models):
    rows = []
    for m in models:
        rows.append([
            m['name'], m['auc'], m['f1'], m['prec'], m['rec'], m['acc'], m['pr_auc']
        ])
    return pd.DataFrame(rows, columns=['Modelo','AUC-ROC','F1','Precisão','Revocação','Acurácia','PR-AUC'])

def figure1_roc(models, out_png):
    plt.figure()
    for m in models:
        fpr, tpr, _ = m['fpr_tpr']
        plt.plot(fpr, tpr, label=m['name'])
    plt.plot([0,1],[0,1], linestyle='--')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('Figura 1 – Curva ROC por modelo')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.close()



## 6. Fluxo de execução com **dados sintéticos** (exemplo)

Esta seção cria um conjunto de dados **fictício** apenas para validar o pipeline e gerar artefatos (tabelas/figuras).
Substitua por seus dados reais nas seções anteriores.


In [9]:

def make_synthetic(n=1200, years=range(2020, 2025)):
    rows = []
    for y in years:
        for i in range(n//len(list(years))):
            valence = rng.uniform(0,1)
            energy = rng.uniform(0,1)
            dance = rng.uniform(0,1)
            mode = rng.integers(0,2)  # 1=maior
            tempo = rng.normal(120, 20)
            sentiment = (valence - 0.5) * 0.3 + rng.normal(0, 0.02)
            lexdiv = rng.uniform(0.2, 0.6)
            # streams correlaciona com algumas features
            log_streams = (
                10
                + 1.2*valence
                + 0.8*dance
                + 0.5*energy
                + 0.3*mode
                + 0.6*sentiment
                + rng.normal(0, 0.7)
            )
            streams = np.exp(log_streams)  # positivo
            rows.append({
                'track_id': f'{y}_{i}',
                'artist': f'Artist_{rng.integers(1,300)}',
                'title':  f'Track_{i}',
                'release_date': f'{y}-01-01',
                'year': y,
                'streams_30d': streams,
                'genre': 'Pop',
                'danceability': dance,
                'energy': energy,
                'valence': valence,
                'tempo': tempo,
                'mode': mode,
                'lyrics': 'We love to dance and smile together under the sun'
            })
    df = pd.DataFrame(rows)
    return df

# Gera dataset sintético e aplica o pipeline completo
df_syn = make_synthetic()
df_model, feature_cols = build_features(df_syn, success_quantile=0.80)
models = train_and_eval(df_model, feature_cols)

t1 = table1_characterization(df_syn)
t2 = table2_spearman(df_model, feature_cols)
t3 = table3_metrics(models)

# Exporta artefatos
t1_fp = EXPORT_DIR / 'tabela1_caracterizacao.csv'
t2_fp = EXPORT_DIR / 'tabela2_correlacoes.csv'
t3_fp = EXPORT_DIR / 'tabela3_metricas.csv'
fig1_fp = EXPORT_DIR / 'figura1_roc.png'

t1.to_csv(t1_fp, index=False)
t2.to_csv(t2_fp, index=False)
t3.to_csv(t3_fp, index=False)
figure1_roc(models, str(fig1_fp))

t1_fp, t2_fp, t3_fp, fig1_fp


(PosixPath('exports/tabela1_caracterizacao.csv'),
 PosixPath('exports/tabela2_correlacoes.csv'),
 PosixPath('exports/tabela3_metricas.csv'),
 PosixPath('exports/figura1_roc.png'))


## 7. Execução com **dados reais (CSV)**

Quando você tiver os CSVs, a execução típica será:


In [10]:

# tracks, feats, lyrics = load_csvs_or_raise(base_dir='data')
# df = build_dataset_from_csvs(tracks, feats, lyrics)
# df_model, feature_cols = build_features(df, success_quantile=0.80)
# models = train_and_eval(df_model, feature_cols)

# # Tabelas e figura
# t1 = table1_characterization(df)
# t2 = table2_spearman(df_model, feature_cols)
# t3 = table3_metrics(models)
# t1.to_csv(EXPORT_DIR/'tabela1_caracterizacao.csv', index=False)
# t2.to_csv(EXPORT_DIR/'tabela2_correlacoes.csv', index=False)
# t3.to_csv(EXPORT_DIR/'tabela3_metricas.csv', index=False)
# figure1_roc(models, str(EXPORT_DIR/'figura1_roc.png'))



## 8. Logs de reprodutibilidade

Versões de pacotes e *seed*.


In [11]:

import sys, platform
print('Python:', sys.version)
print('Platform:', platform.platform())
import sklearn, pandas, numpy, matplotlib
print('sklearn:', sklearn.__version__)
print('pandas:', pandas.__version__)
print('numpy:', numpy.__version__)
print('matplotlib:', matplotlib.__version__)
print('SEED:', SEED)


Python: 3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]
Platform: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39
sklearn: 1.7.1
pandas: 2.3.2
numpy: 2.2.6
matplotlib: 3.10.6
SEED: 42


In [18]:
# Exemplo de coleta de dados da web (opcional)

url = "https://pro-musicabr.org.br/home/top-50-streaming/index.php?top50sPeriodo=04/2024"
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# Exibe o título da página
print("Título da página:", soup.title.string.strip())

# Procura a primeira tabela relevante e exibe como DataFrame
tables = soup.find_all("table")
if tables:
    df = pd.read_html(str(tables[0]))[0]
    display(df.head())
else:
    print("Nenhuma tabela encontrada na página.")

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))