# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Kaggle competition input files.
TRAIN_PATH = '/kaggle/input/campeonato-inteli-modulo3-2025/train.csv'
TEST_PATH = '/kaggle/input/campeonato-inteli-modulo3-2025/test.csv'
SAMPLE_PATH = '/kaggle/input/campeonato-inteli-modulo3-2025/sample_submission.csv'

# Section banner: rule, title, rule (one line each).
print("=" * 70, "CARREGANDO DADOS", "=" * 70, sep="\n")

# Read the training CSV and drop exact duplicate rows in a single chain.
df = pd.read_csv(TRAIN_PATH).drop_duplicates()

print(f"✓ Dados carregados: {len(df)} linhas")

# Per-column count of missing values, for a quick look before cleaning.
print("\nValores nulos por coluna:", df.isna().sum(), sep="\n")

# Fill the missing values in the four "age_*" columns of the training frame.
print("\n" + "=" * 70)
print("LIMPANDO VALORES NULOS")
print("=" * 70)

# Assign the filled Series back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which under pandas Copy-on-Write only updates a temporary
# and leaves `df` unchanged. The FutureWarning announcing this is hidden by the
# `warnings.filterwarnings('ignore')` call at the top of the file.
df['age_first_funding_year'] = df['age_first_funding_year'].fillna(0)
df['age_first_milestone_year'] = df['age_first_milestone_year'].fillna(0)
# The "last" ages fall back to the (already filled) "first" ages, row by row.
df['age_last_funding_year'] = df['age_last_funding_year'].fillna(df['age_first_funding_year'])
df['age_last_milestone_year'] = df['age_last_milestone_year'].fillna(df['age_first_milestone_year'])

print("✓ Valores nulos tratados")
print("\nNulos restantes:")
# Total number of missing cells left in the whole frame.
print(df.isna().sum().sum())

# Pick the numeric columns worth standardizing: skip the "is_*" / "has_*"
# columns, then take out the target and the row identifier.
numerical_cols = df.select_dtypes(include='number').columns
filtered_cols = [col for col in numerical_cols if not col.startswith(('is_', 'has_'))]
# list.remove raises ValueError if the column is absent, same as before.
for excluded in ('labels', 'id'):
    filtered_cols.remove(excluded)

print(f"\nColunas numéricas para normalizar: {len(filtered_cols)}")

# Keep `df` untouched; all training-side transformations happen on a copy.
df_train = df.copy()

# Standardize the selected numeric columns.
print("\n" + "=" * 70, "NORMALIZANDO DADOS", "=" * 70, sep="\n")

# Fit the scaler on the training columns, then apply it to those same columns.
# The fitted `standard` object is reused later for the test set.
standard = StandardScaler()
standard.fit(df_train[filtered_cols])
df_train[filtered_cols] = standard.transform(df_train[filtered_cols])

print("✓ Dados normalizados com StandardScaler")

# Expand `category_code` into one indicator column per category (one-hot).
print("\n" + "=" * 70, "APLICANDO ONE-HOT ENCODING", "=" * 70, sep="\n")

# handle_unknown='ignore' lets the fitted encoder be applied later to data
# containing categories it did not see here.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_train = encoder.fit_transform(df_train[['category_code']])

# Wrap the dense matrix in a frame aligned on df_train's index so the
# column-wise concat below lines up row by row.
encoded_train_df = pd.DataFrame(
    encoded_train,
    index=df_train.index,
    columns=encoder.get_feature_names_out(['category_code']),
)

# Swap the raw categorical column for its indicator columns.
df_train = pd.concat(
    [df_train.drop('category_code', axis=1), encoded_train_df],
    axis=1,
)

print(f"✓ One-Hot Encoding aplicado: {encoded_train.shape[1]} novas colunas")

# Cap funding outliers with the IQR rule, train the MLP on the capped data,
# evaluate it on the training set, and prepare the test set with the same
# preprocessing steps in the same order.

# IQR bounds are computed on df_train as it stands here, i.e. after the
# scaling step above (if funding_total_usd is among filtered_cols, the bounds
# are in standardized units). The test set is clipped at the same stage below.
Q1 = df_train['funding_total_usd'].quantile(0.25)
Q3 = df_train['funding_total_usd'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_train_capped = df_train.copy()
df_train_capped['funding_total_usd'] = df_train_capped['funding_total_usd'].clip(
    lower=lower_bound, upper=upper_bound
)

print("✓ Outliers tratados (IQR method)")

# Split features and target from the CAPPED frame. Building them from
# df_train (as before) silently discarded the outlier treatment above.
X_train = df_train_capped.drop('labels', axis=1)
y_train = df_train_capped['labels']

print(f"\n✓ X_train: {X_train.shape}")
print(f"✓ y_train: {y_train.shape}")
print(f"✓ Taxa de positivos: {y_train.mean():.2%}")

# Features chosen for training.
features = [
    'age_first_funding_year',
    'age_last_milestone_year',
    'age_last_funding_year',
    'milestones',
    'funding_rounds',
    'funding_total_usd',
    'relationships',
    'is_ecommerce'
]

print("\n" + "=" * 70)
print("FEATURES SELECIONADAS")
print("=" * 70)
for i, feat in enumerate(features, 1):
    print(f"  {i}. {feat}")

# Train the MLP.
print("\n" + "=" * 70)
print("TREINANDO MODELO MLP")
print("=" * 70)

model_kaggle = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='tanh',
    solver='sgd',
    learning_rate_init=0.01,
    alpha=0.001,
    max_iter=500,
    random_state=18
)

# Echo of the hyperparameters passed above (kept in sync by hand).
print("Configuração do modelo:")
print("  Hidden layers: (100, 50)")
print("  Activation: tanh")
print("  Solver: sgd")
print("  Learning rate: 0.01")
print("  Alpha (L2): 0.001")
print("  Max iterations: 500")
print("  Random state: 18")

model_kaggle.fit(X_train[features], y_train)

print("\n✓ Modelo treinado!")

# Score on the training data itself. This is only a sanity check; it says
# nothing about performance on unseen data.
y_pred = model_kaggle.predict(X_train[features])
train_acc = accuracy_score(y_train, y_pred)
cm = confusion_matrix(y_train, y_pred)

print("\n" + "=" * 70)
print("RESULTADOS NO TREINO")
print("=" * 70)
print(f"Acurácia: {train_acc:.4f} ({train_acc*100:.2f}%)")
print("\nMatriz de Confusão:")
print(cm)
print(f"\nTrue Negatives:  {cm[0,0]}")
print(f"False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]}")
print(f"True Positives:  {cm[1,1]}")

# Prepare the test set.
print("\n" + "=" * 70)
print("PREPARANDO DADOS DE TESTE")
print("=" * 70)

df_final = pd.read_csv(TEST_PATH)
print(f"✓ Teste carregado: {len(df_final)} linhas")

# Fill nulls on the RAW values, before scaling, exactly as done for training.
# Filling after scaling put 0 in standardized space and copied already-scaled
# "first" ages into the "last" columns, which did not match the training
# preprocessing. Results are assigned back rather than using a chained
# `fillna(inplace=True)`, which does not modify the frame under pandas
# Copy-on-Write.
df_final['age_first_funding_year'] = df_final['age_first_funding_year'].fillna(0)
df_final['age_first_milestone_year'] = df_final['age_first_milestone_year'].fillna(0)
df_final['age_last_funding_year'] = df_final['age_last_funding_year'].fillna(df_final['age_first_funding_year'])
df_final['age_last_milestone_year'] = df_final['age_last_milestone_year'].fillna(df_final['age_first_milestone_year'])
print("✓ Valores nulos tratados no teste")

# Scale with the scaler fitted on the training data (transform only, no refit).
df_final[filtered_cols] = standard.transform(df_final[filtered_cols])
print("✓ Teste normalizado")

# Clip funding with the bounds derived from the training data so the model
# sees the same value range it was trained on.
df_final['funding_total_usd'] = df_final['funding_total_usd'].clip(
    lower=lower_bound, upper=upper_bound
)

# Apply the one-hot encoder fitted on the training data to the test set.
encoded_final = encoder.transform(df_final[['category_code']])
encoded_final_df = pd.DataFrame(
    encoded_final,
    columns=encoder.get_feature_names_out(['category_code']),
    index=df_final.index
)

df_final = pd.concat([df_final, encoded_final_df], axis=1)
df_final.drop('category_code', axis=1, inplace=True)
print("✓ One-Hot Encoding aplicado no teste")

# Predict on the prepared test set and write the submission file.
print("\n" + "=" * 70, "GERANDO PREDIÇÕES FINAIS", "=" * 70, sep="\n")

y_test_pred = model_kaggle.predict(df_final[features])

# Submission layout: the test ids followed by the predicted label.
submission = df_final[['id']].assign(labels=y_test_pred)
submission.to_csv('submissionGAP1.csv', index=False)

# Summary of the predicted class balance.
n_total = len(y_test_pred)
n_positive = y_test_pred.sum()
positive_rate = y_test_pred.mean()

print(f"✓ Predições geradas: {n_total}")
print(f"  Positivos: {n_positive} ({positive_rate:.1%})")
print(f"  Negativos: {n_total - n_positive} ({1 - positive_rate:.1%})")
print("\n✅ submissionGAP1.csv salvo com sucesso!")
# NOTE(review): the figure printed here is the accuracy measured on the
# training data (train_acc), not a held-out estimate.
print(f"\n🎯 Acurácia esperada no leaderboard: ~{train_acc:.1%}")
print("=" * 70)
