# Treinamento Básico - GradientBoostingClassifier
Este notebook treina um modelo simples usando o arquivo `features_v2.csv`.

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle


def hex_to_rgb(h):
    """Convert a ``#RRGGBB`` colour string into a list ``[r, g, b]`` of ints.

    The value is stringified, surrounding whitespace is stripped and any
    leading ``#`` characters are removed. Anything that is not exactly six
    hexadecimal digits afterwards (missing values, short forms such as
    ``#fff``, arbitrary text) maps to the fallback ``[0, 0, 0]``.

    Args:
        h: The colour value; any object is accepted.

    Returns:
        A list of three ints in the range 0-255.
    """
    fallback = [0, 0, 0]
    try:
        text = str(h).strip().lstrip('#')
    except Exception:
        # Best-effort contract: a value that cannot even be stringified is
        # treated like any other unparsable colour.
        return fallback

    # Validate the digits explicitly: int(..., 16) also accepts signs and
    # surrounding whitespace, so "+f-100" would otherwise parse to
    # [15, -1, 0] instead of being rejected.
    if len(text) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in text):
        return fallback

    return [int(text[i:i + 2], 16) for i in (0, 2, 4)]

# Load the feature table produced by the extraction step.
df = pd.read_csv('/home/kennedy/sistemas-inteligentes-rg/features_v2.csv')

# Status output (the mis-encoded emoji in the original literals is repaired).
print("🟢 Linhas carregadas:", len(df))
print("🟢 Colunas:", df.columns.tolist())

# Remove columns that are not used as model inputs. errors="ignore" already
# skips names that are absent, so no pre-filtering of the list is needed.
drop_cols = ['imagem', 'tem_qrcode', 'tem_digital']
df = df.drop(columns=drop_cols, errors="ignore")


# Split the table into the target column and the predictor columns.
y = df['classe']
X = df.drop(columns=['classe'])

hex_cols = ['color1_hex', 'color2_hex', 'color3_hex']

# Expand each hex colour column that is present into three numeric columns
# named <col>_r, <col>_g and <col>_b.
for col in hex_cols:
    if col not in X.columns:
        continue
    rgb = np.array(X[col].apply(hex_to_rgb).tolist())
    for idx, channel in enumerate('rgb'):
        X[f'{col}_{channel}'] = rgb[:, idx]

# The textual hex columns are no longer needed once expanded.
X = X.drop(columns=[c for c in hex_cols if c in X.columns], errors="ignore")

# Replace any remaining missing values with zero.
X = X.fillna(0)


# Map the class labels to integer codes; the fitted encoder is kept in `le`
# so the codes can be translated back to the original labels later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# Hold out 25% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Gradient-boosted trees with a fixed seed, wrapped in a single-step pipeline
# under the step name 'clf'.
gb_params = {
    'learning_rate': 0.05,
    'n_estimators': 300,
    'max_depth': 3,
    'subsample': 0.9,
    'random_state': 42,
}
model = Pipeline(steps=[('clf', GradientBoostingClassifier(**gb_params))])

# Train on the training portion of the split.
model.fit(X_train, y_train)


# Predict on the held-out rows and report accuracy, the per-class report and
# the confusion matrix. The mis-encoded accented characters in the original
# Portuguese messages are repaired.
y_pred = model.predict(X_test)

print(f'Acurácia: {accuracy_score(y_test, y_pred)*100:.2f}%')
print('\nRelatório de Classificação:')
print(classification_report(y_test, y_pred))
print('\nMatriz de Confusão:')
print(confusion_matrix(y_test, y_pred))

# Persist the fitted pipeline and the label encoder. The context managers
# guarantee that each file is flushed and closed even if pickling fails.
with open('modelo_rg.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)


游릭 Linhas carregadas: 1968
游릭 Colunas: ['imagem', 'classe', 'tem_qrcode', 'tem_digital', 'color1_hex', 'color2_hex', 'color3_hex', 'tem_nome', 'tem_nome_social', 'tem_nacionalidade', 'qtd_palavras', 'qtd_numeros', 'densidade_texto', 'brilho_medio', 'contraste', 'ratio_w_h', 'ratio_h_w', 'bordas', 'area_foto', 'hist_0', 'hist_1', 'hist_2', 'hist_3', 'hist_4', 'hist_5', 'hist_6', 'hist_7', 'hist_8', 'hist_9', 'hist_10', 'hist_11', 'hist_12', 'hist_13', 'hist_14', 'hist_15', 'hist_16', 'hist_17', 'hist_18', 'hist_19', 'hist_20', 'hist_21', 'hist_22', 'hist_23', 'hist_24', 'hist_25', 'hist_26', 'hist_27', 'hist_28', 'hist_29', 'hist_30', 'hist_31', 'hist_32', 'hist_33', 'hist_34', 'hist_35', 'hist_36', 'hist_37', 'hist_38', 'hist_39', 'hist_40', 'hist_41', 'hist_42', 'hist_43', 'hist_44', 'hist_45', 'hist_46', 'hist_47', 'hist_48', 'hist_49', 'hist_50', 'hist_51', 'hist_52', 'hist_53', 'hist_54', 'hist_55', 'hist_56', 'hist_57', 'hist_58', 'hist_59', 'hist_60', 'hist_61', 'hist_62', 'hist_