2. Há alguma influência na corretude de um nome científico registrado para 
uma planta em relação à quem a identificou em primeiro lugar? 

In [6]:
import pandas as pd
import random
from rapidfuzz import process, fuzz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
import json
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import mysql.connector

In [7]:
FILE_PATH = "registros_biodiversidade_ARRUMADO.parquet"
df = pd.read_parquet(FILE_PATH)[["scientificname", "identifiedby_att"]].dropna()

df.head()


Unnamed: 0,scientificname,identifiedby_att
0,Peperomia simulans,T. G. Yuncker
1,Piper anisum,"Callejas, R"
2,Piper anisum,"Callejas, R"
3,Piper arboreum,"Carvalho-Silva, M"
4,Piper corcovadense,"Callejas, R"


In [8]:
def normalize(name):
    return " ".join(name.strip().split())

df["nome_limpo"] = df["scientificname"].apply(normalize)
unique_names = df["nome_limpo"].unique().tolist()

accepted_map = {}
for name in unique_names:
    matches = process.extract(name, unique_names, scorer=fuzz.WRatio, limit=50)
    close = [m[0] for m in matches if m[1] >= 90] # similaridade >=90%
    best = df[df["nome_limpo"].isin(close)]["nome_limpo"].value_counts().idxmax()
    accepted_map[name] = best

df["nome_real"] = df["nome_limpo"].map(accepted_map)
df["correto?"] = (df["nome_limpo"] == df["nome_real"]).astype(int)

In [9]:
X = df[["identifiedby_att"]]
y = df["correto?"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=random.randint(0,10000)
)

In [10]:
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["identifiedby_att"])
])

In [11]:
models = {
    "regressao logistica": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "random forest": RandomForestClassifier(n_estimators=400, random_state=42, class_weight="balanced")
}

results = {}

for name, clf in models.items():
    pipe = Pipeline([
        ("prep", preprocess),
        ("clf", clf)
    ])
    
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    prob = pipe.predict_proba(X_test)[:,1]  # probabilidade de correto

    f1 = f1_score(y_test, pred, average="macro")
    print(f"\n{name} - acurácia: {f1:.4f}")
    print(classification_report(y_test, pred))
    
    # salvar probabilidades no dataframe de teste
    df_test = X_test.copy()
    df_test["correto?"] = y_test
    df_test["probabilidade_corretude"] = prob
    df_test["nome_real"] = df.loc[X_test.index, "nome_real"].values
    df_test["scientificname"] = df.loc[X_test.index, "scientificname"].values
    
    # salvar CSV final
    df_test.to_csv(f"scientificname_corrigido_{name.replace(' ','_')}.csv", index=False)
    
    results[name] = f1

for k, v in results.items():
    print(f"{k}: {v:.4f}")


regressao logistica - acurácia: 0.5779
              precision    recall  f1-score   support

           0       0.35      0.61      0.44      1292
           1       0.83      0.62      0.71      3947

    accuracy                           0.62      5239
   macro avg       0.59      0.62      0.58      5239
weighted avg       0.71      0.62      0.65      5239


random forest - acurácia: 0.5779
              precision    recall  f1-score   support

           0       0.35      0.61      0.44      1292
           1       0.83      0.62      0.71      3947

    accuracy                           0.62      5239
   macro avg       0.59      0.62      0.58      5239
weighted avg       0.71      0.62      0.65      5239

regressao logistica: 0.5779
random forest: 0.5779


In [22]:
# grid-search para regressao logistica

log_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

param_grid_log = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__solver": ["lbfgs"],
    "clf__penalty": ["l2"]
}

grid_log = GridSearchCV(
    log_pipe,
    param_grid_log,
    cv=5,
    n_jobs=-1,
    scoring="f1_macro"
)

grid_log.fit(X_train, y_train)

print("\nmelhores parametros para regressao logistica:")
print(grid_log.best_params_)

log_pred = grid_log.predict(X_test)
print("\nregressão logistica com parâmetros do grid search:")
print(classification_report(y_test, log_pred))

f1_log_tuned = f1_score(y_test, log_pred, average="macro")
print(f"\nnovo f1 score: {f1_log_tuned:.4f}")


melhores parametros para regressao logistica:
{'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}

regressão logistica com parâmetros do grid search:
              precision    recall  f1-score   support

           0       0.36      0.57      0.44      1292
           1       0.83      0.67      0.74      3947

    accuracy                           0.65      5239
   macro avg       0.59      0.62      0.59      5239
weighted avg       0.71      0.65      0.67      5239


novo f1 score: 0.5915


In [23]:
# grid search para random forest

rf_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
])

param_dist_rf = {
    "clf__n_estimators": [200, 300, 400, 500],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4]
}

rand_rf = RandomizedSearchCV(
    rf_pipe,
    param_dist_rf,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    scoring="f1_macro",
    random_state=42
)

rand_rf.fit(X_train, y_train)

print("\nmelhores parametros para random forest")
print(rand_rf.best_params_)

rf_pred = rand_rf.predict(X_test)
print("\regressão logistica com parâmetros do random forest")
print(classification_report(y_test, rf_pred))

f1_rf_tuned = f1_score(y_test, rf_pred, average="macro")
print(f"\nnovo f1 score: {f1_log_tuned:.4f}")



melhores parametros para random forest
{'clf__n_estimators': 200, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 2, 'clf__max_depth': 5}
egressão logistica com parâmetros do random forest
              precision    recall  f1-score   support

           0       0.37      0.53      0.44      1292
           1       0.82      0.71      0.76      3947

    accuracy                           0.66      5239
   macro avg       0.60      0.62      0.60      5239
weighted avg       0.71      0.66      0.68      5239


novo f1 score: 0.5915


In [24]:
print("\ncomparação F1 - com e sem grid search:")
print(f"baseline regressao logistica:     {results['regressao logistica']:.4f}")
print(f"gridsearch regressao logistica:       {f1_log_tuned:.4f}")
print(f"baseline random forest:         {results['random forest']:.4f}")
print(f"gridsearch random forest:           {f1_rf_tuned:.4f}")



comparação F1 - com e sem grid search:
baseline regressao logistica:     0.5779
gridsearch regressao logistica:       0.5915
baseline random forest:         0.5779
gridsearch random forest:           0.5971


In [9]:
rf_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42, class_weight="balanced"))
])
rf_pipe.fit(X, y)
prob_all = rf_pipe.predict_proba(X)[:,1]

df_stats = df.copy()
df_stats["probabilidade_corretude"] = prob_all

summary = df_stats.groupby("identifiedby_att")["probabilidade_corretude"].mean().reset_index()
summary = summary.sort_values("probabilidade_corretude")
print("\nmedia de probabilidade de estar correto por identificador:")
print(summary)

summary.to_csv("identifiedby_corretude_sumario.csv", index=False)
print(f"\nesses valores foram salvos em: identifiedby_corretude_sumario.csv")


media de probabilidade de estar correto por identificador:
            identifiedby_att  probabilidade_corretude
8         R. Callejas Posada                 0.342680
9              T. G. Yuncker                 0.354841
0                Callejas, R                 0.378988
3  George Azevedo de Queiroz                 0.452532
7               Marcusso, GM                 0.503179
4              Guimarães, EF                 0.550407
2                D. Monteiro                 0.640110
1          Carvalho-Silva, M                 0.650928
5          M. Carvalho-Silva                 0.660436
6          Machado-Silva, T.                 0.923316

esses valores foram salvos em: identifiedby_corretude_sumario.csv


In [10]:
df_final = df[["scientificname", "identifiedby_att", "correto?", "nome_real"]]
df_final.to_csv("scientificname_corrigido.csv", index=False)
print("\nCSV com nomes atualizados: scientificname_corrigido.csv")


CSV com nomes atualizados: scientificname_corrigido.csv


In [17]:
# 4.2.2 feature engineering da pergunta 2

# número total de identificações feitas por cada identificador
ident_counts = df["identifiedby_att"].value_counts().rename("identifications_count")
df["identifications_count"] = df["identifiedby_att"].map(ident_counts)

# Atualizar X para incluir a nova variável
X = df[["identifiedby_att", "identifications_count"]]

# split com a nova variável
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Incluir a coluna numérica
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["identifiedby_att"]),
    ("num", StandardScaler(), ["identifications_count"])
])