# Sistema de Detecção de Equivalência de Disciplinas

## 1. Imports e Configuração

In [None]:
# Realiza imports necessários
pip install -r requirements.txt

## 2. Carregamento de Dados

In [None]:
# Load the discipline catalog (training data).
# It is bound to `catalog_df` because that is the name the data-preparation
# cell reads when it parses the 'TPEI' column.
catalog_df = pd.read_csv('catalogo_disciplinas.csv')
# Load the labeled data.
labeled_data = pd.read_csv('dados_rotulados.csv')

## 3. Preparação de Dados

In [None]:
# Parse TPEI strings
def parse_tpei(tpei_str):
    """Convert a dash-separated TPEI string into a dict of integers.

    The string is split on '-' and its first four components are converted
    with ``int`` and stored, in order, under the keys 'T', 'P', 'E' and 'I'.
    For example, "2-0-0-2" yields {'T': 2, 'P': 0, 'E': 0, 'I': 2}.
    Components past the fourth are ignored.
    """
    parts = tpei_str.split('-')
    # Iterating 'TPEI' keeps the key order and converts one component per key.
    return {key: int(parts[position]) for position, key in enumerate('TPEI')}

# Apply parse_tpei to every value of the 'TPEI' column and store the
# resulting dicts in a new 'TPEI_parsed' column.
catalog_df['TPEI_parsed'] = catalog_df['TPEI'].apply(parse_tpei)

# TODO: clean and normalize the data
# TODO: create an integer TPEI variable
# TODO: build the prerequisite relationship graph
# TODO: prepare data structures for the filtering pipeline
# TODO: create validation splits for model training

## 4. Filtro TPEI

In [None]:
# Define TPEI threshold value
# Filter discipline pairs based on TPEI criteria
# Exclude pairs where TPEI value is below threshold
# Log excluded pairs for tracking
# Return filtered candidate pairs

## 5. Filtro de Pré-requisitos

In [None]:
# Create DiGraph from prerequisite relationships
# Generate node2vec embeddings for disciplines
# Calculate cosine distance between discipline embeddings
# Apply prerequisite similarity threshold
# Filter pairs based on prerequisite similarity

## 6. Cálculo de Score TPEI + Pré-requisitos

In [None]:
# Build the feature matrix
def prepare_combined_features(discipline_pairs, tpei_diff, prereq_similarities):
    """Build the feature matrix for the combined TPEI + prerequisite model.

    Args:
        discipline_pairs: Iterable of pair keys. Each key is looked up in
            ``prereq_similarities``, so it must be hashable.
        tpei_diff: TPEI feature value, placed unchanged in the first column
            of every row.
        prereq_similarities: Mapping from a pair key to its prerequisite
            similarity. Pairs missing from the mapping get 0.0.

    Returns:
        A NumPy array with one ``[tpei_diff, prereq_sim]`` row per pair, in
        the order the pairs were given.
    """
    # NOTE(review): the same tpei_diff value is written into every row;
    # confirm with the caller whether a per-pair lookup was intended.
    return np.array([
        [tpei_diff, prereq_similarities.get(pair, 0.0)]
        for pair in discipline_pairs
    ])


# TODO: train the CatBoost model on labeled_data (Larissa's code, still
# pending). The code below expects that step to define `model_catboost`,
# `X_val` and `y_val`; `candidate_pairs`, `tpei_diff` and
# `prereq_similarities` must come from the earlier filter cells.



# Optimize the decision threshold by maximizing F1 on the validation split
from sklearn.metrics import precision_recall_curve

# Probability of the positive class for each validation sample.
y_pred_proba = model_catboost.predict_proba(X_val)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
# The small epsilon avoids a division by zero when precision + recall == 0.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
# precision/recall have one more entry than thresholds (the final point has
# no matching threshold), so the last F1 value is excluded from the argmax
# to keep the index valid for `thresholds`.
optimal_threshold = thresholds[np.argmax(f1_scores[:-1])]

# Compute the combined score of each candidate pair
all_features = prepare_combined_features(candidate_pairs, tpei_diff, prereq_similarities)
combined_scores = model_catboost.predict_proba(all_features)[:, 1]

# Keep only the pairs whose score reaches the optimal threshold
filtered_pairs = [
    pair for pair, score in zip(candidate_pairs, combined_scores)
    if score >= optimal_threshold
]

## 7. Filtro de Ementa

In [None]:
# Load BERTimbau pre-trained model
# Generate embeddings for ementa texts
# Apply node2vec to ementa embeddings
# Calculate cosine distance between ementa embeddings
# Filter based on semantic similarity threshold

## 8. Cálculo de Score Final

In [None]:
# Prepare the feature matrix
# Train SVM with RBF kernel on 'valid pair' data
# Combine all features (TPEI, prerequisites, ementa)
# Calculate overall equivalence score
# Apply final threshold for equivalence classification
# Generate list of equivalent discipline pairs

## 9. Explicação dos Resultados

In [None]:
# Initialize SHAP explainer for SVM model
# Calculate SHAP values for each prediction
# Generate feature importance rankings
# Create individual prediction explanations
# Prepare text explanations for results

## 10. Visualizações dos Resultados

### 10.1 Visualização do Grafo de Pré-requisitos

In [None]:
# Create DiGraph visualization showing prerequisite relationships
# Color nodes based on equivalence status
# Highlight connected discipline pairs
# Add node labels and edge weights
# Save graph visualization

### 10.2 SHAP Summary Plots

In [None]:
# Generate SHAP summary plot for feature importance
# Create dependence plots for key features
# Visualize force plots for individual predictions
# Export plots for presentation

### 10.3 Matrizes de Confusão

In [None]:
# Calculate confusion matrix for model performance
# Create heatmap visualization of confusion matrix
# Add precision, recall, and F1 scores
# Generate performance metrics report

## 11. Exportação de Resultados

In [None]:
# Save filtered results to TSV file
# Export model performance metrics
# Save visualizations in specified formats
# Generate final summary report