In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Load the dataset
file_path = r"C:\Users\sosso\Machine Learning\Projet\Phishing_Email.csv"
df = pd.read_csv(file_path)

# Normalise the column names (strip surrounding whitespace)
df.columns = df.columns.str.strip()

# Drop the leftover index column; errors="ignore" makes this a no-op if absent
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Drop rows where "Email Text" is NaN
df = df.dropna(subset=["Email Text"])

# Encode the target column "Email Type" as 0/1
df["Email Type"] = df["Email Type"].map({
    "Safe Email": 0,
    "Phishing Email": 1
})

# .map() yields NaN for any label outside the two keys above; such rows
# cannot be used as classification targets, so drop them and keep ints.
df = df.dropna(subset=["Email Type"])
df["Email Type"] = df["Email Type"].astype(int)

# Check the class distribution
print("Distribution des classes :")
print(df["Email Type"].value_counts())

# Separate raw text features and target
X_text = df["Email Text"]
y = df["Email Type"]

# Split the RAW text first so that the vectorizer never sees the test emails.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorisation: vocabulary and IDF weights are learned on the
# training text only, then applied unchanged to the test text. Fitting on
# the full corpus before splitting would leak test-set statistics.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(X_text_train)
X_test = tfidf.transform(X_text_test)

# Full-corpus matrix, kept under its original name `X`; it is produced with
# the train-fitted vectorizer, so it carries no test-set information either.
X = tfidf.transform(X_text)

In [None]:
# Baseline classifiers, keyed by display name; every one is seeded with 42.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=2000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True, random_state=42)),
])

# Fit each baseline on the training split and report its test-split metrics.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

    # One print call with sep="\n" emits exactly one item per line; the
    # trailing "\n" item plus print's own newline leaves two blank lines.
    print(
        f"--- {name} ---",
        f"Accuracy: {acc:.4f}",
        f"F1-score: {f1:.4f}",
        "Confusion Matrix:",
        confusion_matrix(y_test, y_pred),
        "Classification Report:",
        classification_report(y_test, y_pred),
        "\n",
        sep="\n",
    )

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search parameter grids, keyed by the same names as `models`
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs']
    },
    'Decision Tree': {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
    },
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [10, 20, None],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'probability': [True]
    },
}

# Tuned estimators keyed by model name. This must be a dict (not a list):
# it is looked up with `best_models[name]` where `name` is a string.
best_models = {}
# One summary record per model, turned into `results_df` below.
results = []

for name, model in models.items():
    print(f"\n Grid Search {name}...")

    # 3-fold cross-validated grid search on the TF-IDF training matrix
    grid = GridSearchCV(model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Keep the refitted best estimator so it can be reused by name later.
    best_model = grid.best_estimator_
    best_models[name] = best_model

    # Evaluate the tuned estimator on the held-out test split
    y_pred = best_model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'Accuracy': acc,
        'F1': f1
    })

    print(f"    Accuracy: {acc:.4f} | F1-score: {f1:.4f}")

results_df = pd.DataFrame(results)
print("\nRésumé Grid Search :")
print(results_df)


In [None]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier

# The three models with the highest 'Accuracy' in results_df.
# NOTE(review): that column is computed on the test split, so this selection
# (and `best_name` below) is made using test data.
top_3 = results_df.nlargest(3, 'Accuracy')['Model'].tolist()
# (name, estimator) pairs; best_models is indexed by model name here.
estimators = [(model_name, best_models[model_name]) for model_name in top_3]

# Soft-voting ensemble over the selected estimators
voting = VotingClassifier(estimators=estimators, voting='soft')
y_pred_voting = voting.fit(X_train, y_train).predict(X_test)
acc_voting = accuracy_score(y_test, y_pred_voting)
print(f" Voting: {acc_voting:.4f}")

# Bagging (20 estimators) built on the single most accurate model
best_name = results_df.at[results_df['Accuracy'].idxmax(), 'Model']
bagging = BaggingClassifier(best_models[best_name], n_estimators=20, random_state=42)
y_pred_bagging = bagging.fit(X_train, y_train).predict(X_test)
acc_bagging = accuracy_score(y_test, y_pred_bagging)
print(f" Bagging: {acc_bagging:.4f}")

# Stacking: same base estimators, logistic-regression meta-learner, 3-fold CV
meta_learner = LogisticRegression(max_iter=1000, random_state=42)
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=3
)
y_pred_stacking = stacking.fit(X_train, y_train).predict(X_test)
acc_stacking = accuracy_score(y_test, y_pred_stacking)
print(f" Stacking: {acc_stacking:.4f}")


In [None]:
from sklearn.metrics import f1_score

# (display name, accuracy, predictions) for each ensemble method
ensemble_runs = [
    ('Voting', acc_voting, y_pred_voting),
    ('Bagging', acc_bagging, y_pred_bagging),
    ('Stacking', acc_stacking, y_pred_stacking),
]

# One row per ensemble; F1 is computed here from the stored predictions
ensemble_df = pd.DataFrame([
    {'Model': label, 'Accuracy': accuracy, 'F1': f1_score(y_test, preds)}
    for label, accuracy, preds in ensemble_runs
])

# Stack the grid-search rows on top of the ensemble rows
base_df = results_df[['Model', 'Accuracy', 'F1']]
final = pd.concat([base_df, ensemble_df], ignore_index=True)

# Show the full table
print(final.to_string(index=False))

# Persist the table as CSV
final.to_csv('results.csv', index=False)
print("\n Tableau sauvegardé: results.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Figure 1: accuracy of every model as a bar chart.
# One colour for the base models, then one per ensemble method.
colors = ['skyblue'] * len(models) + ['orange', 'green', 'red']
plt.figure(figsize=(12, 5))
plt.bar(final['Model'], final['Accuracy'], color=colors, alpha=0.7)
plt.title('Comparaison des Modèles')
plt.ylabel('Accuracy')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('comparison.png', dpi=300)  # saved before show()
print(" comparison.png")
plt.show()
plt.close()

# Figure 2: confusion matrices for the best tuned model and the 3 ensembles.
# Insertion order fixes the panel order: best model first, then ensembles.
predictions = {best_name: best_models[best_name].predict(X_test)}
predictions.update(
    Voting=y_pred_voting,
    Bagging=y_pred_bagging,
    Stacking=y_pred_stacking,
)

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.ravel()  # flatten the 2x2 grid so panels can be walked in order

for ax, (name, pred) in zip(axes, predictions.items()):
    sns.heatmap(
        confusion_matrix(y_test, pred),
        annot=True, fmt='d', cmap='Blues', ax=ax, cbar=False,
    )
    ax.set(title=name, xlabel='Predicted', ylabel='True')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300)
print(" confusion_matrices.png")
plt.show()
plt.close()


In [None]:
# Conclusion: report the row of `final` with the highest accuracy
best_overall = final.loc[final['Accuracy'].idxmax()]

separator = "=" * 60
print("\n" + separator)
print(" MEILLEUR MODÈLE")
print(separator)

# Name and headline metrics of the winning row
print(f"Nom: {best_overall['Model']}")
print(f"Accuracy: {best_overall['Accuracy']:.4f}")
print(f"F1-Score: {best_overall['F1']:.4f}")

# List of artefacts written by the earlier cells
print("\n Fichiers générés:")
for generated_line in (
    "   - results.csv",
    "   - comparison.png",
    "   - confusion_matrices.png",
):
    print(generated_line)