In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the first training table; it is later joined to data/train.csv on "graph_id".
train = pd.read_csv('data/train_pt_all.csv')

In [None]:
# Preview the first rows of the table.
train.head()

In [None]:
# Remove fully duplicated rows, then summarise the remaining columns.
train = train.drop_duplicates()
train.info()


In [None]:
# Number of missing values per column.
train.isna().sum()



In [None]:
import os

# Despite its name, folder_path holds the path of a single CSV file.
folder_path = 'data/train.csv'

# Second training table; joined with `train` on "graph_id" in a later cell.
graph = pd.read_csv(folder_path)
graph.head()

In [None]:
# Column summary of the second training table.
graph.info()

In [None]:
# Inner join: only graph_id values present in both tables are kept.
graph_final = train.merge(graph, on="graph_id", how="inner")

graph_final.info()




In [None]:
# Drop a hand-picked list of columns; the same list is dropped from the test table later.
# NOTE(review): the reason for excluding these particular columns is not recorded here.
graph_final = graph_final.drop(columns=['x_abs_mean', 'num_nodes_l2_above_1', 'out_degree_mean', 'feature_dim', 'x_min', 'x_l2_min', 'num_nodes_l2_below_0.1'])

In [None]:
def find_duplicate_columns(df):
    """
    Find columns whose content exactly duplicates an earlier column.

    Columns are compared pairwise, left to right, with ``Series.equals``
    (same values and dtype; NaNs in the same positions count as equal).

    :param df: DataFrame to inspect.
    :return: dict mapping each duplicated column label to the label of the
        first (left-most) column that holds the same data.
    """
    cols = df.columns
    duplicates = {}

    for i in range(len(cols)):
        # A column already flagged as a duplicate is never used as a reference:
        # anything equal to it was matched to its original on an earlier pass.
        if cols[i] in duplicates:
            continue
        # Positional access keeps this a Series even if labels are repeated.
        reference = df.iloc[:, i]
        for j in range(i + 1, len(cols)):
            if cols[j] in duplicates:
                continue
            if reference.equals(df.iloc[:, j]):
                duplicates[cols[j]] = cols[i]

    return duplicates

# Report columns of the merged training table that duplicate another column.
duplicates = find_duplicate_columns(graph_final)

print(duplicates)

In [None]:
# Columns remaining after the manual drop.
graph_final.columns

In [None]:
# Total number of missing values in the merged table.
graph_final.isna().sum().sum()

In [None]:
# From here on `graph` refers to the merged table, not the frame read from data/train.csv.
graph = graph_final 

In [None]:
# Heatmap of the pairwise correlations whose magnitude exceeds `threshold`.
# The first column is skipped positionally before computing correlations.
graph_no_id = graph.iloc[:, 1:]
corr_matrix = graph_no_id.corr()

threshold = 0.6
is_strong = corr_matrix.abs() > threshold
# Entries exactly equal to 1.0 (e.g. the diagonal) are masked out as well.
is_not_unit = corr_matrix != 1.0
strong_corr = (
    corr_matrix[is_strong & is_not_unit]
    .dropna(how="all", axis=0)   # rows with no strong correlation left
    .dropna(how="all", axis=1)   # columns with no strong correlation left
)

plt.figure(figsize=(10, 8))
sns.heatmap(strong_corr, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Matrice de corrélation (corrélations fortes uniquement)")
plt.show()

In [None]:
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor

def remove_high_vif(df, threshold=10):
    """
    Iteratively drop the variable with the highest variance inflation factor (VIF).

    On each pass the VIF of every remaining column is computed; while the
    largest one is greater than or equal to ``threshold`` that column is
    removed and the VIFs are recomputed on what is left.

    :param df: DataFrame containing only numeric variables. It is not modified;
        the work is done on a copy.
    :param threshold: VIF value at or above which a variable is considered redundant.
    :return: tuple ``(cleaned DataFrame, list of dropped column names in drop order)``.

    NOTE(review): the data is passed to ``variance_inflation_factor`` as is, with
    no intercept column added here; confirm that this is the intended design matrix.
    """
    df = df.copy()
    dropped = []

    # With fewer than two columns there is nothing to be collinear with; stopping
    # here also avoids calling idxmax on an empty VIF table.
    while df.shape[1] > 1:
        values = df.values  # built once per pass instead of once per column
        vif = pd.DataFrame()
        vif["Feature"] = df.columns
        vif["VIF"] = [variance_inflation_factor(values, i) for i in range(df.shape[1])]

        max_vif = vif["VIF"].max()
        # Written as `not (>=)` rather than `<` so that a NaN maximum
        # (every VIF undefined) also ends the loop instead of failing below.
        if not (max_vif >= threshold):
            break
        feature_to_drop = vif.loc[vif["VIF"].idxmax(), "Feature"]
        df = df.drop(columns=[feature_to_drop])
        dropped.append(feature_to_drop)

    return df, dropped

# Separate the first column (kept aside as the identifier) from all other columns.
graph_id, data = graph.iloc[:, 0], graph.iloc[:, 1:]

# VIF-based pruning runs on the non-identifier columns only.
graph_vif_cleaned, removed_columns = remove_high_vif(data)

# Put the identifier back as the leading column of the pruned table.
graph_vif_cleaned.insert(loc=0, column=graph_id.name, value=graph_id)

print(f"Variables supprimées : {removed_columns}")

In [None]:
# Preview of the VIF-pruned table (identifier first).
graph_vif_cleaned.head()

In [None]:
# Recompute the VIF of every column kept after pruning (identifier excluded).
graph_vif_cleaned_no_id = graph_vif_cleaned.iloc[:, 1:]

vif_final = pd.DataFrame({
    "Feature": graph_vif_cleaned_no_id.columns,
    "VIF": [
        variance_inflation_factor(graph_vif_cleaned_no_id.values, col_pos)
        for col_pos in range(graph_vif_cleaned_no_id.shape[1])
    ],
})

print(vif_final)

In [None]:
# Columns available for feature engineering.
graph_vif_cleaned.columns

In [None]:
# Feature engineering on the VIF-pruned training table. New columns are added
# in place; the raw columns listed at the end are then removed.
# NOTE: the final in-place drop means this cell cannot be run twice in a row.
_eps = 1e-6                # offset added to denominators and log arguments
_t = graph_vif_cleaned     # alias: every assignment below mutates graph_vif_cleaned

# Ratio features: numerator / (denominator + _eps), created in this order.
for _new, _num, _den in [
    ('ratio_calls_jumps', 'prop_type_call', 'prop_type_jmp'),
    ('ratio_returns_calls', 'prop_type_ret', 'prop_type_call'),
    ('ratio_invalid_inst', 'prop_type_invalid', 'prop_type_inst'),
    ('ratio_syscalls_calls', 'prop_text_syscall', 'prop_text_call'),
    ('ratio_push_pop', 'count_text_push', 'count_text_pop'),
    ('graph_compactness', 'density', 'nb_components'),
    ('avg_path_len', 'max_path_len', 'nb_components'),
    ('self_loop_ratio', 'has_self_loops', 'nb_components'),
    ('normalized_density', 'density', 'num_connected_components'),
]:
    _t[_new] = _t[_num] / (_t[_den] + _eps)

# Monotone transforms of single columns.
_t['log_density'] = np.log(_t['density'] + _eps)
_t['sqrt_max_path_len'] = np.sqrt(_t['max_path_len'])
_t['log_count_text_cmp'] = np.log(_t['count_text_cmp'] + _eps)
_t['inv_prop_text_sub'] = 1 / (_t['prop_text_sub'] + _eps)

# Interaction features.
_t['complexity_score'] = _t['max_path_len'] * _t['density']
# -sum(p * log(p + _eps)) over the add / xor / cmp proportions.
_terms = [
    _t[_col] * np.log(_t[_col] + _eps)
    for _col in ('prop_text_add', 'prop_text_xor', 'prop_text_cmp')
]
_t['instruction_entropy'] = -(_terms[0] + _terms[1] + _terms[2])
_control_flow = _t['prop_text_call'] + _t['prop_text_jmp'] + _t['prop_text_loop']
_t['flow_complexity'] = _control_flow * _t['max_path_len']

# Remove the raw columns listed here now that the derived features exist.
_t.drop(columns=['density', 'max_path_len', 'count_text_cmp', 'prop_text_sub'], inplace=True)

In [None]:
# Semicolon-separated table; its first column ("name") is used as the join key
# and the remaining columns as training labels further down.
folder_path = "data/training_set_metadata.csv"
metadata = pd.read_csv(folder_path, sep=";")
metadata.head()

In [None]:
# Column summary of the label table.
metadata.info()

In [None]:
# Test-set counterpart of data/train.csv.
folder_path = "data/test.csv"
graph_test = pd.read_csv(folder_path)
graph_test.head()

In [None]:
# Column summary of the test table.
graph_test.info()

In [None]:
# Test-set counterpart of data/train_pt_all.csv.
test = pd.read_csv('data/test_pt_all.csv')
test.info()



In [None]:
# Inner join of the two test tables on graph_id. `graph_test` is rebound to the
# merged result, so re-running this cell without reloading data/test.csv would
# merge the already-merged table again.
graph_test = test.merge(graph_test, on="graph_id", how="inner")

graph_test.info()



In [None]:
# Same column list as the one dropped from the merged training table.
graph_test = graph_test.drop(columns=['x_abs_mean', 'num_nodes_l2_above_1', 'out_degree_mean', 'feature_dim', 'x_min', 'x_l2_min', 'num_nodes_l2_below_0.1'])

# Report columns of the test table that duplicate another column.
duplicates = find_duplicate_columns(graph_test)
print(duplicates)



In [None]:
import numpy as np

# Same engineered features as for the training table, applied to the test table.
# NOTE: the final in-place drop means this cell cannot be run twice in a row.
eps_test = 1e-6        # offset added to denominators and log arguments
gt = graph_test        # alias: assignments below mutate graph_test itself

# Ratios between instruction statistics.
gt['ratio_calls_jumps'] = gt['prop_type_call'].div(gt['prop_type_jmp'].add(eps_test))
gt['ratio_returns_calls'] = gt['prop_type_ret'].div(gt['prop_type_call'].add(eps_test))
gt['ratio_invalid_inst'] = gt['prop_type_invalid'].div(gt['prop_type_inst'].add(eps_test))
gt['ratio_syscalls_calls'] = gt['prop_text_syscall'].div(gt['prop_text_call'].add(eps_test))
gt['ratio_push_pop'] = gt['count_text_push'].div(gt['count_text_pop'].add(eps_test))

# Graph-level quantities divided by component counts.
gt['graph_compactness'] = gt['density'].div(gt['nb_components'].add(eps_test))
gt['avg_path_len'] = gt['max_path_len'].div(gt['nb_components'].add(eps_test))
gt['self_loop_ratio'] = gt['has_self_loops'].div(gt['nb_components'].add(eps_test))
gt['normalized_density'] = gt['density'].div(gt['num_connected_components'].add(eps_test))

# Single-column transforms.
gt['log_density'] = np.log(gt['density'].add(eps_test))
gt['sqrt_max_path_len'] = np.sqrt(gt['max_path_len'])
gt['log_count_text_cmp'] = np.log(gt['count_text_cmp'].add(eps_test))
gt['inv_prop_text_sub'] = 1 / gt['prop_text_sub'].add(eps_test)

# Interaction features.
gt['complexity_score'] = gt['max_path_len'].mul(gt['density'])
add_term = gt['prop_text_add'].mul(np.log(gt['prop_text_add'].add(eps_test)))
xor_term = gt['prop_text_xor'].mul(np.log(gt['prop_text_xor'].add(eps_test)))
cmp_term = gt['prop_text_cmp'].mul(np.log(gt['prop_text_cmp'].add(eps_test)))
gt['instruction_entropy'] = -(add_term + xor_term + cmp_term)
control_flow_share = gt['prop_text_call'] + gt['prop_text_jmp'] + gt['prop_text_loop']
gt['flow_complexity'] = control_flow_share.mul(gt['max_path_len'])

# Remove the raw columns listed here now that the derived features exist.
gt.drop(columns=['density', 'max_path_len', 'count_text_cmp', 'prop_text_sub'], inplace=True)

In [None]:

# Spreadsheet listing the test samples to predict; its "name" column is the join key.
folder_path = "data/test_set_metadata_to_predict.xlsx"
csv_test = pd.read_excel(folder_path)
csv_test.head()

In [None]:
# Every metadata column after the first one is treated as a label.
label_columns = list(metadata.columns[1:])

# Join features and labels; the feature source here is the VIF-pruned table graph_vif_cleaned.
train_data = graph_vif_cleaned.merge(
    metadata,
    left_on="graph_id",
    right_on="name",
    how="inner",
)

# Features: everything except the two identifier columns and the labels.
X_train = train_data.drop(columns=['name', 'graph_id', *label_columns])
# Targets: the label columns only.
Y_train = train_data[label_columns]

In [None]:
# Join test features with the list of samples to predict.
test_data = graph_test.merge(csv_test, left_on="graph_id", right_on="name", how="inner")  # the test CSV is empty (original author's note)
# Only the two identifier columns are removed here; the feature columns are
# selected to match X_train in a later cell.
X_test = test_data.drop(columns=['name', 'graph_id'])

In [None]:
# Last rows of the training features.
X_train.tail()

In [None]:
# Last rows of the test features.
X_test.tail()

In [None]:
# Restrict the test features to the training feature columns, in the same order.
selected_columns = X_train.columns  

X_test = X_test[selected_columns]

In [None]:
# First rows of the aligned test features.
X_test.head()

In [None]:
# Total number of missing values in the test features.
X_test.isna().sum().sum()

In [None]:
# Last rows of the label matrix.
Y_train.tail()

In [None]:
# Total number of missing labels.
print(Y_train.isna().sum().sum())  
# How many label columns have 1, 2, ... distinct values.
print(Y_train.nunique().value_counts())

In [None]:
# Keep only the label columns that take more than one distinct value in training.
cols_utiles = Y_train.columns[Y_train.nunique() > 1] 
Y_train_reduit = Y_train[cols_utiles]

In [None]:
# First rows of the reduced label matrix.
Y_train_reduit.head()

In [None]:
# graph_id values of training samples to exclude from model fitting.
# NOTE(review): the reason these samples are excluded is not recorded here.
ids_a_supprimer = [
    'ba9b5ee8f7dfdef785d87a9884bed1ab10aee870e415ed9263454143a227c557',
    'cf0a36610c4e55d8e12493a810cabd97b236c4966f0e0e2d53f44b24edd402ae',
    'd536afd4f31af314ce3a9bbd2f12be6cbdcbd1f4432df5b44ca0ac32cfbf2fad',
    'd759b44ef4b0e86c75b73383138d578029997f5458f287049f6c3e7d8b5852b9',
    'f028f6c1a59e54652724775ed741f27601c5be955ada35d787e43b01851b9b5c',
    'f614a98a634cdd99bf0945c65ce041f891da4fb4f0aeee6e59397347793ab8a7',
    '0f348671f312ec237fc564bb19a1a726a2b9953462c09c19a3a3fb806b653dbd',
    '18267ba674c43afdac396f36c463a5d452b970d1d1587ff7deb9519361032a51',
    '189b2c1eee77be80ec244ef2217ebd75b189a9586bd17f7a8c02838b00bbd3ca',
    '24a03abe256526445082743bf8e743e83a6ae6a8e18d8205cf058f66c7614a5f',
    '3982cfb65ba487cec756b2a339f3bed97d60bf49004dc5da75c250a8fda09fff',
    '3bf0ec21f7b4ec29031d614fe3336188d76849ff3eac4b5d01b7b11b3c8eddef',
    '5898fc2b99ef703858c7bd86e5762ba1daa2db48a2222c60f5b15d70c9ee2b69',
    '6280877cc578a5a5c95c1401447a79b6108afb5b3adc5357c3195c48404a7dcf',
    '71cb2bbd36782f21a1444291fb90cd36120b16e9656a6fbfccd2781375858b51',
    '7d57f06c9413e1a6211686f5b30013a6b7b4bf13736241202ffeae1a8c936f4b',
    '817d2be09ed6d0496135d54016b6419ee4ad53ce929ad806f8159801ab09caf3',
    '8652e149c98b0811e65d9da9e0e6bb3e412b0f211b95e61ce6759f75426a4644',
    '8cbe77aaed72dff9f87eafd87ade3aff2059b537b3903bd25b47d7fdaf2c898a',
    '8cfcd9eba58d7488f34fd19c00e5a7ef6041f8e6c2024e420e5f5b8b34667a9c',
    '990729d7930dcde22c797f88caadb2b3a3c0f38da499fa20bccf94f41eb0fc2c',
    '99fd9e75e6241eff30e01c5b59df9e901fb24d12bee89c069cc6158f78b3cc98',
    '9bde8d342290b5c22bc98b462ffbbe9e765d689f690ee4774382840f172f6731'
]

# X_train no longer has a graph_id column and its index is the positional index
# produced by the merge, so filtering on X_train.index would match nothing.
# Look the ids up in train_data instead, which shares X_train's row index.
rows_to_remove = train_data.loc[X_train.index, 'graph_id'].isin(ids_a_supprimer)
X_train = X_train[~rows_to_remove]
# Keep the labels aligned with the remaining feature rows.
Y_train_reduit = Y_train_reduit.loc[X_train.index]

In [None]:
# Missing values per label column after row filtering.
Y_train_reduit.isna().sum()

In [None]:
# Distinct values present anywhere in the label matrix.
print(np.unique(Y_train_reduit))

In [None]:
# Total number of missing labels.
print(Y_train_reduit.isna().sum().sum())

In [None]:
# Cast every label column to integer before fitting the classifiers.
Y_train_reduit = Y_train_reduit.astype(int)

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
import pandas as pd

from sklearn.metrics import f1_score

# Base learner: one binary XGBoost classifier is fitted per label column
# through MultiOutputClassifier.
xgb = XGBClassifier(
    n_estimators=800,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    base_score=0.5
)

multi_xgb = MultiOutputClassifier(xgb, n_jobs=-1)

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Probability cut-off used to turn predicted probabilities into 0/1 labels.
threshold = 0.2

test_probs = []    # one (n_test, n_labels) probability matrix per fold
fold_scores = []   # held-out micro-F1 per fold, at `threshold`

for fold, (train_index, val_index) in enumerate(cv.split(X_train)):
    print(f"\n🧪 Fold {fold + 1}")

    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    Y_train_fold, Y_val_fold = Y_train_reduit.iloc[train_index], Y_train_reduit.iloc[val_index]

    # Refits multi_xgb from scratch; after the loop it holds the last fold's model.
    multi_xgb.fit(X_train_fold, Y_train_fold)

    # Score the held-out fold so the loop also gives a performance estimate.
    # predict_proba returns one (n_samples, 2) array per label; column 1 is P(label = 1).
    Y_val_proba = np.array([proba[:, 1] for proba in multi_xgb.predict_proba(X_val_fold)]).T
    Y_val_pred = (Y_val_proba >= threshold).astype(int)
    fold_f1 = f1_score(Y_val_fold.to_numpy(), Y_val_pred, average='micro', zero_division=0)
    fold_scores.append(fold_f1)
    print(f"Validation micro-F1 @ threshold {threshold}: {fold_f1:.4f}")

    Y_test_proba_list = multi_xgb.predict_proba(X_test)

    Y_test_proba = np.array([proba[:, 1] for proba in Y_test_proba_list]).T

    test_probs.append(Y_test_proba)

print(f"\nMean validation micro-F1: {np.mean(fold_scores):.4f}")

# Final test probabilities: average of the per-fold models' predictions.
avg_test_proba = np.mean(test_probs, axis=0)

Y_pred_reduit = (avg_test_proba >= threshold).astype(int)


In [None]:
import joblib

import os

feature_names = X_train.columns

# Average the importances over the per-label estimators of multi_xgb, i.e. of
# whichever fit was run last.
importances = np.mean(
    [est.feature_importances_ for est in multi_xgb.estimators_],
    axis=0
)

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

top_5_features = feature_importance_df['Feature'].head(5).tolist()

ft5_importance = X_train[top_5_features]

top_15_features = feature_importance_df.head(15)

# savefig / joblib.dump do not create missing directories.
os.makedirs('viz', exist_ok=True)
os.makedirs('model', exist_ok=True)

plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=top_15_features)
plt.title('Top 15 des caractéristiques les plus importantes (XGBoost - Moyenne multi-label)')
plt.tight_layout()
plt.savefig('viz/feature_importance.png', dpi=300)
plt.show()

# Persist the fitted multi-output model.
joblib.dump(multi_xgb, 'model/xgboost_multioutput_model.pkl')


In [None]:
# Hand-written feature list.
# NOTE(review): this is independent of the computed `top_5_features`; confirm the two agree.
top_features = ['count_text_push', 'count_type_ret', 'x_var_all', 'count_type_hlt', 'prop_text_push']
features_df = X_train[top_features].copy()
# X_train's index is positional (graph_id was dropped from its columns), so the
# ids are taken from train_data, which shares X_train's row index.
features_df['graph_id'] = train_data.loc[X_train.index, 'graph_id']

In [None]:
# Label table keyed by graph_id. rename() returns a new frame, so `metadata`
# keeps its "name" column, which other cells rely on as a join key.
target = metadata.rename(columns={'name': 'graph_id'})
target.head()

In [None]:
# Correlation between the selected features and the labels.
# Make the join key a string on both sides before merging.
for _frame in (features_df, target):
    _frame['graph_id'] = _frame['graph_id'].astype(str)

merged_df = features_df.merge(target, on='graph_id', how='inner')

# Full correlation matrix over every non-key column.
correlation_matrix = merged_df.drop(columns=['graph_id']).corr()

# Keep only the feature-rows x label-columns part of the matrix.
feature_cols = top_features
target_cols = [col for col in target.columns if not col == 'graph_id']
feature_target_corr = correlation_matrix.loc[feature_cols, target_cols]

# Rank labels by their mean absolute correlation with the features; keep six.
corr_abs = feature_target_corr.abs().mean(axis=0)
ranked_targets = corr_abs.sort_values(ascending=False)
top_targets = ranked_targets.index[:6].tolist()

subset_corr = feature_target_corr[top_targets]

plt.figure(figsize=(12, 8))
sns.heatmap(
    subset_corr,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
)
plt.title('Corrélation entre les 5 top features et les 6 targets les plus corrélées', fontsize=14)
plt.tight_layout()
plt.savefig('viz/correlation_features_targets.png', dpi=300)
plt.show()

print(subset_corr)

In [None]:
# Start from all-zero predictions for every label column, one row per test sample.
Y_pred_df = pd.DataFrame(0, columns=Y_train.columns, index=range(len(X_test)))

# Fill in the columns that were actually modelled; the others stay at 0.
Y_pred_df[cols_utiles] = Y_pred_reduit

In [None]:
import pandas as pd

# Attach the sample names. Guarded so that re-running the cell does not raise
# "cannot insert name, already exists".
if 'name' not in Y_pred_df.columns:
    Y_pred_df.insert(0, 'name', test_data['name'])


# Order of the samples expected in the output file.
ids_df = pd.read_excel("data/test_set_metadata_to_predict.xlsx")

# Reorder predictions to follow ids_df; names without a prediction become NaN rows.
sorted_df = Y_pred_df.set_index('name').reindex(ids_df['name']).reset_index()

# Samples without a prediction get 0 for every label.
sorted_df = sorted_df.fillna(0)

# reindex + fillna turns the 0/1 columns into floats when rows were missing;
# restore integer labels for the export.
prediction_cols = [col for col in sorted_df.columns if col != 'name']
sorted_df[prediction_cols] = sorted_df[prediction_cols].astype(int)

# Manual override: these samples are forced to peexe = 1.
# NOTE(review): the justification for this override is not recorded here.
target_hashes = {
    "e95ee48cdeea99e4d56f3325e220a0e7274dd65bf5ef9e3028ecb628b1e86166",
    "d873e0097be4144f1b23e3d932587a18d5600d8d64071d53763d27cafe58f8e8",
    "56942bae98643d92b1036edd5e882147efc4a29690a41d457a68ad32f9cc992c"
}

sorted_df.loc[sorted_df["name"].isin(target_hashes), "peexe"] = 1

sorted_df.to_excel("final_pred.xlsx", index=False)
