In [2]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, LeaveOneGroupOut


In [3]:
# Location of the descriptor table; replace with your own data file.
data_file = '../data/ts_descriptors_with_cycles.csv'
df = pd.read_csv(data_file)


In [4]:
# Label: the `is_causal` column (its sum is reported below as the causal-edge count).
y = df['is_causal']

# Features: every remaining column once the identifier-like columns
# (presumably graph / edge endpoints -- verify against the CSV) and the label are removed.
non_feature_columns = ['graph_id', 'edge_source', 'edge_dest', 'is_causal']
X = df.drop(columns=non_feature_columns)

In [5]:
# Dataset summary: sample/feature counts, class balance, and number of graphs.
n_causal = y.sum()
dataset_stats = [
    ('Number of samples:', X.shape[0]),
    ('Number of features:', X.shape[1]),
    ('Number of causal edges:', n_causal),
    ('Number of non-causal edges:', len(y) - n_causal),
    # dropna=False keeps the count identical to len(unique()), which counts NaN as a value.
    ('Number of graphs:', df['graph_id'].nunique(dropna=False)),
]
for stat_label, stat_value in dataset_stats:
    print(stat_label, stat_value)


Number of samples: 12105
Number of features: 92
Number of causal edges: 3105
Number of non-causal edges: 9000
Number of graphs: 1500


In [9]:
# Leave-one-graph-out evaluation of a balanced random forest.
import warnings

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

SCORE_COLUMNS = ['accuracy_train', 'f1_train', 'auc_train',
                 'accuracy_test', 'f1_test', 'auc_test']
MAX_FOLDS = 20  # only the first MAX_FOLDS held-out graphs are evaluated


def _split_metrics(model, X_part, y_part):
    """Score a fitted binary classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_part : feature rows of the split.
    y_part : true labels of the split.

    Returns
    -------
    list
        ``[accuracy, f1, roc_auc]``. ROC AUC uses the probability in column 1
        of ``predict_proba`` as the score.
    """
    predicted = model.predict(X_part)  # predicted once, reused for accuracy and F1
    positive_proba = model.predict_proba(X_part)[:, 1]
    return [
        accuracy_score(y_part, predicted),  # same value as model.score(X_part, y_part)
        f1_score(y_part, predicted),
        roc_auc_score(y_part, positive_proba),
    ]


# Each fold holds out all rows of one graph_id and trains on the rest.
logo = LeaveOneGroupOut()
groups = df['graph_id']

# Only the first half of the feature columns is used for training and testing.
# The slice does not depend on the fold, so it is taken once up front.
# NOTE(review): the cut is positional and therefore depends on the CSV column
# order -- confirm which descriptor subset this is meant to select.
n_used_features = len(X.columns) // 2
X_used = X.iloc[:, :n_used_features]

# The plain RandomForestClassifier baseline is not run, so rf_scores stays empty.
# To enable it, fit RandomForestClassifier(n_estimators=100, max_depth=10,
# random_state=0) inside the loop and collect _split_metrics rows the same way.
rf_scores = pd.DataFrame(columns=SCORE_COLUMNS)

brf_records = []
for fold, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
    if fold >= MAX_FOLDS:
        break

    # The splitter yields positional indices, so index by position (.iloc);
    # plain y[...] is label-based and only agrees while y has a default RangeIndex.
    X_train, X_test = X_used.iloc[train_index], X_used.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    brf = BalancedRandomForestClassifier(n_estimators=100, max_depth=10, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train)

    # NOTE(review): a held-out graph whose edges all share one class makes ROC AUC
    # undefined -- confirm how the installed scikit-learn handles that case.
    brf_records.append(
        _split_metrics(brf, X_train, y_train) + _split_metrics(brf, X_test, y_test)
    )

# One row per evaluated fold, built in a single step instead of row-by-row growth.
brf_scores = pd.DataFrame(brf_records, columns=SCORE_COLUMNS)

In [10]:
# Show the per-fold train/test accuracy, F1 and AUC of the balanced random forest.
brf_scores

Unnamed: 0,accuracy_train,f1_train,auc_train,accuracy_test,f1_test,auc_test
0,0.612167,0.543224,0.807799,0.714286,0.5,0.75
1,0.608746,0.539994,0.811599,0.5,0.5,0.833333
2,0.605737,0.539361,0.808259,0.555556,0.333333,0.472222
3,0.610895,0.541719,0.809753,0.375,0.285714,0.458333
4,0.611474,0.541821,0.812117,0.5,0.5,0.375
5,0.610349,0.540277,0.808811,0.714286,0.5,0.916667
6,0.610431,0.540956,0.809466,0.714286,0.5,0.75
7,0.608911,0.540724,0.810388,0.625,0.4,0.666667
8,0.605489,0.538134,0.806756,0.888889,0.857143,0.916667
9,0.608117,0.538229,0.808035,0.714286,0.5,0.916667


In [13]:
# Write the per-fold scores to disk. The output directory is created first,
# because to_csv raises OSError when the parent directory does not exist.
results_path = Path('results/brf_scores.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
brf_scores.to_csv(results_path)