In [36]:
# Imports and pandas display settings used by the analysis cells below
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Pandas display configuration for inspecting wide/long DataFrames.
# Note: pd.set_option changes stay in effect for the rest of the kernel session.
display_options = {
    'display.max_rows': 100,                    # show up to 100 rows before truncating
    'display.max_columns': None,                # never hide columns
    'display.width': None,                      # let pandas detect the available width
    'display.max_colwidth': None,               # do not truncate long cell contents
    'display.float_format': '{:.4f}'.format,    # render floats with four decimals
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [37]:
# Read the training data from CSV into the notebook-wide DataFrame `df`.
# Point file_path at your own copy of the CSV if it lives elsewhere.
file_path = 'synth_data_for_training.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [38]:
# TODO: Correlation does not imply causation. Two variables might be correlated due to a third variable or by coincidence.
# TODO: The threshold of 0.5 is arbitrary and might need to be adjusted based on your specific dataset and the model you are using. For some models, even moderately correlated features might pose problems, while for others, even higher correlations might not be as concerning.
# TODO: As a baseline model we can also use a model that has built-in mechanisms for feature selection (like L1 regularization for linear models).

# Purpose of this cell: list every pair of columns in df whose correlation
# exceeds +/- threshold, then build df_reduced by dropping the second column of
# each such pair.

# Calculate the correlation matrix over the columns of df
corr_matrix = df.corr()

# Result lists; each entry is a (column_a, column_b, correlation) tuple
highly_pos_correlated_pairs = []
highly_neg_correlated_pairs = []

# Threshold for filtering high correlations (you can adjust this value)
threshold = 0.5

# Extract labels and raw values once. Calling corr_matrix.iloc[i, j] and
# corr_matrix.columns[i] inside the double loop is far slower than indexing a
# plain array, and the loop visits every column pair.
corr_columns = list(corr_matrix.columns)
corr_values = corr_matrix.to_numpy()

# Walk the upper triangle only (j > i) so self-correlations and mirrored
# duplicates are skipped.
for i in range(len(corr_columns)):
    for j in range(i + 1, len(corr_columns)):
        corr_value = corr_values[i, j]
        # A NaN correlation fails both comparisons and is therefore ignored.
        if corr_value > threshold:  # Positive correlation
            highly_pos_correlated_pairs.append((corr_columns[i], corr_columns[j], corr_value))
        elif corr_value < -threshold:  # Negative correlation
            highly_neg_correlated_pairs.append((corr_columns[i], corr_columns[j], corr_value))

# Strongest correlations first in both lists
highly_pos_correlated_pairs.sort(key=lambda x: x[2], reverse=True)
highly_neg_correlated_pairs.sort(key=lambda x: x[2])

# Print out the highest positively and negatively correlated feature pairs
print("Highly Positive Correlated Pairs:")
for pair in highly_pos_correlated_pairs:
    print(f"{pair[0]} and {pair[1]} with correlation {pair[2]:.2f}")

print("\nHighly Negative Correlated Pairs:")
for pair in highly_neg_correlated_pairs:
    print(f"{pair[0]} and {pair[1]} with correlation {pair[2]:.2f}")

# 'checked' is the label column used by the RandomForest cell further down.
# Pairs that involve it are still reported above, but they must not drive the
# removal: otherwise the label itself, or a feature that is strongly related to
# the label, would be dropped from df_reduced.
target_column = 'checked'

# Assuming we choose to remove the second feature from each pair.
# NOTE(review): this is greedy - a column is dropped even when its partner was
# already dropped because of another pair; confirm that is intended.
features_to_remove = {
    pair[1]
    for pair in highly_pos_correlated_pairs + highly_neg_correlated_pairs
    if target_column not in (pair[0], pair[1])
}

# Create a new DataFrame excluding the features identified for removal
df_reduced = df.drop(columns=list(features_to_remove))

print(f"Original number of features: {df.shape[1]}, Reduced number of features: {df_reduced.shape[1]}")


Highly Positive Correlated Pairs:
contacten_onderwerp_boolean_contact_derden and contacten_onderwerp_contact_derden with correlation 0.75
contacten_onderwerp_boolean_ziek__of_afmelding and contacten_onderwerp_ziek__of_afmelding with correlation 0.75
contacten_onderwerp_boolean_contact_met_aanbieder and contacten_onderwerp_contact_met_aanbieder with correlation 0.75
contacten_onderwerp_boolean_uitnodiging and contacten_onderwerp_uitnodiging with correlation 0.75
afspraak_signaal_van_aanbieder and contacten_soort_rapportage_rib with correlation 0.74
contacten_onderwerp_boolean_documenttype__overeenkomst_ and contacten_onderwerp_documenttype__overeenkomst_ with correlation 0.74
afspraak_afgelopen_jaar_signaal_voor_medewerker and afspraak_signaal_voor_medewerker with correlation 0.73
afspraak_afgelopen_jaar_signaal_voor_medewerker and afspraak_laatstejaar_aantal_woorden with correlation 0.73
contacten_onderwerp_boolean_matching and contacten_onderwerp_matching with correlation 0.73
contact

In [39]:
# Target is the 'checked' column; every other column of df is used as a predictor
y = df['checked']
X = df.drop(columns='checked')

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Per-feature importance scores exposed by the fitted forest
feature_importances = clf.feature_importances_

# Pair each column name with its importance score
features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances,
})

# Most important features first
features_sorted = features.sort_values('Importance', ascending=False)

# Show the 50 highest-ranked features with their importance
print(features_sorted.head(50))

                                                      Feature  Importance
216                            persoon_leeftijd_bij_onderzoek      0.0428
276                                 pla_historie_ontwikkeling      0.0224
206  ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden      0.0201
219            persoonlijke_eigenschappen_dagen_sinds_taaleis      0.0189
205                                ontheffing_dagen_hist_mean      0.0154
218             persoonlijke_eigenschappen_dagen_sinds_opvoer      0.0135
53                     belemmering_dagen_financiele_problemen      0.0133
4                                        adres_dagen_op_adres      0.0132
140                               contacten_onderwerp_no_show      0.0132
155         contacten_soort_afgelopenjaar_document__uitgaand_      0.0129
165                       contacten_soort_document__uitgaand_      0.0120
54                belemmering_dagen_lichamelijke_problematiek      0.0119
179                     instrument_lad