In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from prince import FAMD
from scipy import sparse
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
def encoder(df, col_split):
    """
    Prepares the data for the model by standardizing the continuous features, 
    converting the categorical features to strings, 
    and converting sparse columns to a dense format for FAMD.
    """
    categorical_cols = df.columns[:col_split]

    # converting the categorical features to strings
    df[categorical_cols] = df[categorical_cols].astype(str)

    return df

def load_data(data_address='beta_dates/beta_data_7_60.csv', label_address='beta_dates/true_labels.csv'):
    df = pd.read_csv(data_address, index_col=0)
    y = pd.read_csv(label_address, index_col=0).values[:,0]
    y = y + 1
    X = encoder(df, 4)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    return X_train, X_test, y_train, y_test
def load_data_cv(data_address='beta_dates/beta_data_7_60.csv', label_address='beta_dates/true_labels.csv'):
    df = pd.read_csv(data_address, index_col=0)
    y = pd.read_csv(label_address, index_col=0).values[:,0]
    y = y + 1
    X = encoder(df, 4)

    return X, y

In [81]:
params = {'rf__max_depth': 3, 'rf__max_features': 0.6639830714160453, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 8, 'rf__n_estimators': 143}

In [80]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import MaxAbsScaler, StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# from prince import FAMD  # Ensure prince is installed
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.model_selection import StratifiedKFold
# import numpy as np

# # Number of splits for K-Fold Cross-Validation
# n_splits = 5
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

# # Load your data
# X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

# # Perform FAMD on the dataset
# famd = FAMD(n_components=8)
# famd.fit(X)
# X = famd.transform(X)

# # Create a pipeline with the best parameters
# pipe = Pipeline([
#     ('scaler1', MaxAbsScaler()),
#     ('scaler2', MaxAbsScaler()),  # Consider if you really need two MaxAbsScalers
#     ('standard_scaler', StandardScaler()),
#     ('classifier', RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=6, min_samples_split=17, n_estimators=100))
# ])

# # Perform K-Fold Cross-Validation
# accuracy_scores = []
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     # Fit the pipeline to the training data
#     pipe.fit(X_train, y_train)

#     # Make predictions on the test set
#     predictions = pipe.predict(X_test)

#     # Evaluate the model
#     print(classification_report(y_test, predictions))
#     print("Accuracy:", accuracy_score(y_test, predictions))
#     accuracy_scores.append(accuracy_score(y_test, predictions))

# print("Average Accuracy:", np.mean(accuracy_scores))


In [141]:
# Number of splits for K-Fold Cross-Validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

famd = FAMD(n_components=12)
famd.fit(X)
X = famd.transform(X)

# create a pipeline with the best parameters
pipe = Pipeline([
    ('rf', RandomForestClassifier()),
])

params = {'rf__max_depth': 4, 'rf__max_features': 0.6407209567554418, 'rf__min_samples_leaf': 6, 'rf__min_samples_split': 17, 'rf__n_estimators': 195}
params = {'rf__max_depth': 3, 'rf__max_features': 0.6639830714160453, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 8, 'rf__n_estimators': 143}
pipe.set_params(**params)

# Perform K-Fold Cross-Validation

accuracy_scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the pipeline to the training data
    pipe.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = pipe.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))
    accuracy_scores.append(accuracy_score(y_test, predictions))

print("Average Accuracy:", np.mean(accuracy_scores))

              precision    recall  f1-score   support

         0.0       1.00      0.29      0.44         7
         1.0       0.81      1.00      0.89        38
         2.0       1.00      0.60      0.75        10

    accuracy                           0.84        55
   macro avg       0.94      0.63      0.70        55
weighted avg       0.87      0.84      0.81        55

Accuracy: 0.8363636363636363
              precision    recall  f1-score   support

         0.0       0.67      0.33      0.44         6
         1.0       0.79      0.97      0.87        38
         2.0       1.00      0.45      0.62        11

    accuracy                           0.80        55
   macro avg       0.82      0.59      0.65        55
weighted avg       0.82      0.80      0.77        55

Accuracy: 0.8
              precision    recall  f1-score   support

         0.0       0.60      0.50      0.55         6
         1.0       0.78      0.92      0.84        38
         2.0       0.80      0.3

In [149]:
overall_accuracy_scores = []
# Load your data
X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

famd = FAMD(n_components=6)
famd.fit(X)
X_transformed = famd.transform(X)

for iteration in range(10):


    # Define the pipeline
    pipe = Pipeline([
        ('rf', RandomForestClassifier()),
    ])

    # Set pipeline parameters
    params = {'rf__max_depth': 4, 'rf__max_features': 0.65, 'rf__min_samples_leaf': 6, 'rf__min_samples_split': 17, 'rf__n_estimators': 200}
    pipe.set_params(**params)

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=5, shuffle=True)

    # List to store accuracy scores for this iteration
    accuracy_scores = []

    # Perform K-Fold Cross-Validation
    for train_index, test_index in skf.split(X_transformed, y):
        X_train, X_test = X_transformed.iloc[train_index], X_transformed.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the pipeline and make predictions
        pipe.fit(X_train, y_train)
        predictions = pipe.predict(X_test)

        # Calculate and store accuracy
        accuracy_scores.append(accuracy_score(y_test, predictions))

    # Calculate average accuracy for this iteration
    avg_accuracy = np.mean(accuracy_scores)
    overall_accuracy_scores.append(avg_accuracy)
    print(f"Iteration {iteration + 1}, Average Accuracy: {avg_accuracy}")

# Calculate and print the overall average accuracy across all iterations
print("Overall Average Accuracy:", np.mean(overall_accuracy_scores))

Iteration 1, Average Accuracy: 0.8065993265993265
Iteration 2, Average Accuracy: 0.7991919191919192
Iteration 3, Average Accuracy: 0.7992592592592592
Iteration 4, Average Accuracy: 0.7993265993265993
Iteration 5, Average Accuracy: 0.7882828282828283
Iteration 6, Average Accuracy: 0.799057239057239
Iteration 7, Average Accuracy: 0.7921885521885523
Iteration 8, Average Accuracy: 0.7811447811447811
Iteration 9, Average Accuracy: 0.7842424242424243
Iteration 10, Average Accuracy: 0.7808080808080808
Overall Average Accuracy: 0.7930101010101009


In [26]:
X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

# Original features plus the case of dropping no feature
original_features = X.columns.tolist()
original_features.append(None)  # Represents the case of dropping no feature

# Initialize Stratified K-Fold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

# Store average accuracies for each feature drop scenario
feature_drop_accuracies = {}

for drop_feature in original_features:
    iteration_accuracies = []

    for iteration in range(20):
        # Drop one feature for this iteration, if specified
        X_dropped = X.drop(columns=[drop_feature]) if drop_feature is not None else X

        # Perform FAMD on the modified dataset
        famd = FAMD(n_components=8)
        famd.fit(X_dropped)
        X_famd = famd.transform(X_dropped)

        # Create a pipeline
        pipe = Pipeline([
            ('rf', RandomForestClassifier()),  # Assuming params are set for RandomForest
        ])

        pipe.set_params(**params)

        accuracy_scores = []
        for train_index, test_index in skf.split(X_famd, y):
            X_train, X_test = X_famd.iloc[train_index], X_famd.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit the pipeline to the training data
            pipe.fit(X_train, y_train)

            # Make predictions on the test set
            predictions = pipe.predict(X_test)

            # Evaluate the model
            accuracy_scores.append(accuracy_score(y_test, predictions))

        # Calculate average accuracy for this iteration
        iteration_accuracies.append(np.mean(accuracy_scores))

    # Calculate the overall average accuracy after dropping the feature
    avg_accuracy = np.mean(iteration_accuracies)
    feature_name = drop_feature if drop_feature is not None else "No Feature Dropped"
    feature_drop_accuracies[feature_name] = avg_accuracy
    print(f"Average Accuracy dropping '{feature_name}':", avg_accuracy)

# Print overall results
print("Feature Drop Accuracies:", feature_drop_accuracies)


Average Accuracy dropping 'fed_party': 0.7608047138047139
Average Accuracy dropping 'potus_party': 0.7447171717171717
Average Accuracy dropping 'recess': 0.7534175084175084
Average Accuracy dropping 'mom': 0.7231683501683503
Average Accuracy dropping 'pce': 0.763888888888889
Average Accuracy dropping 'ue': 0.7665521885521887
Average Accuracy dropping 'cars': 0.7761010101010102
Average Accuracy dropping 'house': 0.7695084175084175
Average Accuracy dropping 'cli': 0.7819090909090909
Average Accuracy dropping 'exports': 0.7817542087542088
Average Accuracy dropping 'rgdp': 0.7745521885521887
Average Accuracy dropping 'gdpd': 0.7637508417508418
Average Accuracy dropping 'veloc': 0.7805353535353536
Average Accuracy dropping 'ffr': 0.7824814814814814
Average Accuracy dropping 'mich': 0.7782693602693603
Average Accuracy dropping 'd_pce': 0.7841616161616162
Average Accuracy dropping 'd_ue': 0.7866666666666667
Average Accuracy dropping 'd_cars': 0.7835454545454545
Average Accuracy dropping 'd_ho

In [29]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Load your data
X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and run the TPOT classifier
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

# Export the best pipeline
tpot.export('best_pipeline.py')

# Evaluate the final model
print("Test score:", tpot.score(X_test, y_test))


                                                                             
Generation 1 - Current best internal CV score: 0.7898520084566596
                                                                              
Generation 2 - Current best internal CV score: 0.7942917547568711
                                                                              
Generation 3 - Current best internal CV score: 0.7942917547568711
                                                                              
Generation 4 - Current best internal CV score: 0.7942917547568711
                                                                              
Generation 5 - Current best internal CV score: 0.8081395348837208
                                                                              
Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.55, min_samples_leaf=5, min_samples_split=6, n_estimators=100)
Test score: 0.7454545454545455


In [6]:
from tpot import TPOTClassifier
from sklearn.model_selection import StratifiedKFold

# Load your data
X, y = load_data_cv('beta_dates/beta_data_2_42.csv', 'beta_dates/true_labels.csv')

# Specify K-Fold Cross-Validation
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

# Instantiate and run the TPOT classifier with K-Fold CV
tpot = TPOTClassifier(cv=cv, n_jobs=-1)
tpot.fit(X, y)

# Export the best pipeline
tpot.export('new_best_pipeline.py')

# Evaluate the final model using the test score function
# Note: The test score function will perform a separate train-test split internally
print("Test score:", tpot.score(X, y))


Test score: 0.9708029197080292


In [797]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# load the data without the index column
tpot_data = pd.read_csv('famd_data.csv', index_col=0)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8322558922558922
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=39),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=5, max_features=0.9000000000000001, min_samples_leaf=13, min_samples_split=9, n_estimators=100, subsample=0.9500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print(classification_report(testing_target, results))

              precision    recall  f1-score   support

         0.0       0.33      0.17      0.22         6
         1.0       0.84      0.91      0.88        47
         2.0       0.87      0.81      0.84        16

    accuracy                           0.83        69
   macro avg       0.68      0.63      0.65        69
weighted avg       0.80      0.83      0.81        69

