In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import product
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package imports below resolve.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append: re-running this cell must not keep adding duplicate
# entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from src.generate_data import DataGenerator
from src.prepare_data import DataPreprocessor

In [13]:
# Parameter variations for the simulation study.
# Each sequence below holds the values that one DataGenerator argument of the
# same name is varied over.

# --- data-generating process (DGP) ---
n_classes_dep_var = [2, 3, 4]
n_ind_vars = [3, 5, 7]
n_categorical_vars = np.arange(0, 7)  # 0, 1, ..., 6
n_classes_ind_vars = [2, 3]
n_samples = [100, 500, 1_000, 10_000]

# --- logistic regression (LR) ---
# NOTE(review): not referenced by the cells shown here; presumably reserved
# for the planned probit comparison — confirm.
link_function = ['Probit', 'Logit']

# One preprocessor instance, reused for every generated data set.
preprocessor = DataPreprocessor()

In [14]:
# Simulation run: fit one unpenalised logistic regression per parameter
# combination and record its test-set misclassification rate.
RANDOM_SEED = 3103
np.random.seed(RANDOM_SEED)  # seed NumPy's global RNG once, before the loop

results_list = []
parameter_grid = product(
    n_classes_dep_var,
    n_ind_vars,
    n_categorical_vars,
    n_classes_ind_vars,
    n_samples,
)
for n_classes_dep, n_ind, n_categorical, n_classes_ind, n_samp in parameter_grid:
    data_generator = DataGenerator(
        n_classes_dep_var=n_classes_dep,
        n_ind_vars=n_ind,
        n_categorical_vars=n_categorical,
        n_classes_ind_vars=n_classes_ind,
        n_samples=n_samp,
    )
    data = data_generator.generate_data()
    if data is None:
        # No data was generated for this combination; nothing to fit.
        continue

    X_train, X_test, y_train, y_test = preprocessor.preprocess(data)

    # penalty=None -> plain logistic regression without regularisation.
    clf = LogisticRegression(penalty=None, random_state=RANDOM_SEED).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    misclassification = 1 - accuracy_score(y_test, y_pred)

    # IDs number only the combinations that actually produced data: S1, S2, ...
    scenario_id = f'S{len(results_list)+1}'
    results_list.append(
        [scenario_id, n_classes_dep, n_ind, n_categorical, n_classes_ind, n_samp, misclassification]
    )

In [15]:
# Turn the collected rows into a table; the column order mirrors the layout
# of each row appended to results_list.
result_columns = [
    'ID',
    'n_classes_dep_var',
    'n_ind_vars',
    'n_categorical_vars',
    'n_classes_ind_vars',
    'n_samples',
    'misclassification_rate',
]
results = pd.DataFrame(data=results_list, columns=result_columns)
results

Unnamed: 0,ID,n_classes_dep_var,n_ind_vars,n_categorical_vars,n_classes_ind_vars,n_samples,misclassification_rate
0,S1,2,3,0,2,100,0.333333
1,S2,2,3,0,2,500,0.213333
2,S3,2,3,0,2,1000,0.183333
3,S4,2,3,0,2,10000,0.211667
4,S5,2,3,1,2,100,0.233333
...,...,...,...,...,...,...,...
319,S320,4,7,6,2,10000,0.374000
320,S321,4,7,6,3,100,0.333333
321,S322,4,7,6,3,500,0.346667
322,S323,4,7,6,3,1000,0.330000


TODO: Add multiclass probit regression. The difficulty is that scikit-learn does not implement a probit link function, and statsmodels provides one only for binary problems.