In [50]:
import numpy as np
import pandas as pd
from logistic_regression import LogRegCCD
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def generate_data(p, n, d, g, random_state=None):
    """
    p - bernoulli distibution probability of class 1
    n - number of samples
    d - number of features
    g - covariance decay factor (controls how strongly features are correlated in the covariance matrix) 

    0 < p, g < 1
    n, d >= 1
    """

    if random_state is not None:
        np.random.seed(random_state)
    
    Y = np.random.binomial(n=1, p=p, size=n)
    
    mean_0 = np.zeros(d)
    mean_1 = np.array([1/i for i in range(1, d + 1)])

    S = np.fromfunction(lambda i, j: g ** np.abs(i - j), (d, d), dtype=float)
    
    X = np.array([
        np.random.multivariate_normal(mean_0 if y == 0 else mean_1, S)
        for y in Y
    ])
    
    return X, Y



In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

def create_results_df(random_state=12, repeats=10):
    
    rows = []

    np.random.seed(random_state)

    iterations = 10
    alpha = 0.9
    lmbda = 0.9

    
    for p in [0.1, 0.25, 0.5]:
        for n in [100, 500, 1000]: 
            for d in [50, 100, 500, 2000]:
                for g in [0.1, 0.5, 0.9]:
                    print(p, n, d, g)
                    test_acc_ccd = [None] * repeats
                    test_roc_ccd = [None] * repeats
                    test_acc_sklearn = [None] * repeats
                    test_roc_sklearn = [None] * repeats

                    for i in range(repeats): # 10 examples
                        
                        X, y = generate_data(p, n, d, g, np.random.randint(low=0, high=2**31))
                        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)


                        lr = LogRegCCD()
                        lr.fit(iterations, X_train, y_train, alpha, lmbda)

                        test_acc_ccd[i] = lr.validate(X_test, y_test, 'balanced_accuracy')
                        test_roc_ccd[i] = lr.validate(X_test, y_test, 'roc_auc')

                        lr_sklearn = LogisticRegression(penalty=None)
                        lr_sklearn.fit(X_train, y_train)
                        y_pred_sklearn = lr_sklearn.predict(X_test)
                        y_prob_sklearn = lr_sklearn.predict_proba(X_test)[:, 1]
                        
                        test_acc_sklearn[i] = balanced_accuracy_score(y_test, y_pred_sklearn)
                        test_roc_sklearn[i] = roc_auc_score(y_test, y_prob_sklearn)
                        

                    rows.append([p, n, d, g, 
                                    np.mean(test_acc_ccd), np.mean(test_acc_sklearn),
                                    np.mean(test_roc_ccd), np.mean(test_roc_sklearn)])

    df = pd.DataFrame(rows, columns=['p', 'n', 'd', 'g',
                                        'test_acc_ccd', 'test_acc_sklearn',
                                        'test_roc_ccd', 'test_roc_sklearn'])

    return df
                    

In [52]:
df = create_results_df(repeats=1)


0.1 100 50 0.1
0.1 100 50 0.5
0.1 100 50 0.9
0.1 100 100 0.1
0.1 100 100 0.5
0.1 100 100 0.9
0.1 100 500 0.1
0.1 100 500 0.5
0.1 100 500 0.9
0.1 100 2000 0.1
0.1 100 2000 0.5
0.1 100 2000 0.9
0.1 500 50 0.1
0.1 500 50 0.5
0.1 500 50 0.9
0.1 500 100 0.1
0.1 500 100 0.5
0.1 500 100 0.9
0.1 500 500 0.1
0.1 500 500 0.5
0.1 500 500 0.9
0.1 500 2000 0.1
0.1 500 2000 0.5
0.1 500 2000 0.9
0.1 1000 50 0.1
0.1 1000 50 0.5
0.1 1000 50 0.9
0.1 1000 100 0.1
0.1 1000 100 0.5
0.1 1000 100 0.9
0.1 1000 500 0.1
0.1 1000 500 0.5
0.1 1000 500 0.9
0.1 1000 2000 0.1
0.1 1000 2000 0.5
0.1 1000 2000 0.9
0.25 100 50 0.1
0.25 100 50 0.5
0.25 100 50 0.9
0.25 100 100 0.1
0.25 100 100 0.5
0.25 100 100 0.9
0.25 100 500 0.1
0.25 100 500 0.5
0.25 100 500 0.9
0.25 100 2000 0.1
0.25 100 2000 0.5
0.25 100 2000 0.9
0.25 500 50 0.1
0.25 500 50 0.5
0.25 500 50 0.9
0.25 500 100 0.1
0.25 500 100 0.5
0.25 500 100 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.25 500 500 0.1
0.25 500 500 0.5
0.25 500 500 0.9
0.25 500 2000 0.1
0.25 500 2000 0.5
0.25 500 2000 0.9
0.25 1000 50 0.1
0.25 1000 50 0.5
0.25 1000 50 0.9
0.25 1000 100 0.1
0.25 1000 100 0.5
0.25 1000 100 0.9
0.25 1000 500 0.1
0.25 1000 500 0.5
0.25 1000 500 0.9
0.25 1000 2000 0.1
0.25 1000 2000 0.5
0.25 1000 2000 0.9
0.5 100 50 0.1
0.5 100 50 0.5
0.5 100 50 0.9
0.5 100 100 0.1
0.5 100 100 0.5
0.5 100 100 0.9
0.5 100 500 0.1
0.5 100 500 0.5
0.5 100 500 0.9
0.5 100 2000 0.1
0.5 100 2000 0.5
0.5 100 2000 0.9
0.5 500 50 0.1
0.5 500 50 0.5
0.5 500 50 0.9
0.5 500 100 0.1
0.5 500 100 0.5
0.5 500 100 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5 500 500 0.1
0.5 500 500 0.5
0.5 500 500 0.9
0.5 500 2000 0.1
0.5 500 2000 0.5
0.5 500 2000 0.9
0.5 1000 50 0.1
0.5 1000 50 0.5
0.5 1000 50 0.9
0.5 1000 100 0.1
0.5 1000 100 0.5
0.5 1000 100 0.9
0.5 1000 500 0.1
0.5 1000 500 0.5
0.5 1000 500 0.9
0.5 1000 2000 0.1
0.5 1000 2000 0.5
0.5 1000 2000 0.9


In [53]:
df.to_excel('p_n_d_g_results.xlsx', index=False)

In [None]:
from pandasql import sqldf

def df_to_latex(df, table_name="Table name"):
    latex_str = """
    \\begin{table}[H]
        \centering
        \\begin{tabular}{|m{1cm}|m{2.5cm}|m{3.1cm}|m{2.5cm}|m{3.1cm}|}
        \hline
    """
    
    # Column headers
    latex_str += "        " + " & ".join(["\\textbf{{{}}}".format(col.replace('_', '\\_' )) for col in df.columns]) + "\\\\ \hline\n"
    
    # Data rows
    for _, row in df.iterrows():
        latex_str += "        " + " & ".join(map(str, row)) + "\\\\ \hline\n"
    
    latex_str += f"""
        \end{{tabular}}
        \caption{{{table_name}}}
    \end{{table}}
    """
    
    return latex_str

for parameter in ['p', 'n', 'd', 'g']:

    df_parameter = sqldf('''
    select
    {0}, 
    round(avg(test_acc_ccd), 2) as test_acc_ccd, 
    round(avg(test_acc_sklearn), 2) as test_acc_sklearn, 
    round(avg(test_roc_ccd), 2) as test_roc_ccd, 
    round(avg(test_roc_sklearn), 2) as test_roc_sklearn

    from df
    group by {0}
    order by {0}
    '''.format(parameter))

    # display(df_parameter)
    print(df_to_latex(df_parameter, f'Comparison of parameter: {parameter}'))


    \begin{table}[H]
        \centering
        \begin{tabular}{|m{1cm}|m{2.5cm}|m{3.1cm}|m{2.5cm}|m{3.1cm}|}
        \hline
            \textbf{p} & \textbf{test\_acc\_ccd} & \textbf{test\_acc\_sklearn} & \textbf{test\_roc\_ccd} & \textbf{test\_roc\_sklearn}\\ \hline
        0.1 & 0.54 & 0.55 & 0.66 & 0.6\\ \hline
        0.25 & 0.61 & 0.6 & 0.69 & 0.64\\ \hline
        0.5 & 0.64 & 0.59 & 0.69 & 0.63\\ \hline

        \end{tabular}
        \caption{C omparison of parameter: p}
    \end{table}
    

    \begin{table}[H]
        \centering
        \begin{tabular}{|m{1cm}|m{2.5cm}|m{3.1cm}|m{2.5cm}|m{3.1cm}|}
        \hline
            \textbf{n} & \textbf{test\_acc\_ccd} & \textbf{test\_acc\_sklearn} & \textbf{test\_roc\_ccd} & \textbf{test\_roc\_sklearn}\\ \hline
        100.0 & 0.58 & 0.54 & 0.64 & 0.56\\ \hline
        500.0 & 0.6 & 0.59 & 0.69 & 0.64\\ \hline
        1000.0 & 0.62 & 0.61 & 0.71 & 0.67\\ \hline

        \end{tabular}
        \caption{C omparison of parameter: n}
  

In [55]:
df_parameter = sqldf('''
select
n,
d,
round(avg(test_acc_ccd), 2) as test_acc_ccd, 
round(avg(test_acc_sklearn), 2) as test_acc_sklearn, 
round(avg(test_roc_ccd), 2) as test_roc_ccd, 
round(avg(test_roc_sklearn), 2) as test_roc_sklearn

from df
group by n, d
order by n, d
'''.format(parameter))

display(df_parameter)
print(df_to_latex(df_parameter))

Unnamed: 0,n,d,test_acc_ccd,test_acc_sklearn,test_roc_ccd,test_roc_sklearn
0,100,50,0.66,0.66,0.77,0.7
1,100,100,0.6,0.54,0.65,0.55
2,100,500,0.52,0.46,0.57,0.46
3,100,2000,0.52,0.52,0.56,0.54
4,500,50,0.62,0.64,0.73,0.72
5,500,100,0.62,0.62,0.7,0.66
6,500,500,0.58,0.54,0.66,0.61
7,500,2000,0.59,0.56,0.66,0.56
8,1000,50,0.63,0.63,0.76,0.75
9,1000,100,0.64,0.64,0.74,0.73



    \begin{{table}}[H]
        \centering
        \begin{{tabular}}{{{|m{0.5cm}|m{2.5cm}|m{3.1cm}|m{2.5cm}|m{3.1cm}|}}}
        \hline
            \textbf{n} & \textbf{d} & \textbf{test\_acc\_ccd} & \textbf{test\_acc\_sklearn} & \textbf{test\_roc\_ccd} & \textbf{test\_roc\_sklearn}\\ \hline
        100.0 & 50.0 & 0.66 & 0.66 & 0.77 & 0.7\\ \hline
        100.0 & 100.0 & 0.6 & 0.54 & 0.65 & 0.55\\ \hline
        100.0 & 500.0 & 0.52 & 0.46 & 0.57 & 0.46\\ \hline
        100.0 & 2000.0 & 0.52 & 0.52 & 0.56 & 0.54\\ \hline
        500.0 & 50.0 & 0.62 & 0.64 & 0.73 & 0.72\\ \hline
        500.0 & 100.0 & 0.62 & 0.62 & 0.7 & 0.66\\ \hline
        500.0 & 500.0 & 0.58 & 0.54 & 0.66 & 0.61\\ \hline
        500.0 & 2000.0 & 0.59 & 0.56 & 0.66 & 0.56\\ \hline
        1000.0 & 50.0 & 0.63 & 0.63 & 0.76 & 0.75\\ \hline
        1000.0 & 100.0 & 0.64 & 0.64 & 0.74 & 0.73\\ \hline
        1000.0 & 500.0 & 0.62 & 0.59 & 0.67 & 0.63\\ \hline
        1000.0 & 2000.0 & 0.58 & 0.57 & 0.67 & 0.59\\ \hlin