In [1]:
from IPython.display import display as ipy_display
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt

%config InlineBackend.figure_format = 'svg'

In [2]:
best_models_df = pd.read_csv('best_models.csv', encoding='utf-8')
best_models_df.sample(5)

Unnamed: 0,model,base,mean_test_score,rank_test_score,clf__alpha,clf__loss,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,clf__C
230,SGD,SGDClassifier(max_iter=5000),0.578891,231.0,,,,,"(1, 1)","(1, 2)",0.01
157,SGD,SGDClassifier(max_iter=5000),0.668777,158.0,1.0,hinge,l1,0.25,"(1, 2)","(1, 3)",
27,SGD,SGDClassifier(max_iter=5000),0.761204,28.0,0.1,hinge,l2,0.25,"(1, 3)","(1, 3)",
240,SGD,SGDClassifier(max_iter=5000),0.576845,241.0,0.001,log_loss,l2,0.1,"(1, 3)","(1, 1)",
356,SGD,SGDClassifier(max_iter=5000),0.332344,357.0,1.0,hinge,l2,0.1,"(1, 1)","(1, 3)",


In [3]:
sgd = (
    best_models_df
       [(best_models_df['model'] == 'SGD') & (best_models_df['clf__loss'].notna())]
       .sort_values('mean_test_score', ascending=True)
       .reset_index(drop=True)
)
sgd

Unnamed: 0,model,base,mean_test_score,rank_test_score,clf__alpha,clf__loss,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,clf__C
0,SGD,SGDClassifier(max_iter=5000),0.332292,359.0,0.0100,hinge,l1,0.10,"(1, 2)","(1, 2)",
1,SGD,SGDClassifier(max_iter=5000),0.332293,358.0,0.1000,hinge,l1,0.10,"(1, 2)","(1, 2)",
2,SGD,SGDClassifier(max_iter=5000),0.332344,357.0,1.0000,hinge,l2,0.10,"(1, 1)","(1, 3)",
3,SGD,SGDClassifier(max_iter=5000),0.332395,355.0,0.0100,log_loss,l2,0.25,"(1, 1)","(1, 2)",
4,SGD,SGDClassifier(max_iter=5000),0.332445,354.0,1.0000,hinge,l1,0.25,"(1, 1)","(1, 2)",
...,...,...,...,...,...,...,...,...,...,...,...
292,SGD,SGDClassifier(max_iter=5000),0.770176,5.0,0.0100,hinge,l2,0.25,"(1, 2)","(1, 2)",
293,SGD,SGDClassifier(max_iter=5000),0.770615,4.0,0.0001,log_loss,l2,0.25,"(1, 2)","(1, 2)",
294,SGD,SGDClassifier(max_iter=5000),0.770894,3.0,0.0010,log_loss,l2,0.25,"(1, 2)","(1, 3)",
295,SGD,SGDClassifier(max_iter=5000),0.770950,2.0,0.0100,log_loss,l2,0.10,"(1, 2)","(1, 3)",


In [4]:
sgd.columns

Index(['model', 'base', 'mean_test_score', 'rank_test_score', 'clf__alpha',
       'clf__loss', 'clf__penalty', 'cxf__chars__min_df',
       'cxf__postf__ngram_range', 'cxf__words__ngram_range', 'clf__C'],
      dtype='object')

In [5]:
from ast import literal_eval

colors = ['#a52040', '#404080']
markers = ['o', 's']

params = [col for col in sgd.columns
          if col.startswith('cxf__') or col.startswith('clf__')
          and col not in ['clf__C',]]

ngram_cols = [col for col in params if col.endswith('__ngram_range')]
# convert ngram_range to float for better visualization
for col in ngram_cols:
    sgd[col] = sgd[col].apply(lambda x: literal_eval(f"{x[1:-1]}".replace(', ', '.')))

all_params = pd.DataFrame()

# which are the best hyperparameters for each loss function?
for loss in sgd['clf__loss'].unique():
    hyperparams_df = (
        pd.DataFrame(sgd[sgd['clf__loss'] == loss][params + ['mean_test_score']])
            .sort_values('mean_test_score', ascending=False)
            .reset_index(drop=True)
    )
    hyperparams_df.index += 1
    hyperparams_df.index.name = 'rank'
    hyperparams_df = hyperparams_df.rename(columns={'mean_test_score': 'F1_score'})
    model_name = 'lsvm' if loss == 'hinge' else 'logreg'
    ipy_display(
        hyperparams_df
            .style
            .format({
                'mean_test_score': '{:.3f}'.format, # format score
                'clf__alpha': '{:g}'.format, # format alpha, exponent notation
                'cxf__chars__min_df': '{:g}'.format, # format alpha, exponent notation
                'clf__penalty': lambda x: rf"$\ell_{{{x}}}$", # format penalty
                **{col: lambda x: ", ".join(f"{x}".split('.'))
                for col in ngram_cols}, # convert ngram_range back
            })
            .background_gradient(cmap='RdBu', axis=0)
            .set_caption(f'Best hyperparameters for {loss=}')
    )

    all_params = pd.concat([all_params, hyperparams_df])

Unnamed: 0_level_0,clf__alpha,clf__loss,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,F1_score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.001,hinge,$\ell_{l1}$,0.1,"1, 1","1, 2",0.772847
2,0.01,hinge,$\ell_{l2}$,0.25,"1, 2","1, 2",0.770176
3,0.01,hinge,$\ell_{l2}$,0.1,"1, 2","1, 1",0.767881
4,0.01,hinge,$\ell_{l1}$,0.1,"1, 1","1, 1",0.767764
5,0.001,hinge,$\ell_{l1}$,0.1,"1, 2","1, 1",0.764003
6,0.0001,hinge,$\ell_{l1}$,0.25,"1, 2","1, 3",0.763041
7,0.1,hinge,$\ell_{l2}$,0.1,"1, 3","1, 3",0.762649
8,0.001,hinge,$\ell_{l1}$,0.25,"1, 1","1, 2",0.762445
9,0.0001,hinge,$\ell_{l1}$,0.1,"1, 1","1, 3",0.761977
10,0.1,hinge,$\ell_{l2}$,0.25,"1, 3","1, 3",0.761204


Unnamed: 0_level_0,clf__alpha,clf__loss,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,F1_score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.01,log_loss,$\ell_{l2}$,0.1,"1, 2","1, 3",0.77095
2,0.001,log_loss,$\ell_{l2}$,0.25,"1, 2","1, 3",0.770894
3,0.0001,log_loss,$\ell_{l2}$,0.25,"1, 2","1, 2",0.770615
4,0.1,log_loss,$\ell_{l1}$,0.1,"1, 2","1, 3",0.769417
5,0.1,log_loss,$\ell_{l1}$,0.1,"1, 1","1, 2",0.767357
6,1.0,log_loss,$\ell_{l2}$,0.25,"1, 1","1, 1",0.765477
7,0.1,log_loss,$\ell_{l2}$,0.25,"1, 2","1, 1",0.765296
8,0.001,log_loss,$\ell_{l2}$,0.1,"1, 2","1, 2",0.764283
9,0.001,log_loss,$\ell_{l1}$,0.25,"1, 3","1, 2",0.764195
10,0.0001,log_loss,$\ell_{l2}$,0.25,"1, 2","1, 1",0.763298


In [6]:
# group by loss function, and rank by F1 score
all_params = (
    all_params
        .groupby('clf__loss')
        .apply(lambda x: x.sort_values('F1_score', ascending=False))
)

all_params

Unnamed: 0_level_0,Unnamed: 1_level_0,clf__alpha,clf__loss,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,F1_score
clf__loss,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
hinge,1,0.001,hinge,l1,0.10,1.1,1.2,0.772847
hinge,2,0.010,hinge,l2,0.25,1.2,1.2,0.770176
hinge,3,0.010,hinge,l2,0.10,1.2,1.1,0.767881
hinge,4,0.010,hinge,l1,0.10,1.1,1.1,0.767764
hinge,5,0.001,hinge,l1,0.10,1.2,1.1,0.764003
...,...,...,...,...,...,...,...,...
log_loss,152,1.000,log_loss,l1,0.10,1.3,1.3,0.333327
log_loss,153,0.010,log_loss,l1,0.10,1.3,1.2,0.333276
log_loss,154,1.000,log_loss,l2,0.10,1.1,1.1,0.333276
log_loss,155,0.001,log_loss,l2,0.10,1.1,1.3,0.333276


In [7]:
all_params = all_params.drop(columns=['clf__loss'])
all_params

Unnamed: 0_level_0,Unnamed: 1_level_0,clf__alpha,clf__penalty,cxf__chars__min_df,cxf__postf__ngram_range,cxf__words__ngram_range,F1_score
clf__loss,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hinge,1,0.001,l1,0.10,1.1,1.2,0.772847
hinge,2,0.010,l2,0.25,1.2,1.2,0.770176
hinge,3,0.010,l2,0.10,1.2,1.1,0.767881
hinge,4,0.010,l1,0.10,1.1,1.1,0.767764
hinge,5,0.001,l1,0.10,1.2,1.1,0.764003
...,...,...,...,...,...,...,...
log_loss,152,1.000,l1,0.10,1.3,1.3,0.333327
log_loss,153,0.010,l1,0.10,1.3,1.2,0.333276
log_loss,154,1.000,l2,0.10,1.1,1.1,0.333276
log_loss,155,0.001,l2,0.10,1.1,1.3,0.333276


In [8]:
all_params['clf__alpha'] = all_params['clf__alpha'].apply("{:g}".format)

In [9]:
for col in ngram_cols:
    all_params[col] = (
        all_params[col]
         .astype(str)
         .apply(lambda x: 
                        "$n = " + "-".join(x.split('.'))
                        .removeprefix('[')
                        .removesuffix(']') + "$")
                        .astype(str)
    )

all_params = (
    all_params
   .rename(columns={
        col: col.removeprefix('clf__').removeprefix('cxf__')
        for col in all_params.columns
   })
)
all_params['F1_score'] = all_params['F1_score'].round(3)
all_params = all_params.rename_axis(index={'clf__loss': 'loss'})
best_params = all_params.groupby('loss').head(10)
best_params

Unnamed: 0_level_0,Unnamed: 1_level_0,alpha,penalty,chars__min_df,postf__ngram_range,words__ngram_range,F1_score
loss,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hinge,1,0.001,l1,0.1,$n = 1-1$,$n = 1-2$,0.773
hinge,2,0.01,l2,0.25,$n = 1-2$,$n = 1-2$,0.77
hinge,3,0.01,l2,0.1,$n = 1-2$,$n = 1-1$,0.768
hinge,4,0.01,l1,0.1,$n = 1-1$,$n = 1-1$,0.768
hinge,5,0.001,l1,0.1,$n = 1-2$,$n = 1-1$,0.764
hinge,6,0.0001,l1,0.25,$n = 1-2$,$n = 1-3$,0.763
hinge,7,0.1,l2,0.1,$n = 1-3$,$n = 1-3$,0.763
hinge,8,0.001,l1,0.25,$n = 1-1$,$n = 1-2$,0.762
hinge,9,0.0001,l1,0.1,$n = 1-1$,$n = 1-3$,0.762
hinge,10,0.1,l2,0.25,$n = 1-3$,$n = 1-3$,0.761


In [10]:
best_params.style.to_latex('best_hyperparams_sgdclassifier.tex', 
                            position='h!', position_float='centering', hrules=True, 
                            multicol_align='c', multirow_align='t', encoding='utf-8',
                            caption="Top 10 best hyperparameters for the two linear classifiers.")

In [11]:
import plotly.express as px


def shorten_param(param_name: str) -> str:
    """Remove components' prefixes in param_name."""
    if "__" in param_name:
        return param_name.split("__", 1)[1]
    return param_name


plot_results_df = best_models_df.copy().drop(columns=['model', 'base', 'rank_test_score', 'clf__C'])
plot_results_df = plot_results_df.rename(shorten_param, axis=1)
plot_results_df = plot_results_df.rename(columns={
                                            'mean_test_score': 'F1-macro',
                                            'chars__min_df': 'min_df',
                                            'postf__ngram_range': 'POS n-grams',
                                            'words__ngram_range': 'Word n-grams',
                                        })

column_results = [shorten_param(col) for col in plot_results_df.columns]
column_results = column_results[1:] + column_results[:1]

# Create a dictionary of functions to transform the values of the columns
transform_funcs = dict.fromkeys(column_results, lambda x: x)
transform_funcs['min_df'] = lambda x: x
transform_funcs['F1-macro'] = lambda x: round(x, 2)
# Using a logarithmic scale for alpha
transform_funcs['alpha'] = lambda x: x
# L1 norms are mapped to index 1, and L2 norms to index 2
transform_funcs['loss'] = lambda x: 2 if x == "hinge" else 1
# L1 norms are mapped to index 1, and L2 norms to index 2
transform_funcs['penalty'] = lambda x: 2 if x == "l2" else 1
# Unigrams are mapped to index 1 and bigrams to index 2
transform_funcs['POS n-grams'] = lambda x: x[0]
# Unigrams are mapped to index 1 and bigrams to index 2
transform_funcs['Word n-grams'] = lambda x: x[1]


fig = px.parallel_coordinates(
    plot_results_df[column_results].apply(transform_funcs),
    color='F1-macro',
    color_continuous_scale=px.colors.sequential.RdBu,
)
fig.update_layout(
    title={
        'text': "Best hyperparameters (LSVM vs LogReg)",
        'y': 0.99,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
)
fig.show(renderer='svg')

  dims = [
