Merge pull request #81 from jjc2718/sgd_simulations
SGD learning rate tuning + PyTorch implementation, simulations
jjc2718 committed Jun 1, 2023
2 parents a577d11 + c00f8f2 commit 0564770
Showing 32 changed files with 12,324 additions and 2,649 deletions.
1,408 changes: 1,059 additions & 349 deletions  01_stratified_classification/lasso_range_all_optimizers.ipynb
1,737 changes: 1,737 additions & 0 deletions  01_stratified_classification/lasso_range_gene_bins.ipynb
594 changes: 594 additions & 0 deletions  01_stratified_classification/lasso_range_gene_learning_rate.ipynb
2,451 changes: 824 additions & 1,627 deletions  01_stratified_classification/lasso_range_gene_optimizers.ipynb

(Large notebook diffs are not rendered by default.)
148 changes: 140 additions & 8 deletions 01_stratified_classification/nbconverted/lasso_range_all_optimizers.py
@@ -31,17 +31,16 @@


ll_results_dir = os.path.join(
-    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_ll', 'gene'
+    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_ll_lr_range', 'gene'
)

sgd_results_dir = os.path.join(
-    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_sgd', 'gene'
+    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_sgd_lr_constant_search', 'gene'
)

plot_gene = 'EGFR'
metric = 'aupr'

-output_plots = True
+output_plots = False
output_plots_dir = os.path.join(
    cfg.repo_root, '01_stratified_classification', 'optimizers_plots'
)
@@ -282,15 +281,15 @@ def get_top_optimizer_diff(gene):
# In[12]:


-sns.set({'figure.figsize': (10, 2.75)})
+sns.set({'figure.figsize': (10, 3)})
sns.set_style('whitegrid')

-sns.histplot(all_top_optimizer_diff_df.ll_sgd_diff, binwidth=0.025, binrange=(-0.1, 0.45))
-plt.xlim(-0.1, 0.45)
+sns.histplot(all_top_optimizer_diff_df.ll_sgd_diff, binwidth=0.0125, binrange=(-0.2, 0.2))
+plt.xlim(-0.2, 0.2)
plt.title('Differences between liblinear and SGD optimizers, across all Vogelstein genes', size=14, y=1.05)
plt.xlabel('AUPR(liblinear) - AUPR(SGD)', size=13)
plt.ylabel('Count', size=13)
-plt.yticks(np.arange(0, 15, 2))
+plt.yticks(np.arange(0, 25, 5))
plt.gca().axvline(0, color='black', linestyle='--')

if output_plots:
@@ -405,3 +404,136 @@ def color_boxes(ax):
if output_plots:
plt.savefig(os.path.join(output_plots_dir, f'all_optimizers_coef_count_per_gene.png'), bbox_inches='tight')


# ### Get difference between "best" and "largest" (least regularized) models
#
# We can use this to quantify how much a model overfits: that is, how much (if at all) test performance drops from its peak as model complexity increases.
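
# As a toy illustration of the metric (hypothetical AUPR values, not taken
# from the real results): suppose mean test AUPR for one gene peaks at an
# intermediate lasso parameter and drops at the least regularized end of the
# range. Here "best" is just the peak; the real code below uses the
# parameter selected on held-out data.

toy_mean_aupr = {0.01: 0.62, 0.1: 0.71, 1.0: 0.68, 10.0: 0.55}  # lasso param -> mean test AUPR
best_aupr = max(toy_mean_aupr.values())
largest_model_aupr = toy_mean_aupr[max(toy_mean_aupr)]  # least regularized (liblinear convention: max param)
print(round(best_aupr - largest_model_aupr, 2))  # 0.16: performance lost past the peak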

# In[19]:


def get_top_largest_diff(gene):
    # TODO: move some of this repeated code into shared functions
    ll_top_lasso_param = ll_top_df.loc[gene, 'lasso_param']
    sgd_top_lasso_param = sgd_top_df.loc[gene, 'lasso_param']

    ll_mean_test_perf_df = (
        ll_perf_df[(ll_perf_df.gene == gene) &
                   (ll_perf_df.data_type == 'test') &
                   (ll_perf_df.signal == 'signal')]
        .groupby(['lasso_param'])
        .agg(np.mean)
        .drop(columns=['seed', 'fold'])
        .rename(columns={'auroc': 'mean_auroc', 'aupr': 'mean_aupr'})
    )
    sgd_mean_test_perf_df = (
        sgd_perf_df[(sgd_perf_df.gene == gene) &
                    (sgd_perf_df.data_type == 'test') &
                    (sgd_perf_df.signal == 'signal')]
        .groupby(['lasso_param'])
        .agg(np.mean)
        .drop(columns=['seed', 'fold'])
        .rename(columns={'auroc': 'mean_auroc', 'aupr': 'mean_aupr'})
    )

    # the maximum liblinear parameter gives the largest (least regularized)
    # model, since liblinear's parameter scales inversely with regularization
    ll_top_largest_diff = (
        ll_mean_test_perf_df.loc[ll_top_lasso_param, 'mean_aupr'] -
        ll_mean_test_perf_df.loc[ll_perf_df.lasso_param.max(), 'mean_aupr']
    )
    # the minimum SGD parameter gives the largest (least regularized) model,
    # since SGD's parameter scales directly with regularization strength
    sgd_top_largest_diff = (
        sgd_mean_test_perf_df.loc[sgd_top_lasso_param, 'mean_aupr'] -
        sgd_mean_test_perf_df.loc[sgd_perf_df.lasso_param.min(), 'mean_aupr']
    )

    return [gene,
            ll_top_lasso_param,
            sgd_top_lasso_param,
            ll_top_largest_diff,
            sgd_top_largest_diff]

print(get_top_largest_diff('PTEN'))
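
# The max/min asymmetry above follows from how scikit-learn parameterizes
# regularization. A minimal, purely illustrative sketch (assuming, as in the
# rest of this repo, that "liblinear" means LogisticRegression and "SGD"
# means SGDClassifier; this is not the repo's actual training code):

from sklearn.linear_model import LogisticRegression, SGDClassifier

# liblinear: C is an inverse regularization strength, so the largest C gives
# the least regularized ("largest") model
ll_example = LogisticRegression(penalty='l1', solver='liblinear', C=1000)
# SGD: alpha scales the penalty directly, so the smallest alpha gives the
# least regularized ("largest") model
sgd_example = SGDClassifier(loss='log_loss', penalty='l1', alpha=1e-6)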


# In[20]:


all_top_largest_diff_df = []

for gene in ll_perf_df.gene.unique():
    all_top_largest_diff_df.append(
        get_top_largest_diff(gene)
    )

all_top_largest_diff_df = pd.DataFrame(
    all_top_largest_diff_df,
    columns=['gene',
             'll_top_lasso_param',
             'sgd_top_lasso_param',
             'll_top_largest_diff',
             'sgd_top_largest_diff']
)

# label each gene with the optimizer that has the bigger best-vs-largest gap
# (i.e. the one that overfits more for that gene)
all_top_largest_diff_df['largest_diff'] = 'liblinear'
all_top_largest_diff_df.loc[
    all_top_largest_diff_df.ll_top_largest_diff < all_top_largest_diff_df.sgd_top_largest_diff,
    'largest_diff'
] = 'SGD'
# an exact tie probably won't happen, but just in case
all_top_largest_diff_df.loc[
    all_top_largest_diff_df.ll_top_largest_diff == all_top_largest_diff_df.sgd_top_largest_diff,
    'largest_diff'
] = 'equal'

print(all_top_largest_diff_df.largest_diff.value_counts())
all_top_largest_diff_df.head()


# In[21]:


# reshape to long format (one row per gene/optimizer pair) so seaborn can
# plot the two optimizers side by side
plot_df = (all_top_largest_diff_df
    .rename(columns={'ll_top_largest_diff': 'liblinear',
                     'sgd_top_largest_diff': 'SGD'})
    .melt(
        id_vars=['gene', 'll_top_lasso_param', 'sgd_top_lasso_param', 'largest_diff'],
        var_name='optimizer',
        value_name='top_largest_diff'
    )
)

plot_df.head()


# In[22]:


sns.set({'figure.figsize': (7, 3)})
sns.set_style('whitegrid')

# cut=0 limits each violin to the observed range of the data, rather than
# letting the KDE tails extend past the extreme values
sns.violinplot(data=plot_df, x='optimizer', y='top_largest_diff', cut=0)
plt.title('Difference between best and largest model, across genes', y=1.05)
plt.xlabel('Optimizer')
plt.ylabel('AUPR(best model) - AUPR(largest model)')
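
# In this plot, mass above zero means the best model beats the least
# regularized one, i.e. test performance degrades once regularization is
# relaxed past its optimum; mass near zero means little to no overfitting.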


# In[23]:


# genes that overfit the most using liblinear
(all_top_largest_diff_df
    .sort_values(by='ll_top_largest_diff', ascending=False)
    .head(10)
)


# In[24]:


# genes that overfit the most using SGD
(all_top_largest_diff_df
    .sort_values(by='sgd_top_largest_diff', ascending=False)
    .head(10)
)
