SGD learning rate tuning + pytorch implementation, simulations #81

Merged: 24 commits, merged Jun 1, 2023

Commits:
f872f5f: add command line option for max iterations (jjc2718, May 8, 2023)
7ef2eba: refactor filenames + add max iter option (jjc2718, May 8, 2023)
6c950c4: testing more flexible approach to filename functions (jjc2718, May 8, 2023)
595b3d0: script to look at max iter variation (jjc2718, May 9, 2023)
fc36f43: pytorch logistic regression seems to work (jjc2718, May 14, 2023)
93d606b: save loss function values + add script to run (jjc2718, May 14, 2023)
eb7f8d2: add batch size command line argument (jjc2718, May 15, 2023)
82bec4f: batch size analysis (jjc2718, May 15, 2023)
7439bef: fix loss function logging (jjc2718, May 16, 2023)
0af9674: simulate optimizer comparison (jjc2718, May 16, 2023)
6aa0a08: Merge branch 'master' into sgd_vary_params (jjc2718, May 16, 2023)
15e92d9: Merge branch 'sgd_vary_params' into sgd_simulations (jjc2718, May 16, 2023)
70b820c: add learning rate variation script (jjc2718, May 18, 2023)
45fc941: compare learning rate schedules (jjc2718, May 22, 2023)
d9d3851: plot KRAS results for varying learning rates (jjc2718, May 22, 2023)
2fbab2d: learning rate search and output loss (jjc2718, May 23, 2023)
8669670: plot learning rate schedule comparison results (jjc2718, May 23, 2023)
57bd3b6: split bins code out + add per-param coef visualization (jjc2718, May 25, 2023)
a998f72: scripts to run stuff on slurm cluster (jjc2718, May 25, 2023)
f7a484e: plot loss values + add liblinear baseline (jjc2718, May 26, 2023)
6c349a3: quantify overfitting in all script (jjc2718, May 26, 2023)
6cca544: clean up slightly (jjc2718, May 26, 2023)
ba72543: add constant search to simulation script (jjc2718, May 26, 2023)
c00f8f2: changes from code review (jjc2718, Jun 1, 2023)
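
The PR's headline features (a PyTorch logistic regression fit with SGD, per-epoch loss logging, and learning-rate schedule comparisons) live mostly in the notebooks listed below, whose diffs are too large to render. For orientation only, a minimal sketch of that kind of setup could look like the following; the data, hyperparameters, and choice of schedule here are invented for illustration, not taken from this PR's code.

import torch

# toy data: 100 samples, 20 features, binary labels (illustrative only)
X = torch.randn(100, 20)
y = torch.randint(0, 2, (100,)).float()

# logistic regression = one linear layer; the sigmoid is folded into
# the numerically stabler BCE-with-logits loss
model = torch.nn.Linear(20, 1)
loss_fn = torch.nn.BCEWithLogitsLoss()

lasso_param = 0.01  # illustrative L1 strength; the PR sweeps a range of these
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# one schedule to compare against a constant learning rate: exponential decay
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

losses = []
for epoch in range(200):
    optimizer.zero_grad()
    # L1 (lasso) penalty on the weights encourages a sparse solution
    loss = (loss_fn(model(X).squeeze(), y)
            + lasso_param * model.weight.abs().sum())
    loss.backward()
    optimizer.step()
    scheduler.step()
    losses.append(loss.item())  # save per-epoch loss values, as the commits above describe

The notebooks then plot curves like `losses` and compare held-out AUPR against a scikit-learn liblinear baseline.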
Files changed:

01_stratified_classification/lasso_range_all_optimizers.ipynb: 1,408 changes (1,059 additions, 349 deletions; large diff not rendered by default)
01_stratified_classification/lasso_range_gene_bins.ipynb: 1,737 changes (1,737 additions, 0 deletions; large diff not rendered by default)
01_stratified_classification/lasso_range_gene_learning_rate.ipynb: 594 changes (594 additions, 0 deletions; large diff not rendered by default)
01_stratified_classification/lasso_range_gene_optimizers.ipynb: 2,451 changes (824 additions, 1,627 deletions; large diff not rendered by default)
01_stratified_classification/nbconverted/lasso_range_all_optimizers.py: 148 changes (140 additions, 8 deletions)

@@ -31,17 +31,16 @@
 ll_results_dir = os.path.join(
-    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_ll', 'gene'
+    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_ll_lr_range', 'gene'
 )
 
 sgd_results_dir = os.path.join(
-    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_sgd', 'gene'
+    cfg.repo_root, '01_stratified_classification', 'results', 'optimizer_compare_sgd_lr_constant_search', 'gene'
 )
 
 plot_gene = 'EGFR'
 metric = 'aupr'
 
-output_plots = True
+output_plots = False
 output_plots_dir = os.path.join(
     cfg.repo_root, '01_stratified_classification', 'optimizers_plots'
 )
@@ -282,15 +281,15 @@ def get_top_optimizer_diff(gene):
 # In[12]:
 
 
-sns.set({'figure.figsize': (10, 2.75)})
+sns.set({'figure.figsize': (10, 3)})
 sns.set_style('whitegrid')
 
-sns.histplot(all_top_optimizer_diff_df.ll_sgd_diff, binwidth=0.025, binrange=(-0.1, 0.45))
-plt.xlim(-0.1, 0.45)
+sns.histplot(all_top_optimizer_diff_df.ll_sgd_diff, binwidth=0.0125, binrange=(-0.2, 0.2))
+plt.xlim(-0.2, 0.2)
 plt.title('Differences between liblinear and SGD optimizers, across all Vogelstein genes', size=14, y=1.05)
 plt.xlabel('AUPR(liblinear) - AUPR(SGD)', size=13)
 plt.ylabel('Count', size=13)
-plt.yticks(np.arange(0, 15, 2))
+plt.yticks(np.arange(0, 25, 5))
 plt.gca().axvline(0, color='black', linestyle='--')
 
 if output_plots:
@@ -405,3 +404,136 @@ def color_boxes(ax):
if output_plots:
    plt.savefig(os.path.join(output_plots_dir, f'all_optimizers_coef_count_per_gene.png'), bbox_inches='tight')

# ### Get difference between "best" and "largest" (least regularized) models
#
# We can use this to determine how much a model overfits: that is, as model complexity increases, how much (or if at all) performance decreases from its peak.

# In[19]:


def get_top_largest_diff(gene):
    # TODO: put some of repeated code in functions
    ll_top_lasso_param = ll_top_df.loc[gene, 'lasso_param']
    sgd_top_lasso_param = sgd_top_df.loc[gene, 'lasso_param']

    ll_mean_test_perf_df = (
        ll_perf_df[(ll_perf_df.gene == gene) &
                   (ll_perf_df.data_type == 'test') &
                   (ll_perf_df.signal == 'signal')]
          .groupby(['lasso_param'])
          .agg(np.mean)
          .drop(columns=['seed', 'fold'])
          .rename(columns={'auroc': 'mean_auroc', 'aupr': 'mean_aupr'})
    )
    sgd_mean_test_perf_df = (
        sgd_perf_df[(sgd_perf_df.gene == gene) &
                    (sgd_perf_df.data_type == 'test') &
                    (sgd_perf_df.signal == 'signal')]
          .groupby(['lasso_param'])
          .agg(np.mean)
          .drop(columns=['seed', 'fold'])
          .rename(columns={'auroc': 'mean_auroc', 'aupr': 'mean_aupr'})
    )

    # use maximum liblinear parameter = largest (least regularized) model
    ll_top_largest_diff = (
        ll_mean_test_perf_df.loc[ll_top_lasso_param, 'mean_aupr'] -
        ll_mean_test_perf_df.loc[ll_perf_df.lasso_param.max(), 'mean_aupr']
    )
    # use minimum SGD parameter = largest (least regularized) model
    sgd_top_largest_diff = (
        sgd_mean_test_perf_df.loc[sgd_top_lasso_param, 'mean_aupr'] -
        sgd_mean_test_perf_df.loc[sgd_perf_df.lasso_param.min(), 'mean_aupr']
    )

    return [gene,
            ll_top_lasso_param,
            sgd_top_lasso_param,
            ll_top_largest_diff,
            sgd_top_largest_diff]

print(get_top_largest_diff('PTEN'))


# In[20]:


all_top_largest_diff_df = []

for gene in ll_perf_df.gene.unique():
    all_top_largest_diff_df.append(
        get_top_largest_diff(gene)
    )

all_top_largest_diff_df = pd.DataFrame(
    all_top_largest_diff_df,
    columns=['gene',
             'll_top_lasso_param',
             'sgd_top_lasso_param',
             'll_top_largest_diff',
             'sgd_top_largest_diff']
)

all_top_largest_diff_df['largest_diff'] = 'liblinear'
all_top_largest_diff_df.loc[
    all_top_largest_diff_df.ll_top_largest_diff < all_top_largest_diff_df.sgd_top_largest_diff,
    'largest_diff'
] = 'SGD'
# this probably won't happen but just in case
all_top_largest_diff_df.loc[
    all_top_largest_diff_df.ll_top_largest_diff == all_top_largest_diff_df.sgd_top_largest_diff,
    'largest_diff'
] = 'equal'

print(all_top_largest_diff_df.largest_diff.value_counts())
all_top_largest_diff_df.head()


# In[21]:


plot_df = (all_top_largest_diff_df
    .rename(columns={'ll_top_largest_diff': 'liblinear',
                     'sgd_top_largest_diff': 'SGD'})
    .melt(
        id_vars=['gene', 'll_top_lasso_param', 'sgd_top_lasso_param', 'largest_diff'],
        var_name='optimizer',
        value_name='top_largest_diff'
    )
)

plot_df.head()


# In[22]:


sns.set({'figure.figsize': (7, 3)})
sns.set_style('whitegrid')

sns.violinplot(data=plot_df, x='optimizer', y='top_largest_diff', cut=0)
plt.title('Difference between best and largest model, across genes', y=1.05)
plt.xlabel('Optimizer')
plt.ylabel('AUPR(best model) - AUPR(largest model)')


# In[23]:


# genes that overfit the most using liblinear
(all_top_largest_diff_df
  .sort_values(by='ll_top_largest_diff', ascending=False)
  .head(10)
)


# In[24]:


# genes that overfit the most using SGD
(all_top_largest_diff_df
  .sort_values(by='sgd_top_largest_diff', ascending=False)
  .head(10)
)
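
For readers skimming the cells above: the overfitting measure they compute reduces to a small per-gene calculation, best test AUPR minus the test AUPR of the least-regularized ("largest") model. A self-contained toy version, with invented numbers rather than results from this PR:

import pandas as pd

# hypothetical mean test performance for one gene across lasso parameters
perf_df = pd.DataFrame({
    'lasso_param': [0.001, 0.01, 0.1, 1.0],
    'mean_aupr':   [0.62,  0.71, 0.68, 0.55],
})

best_aupr = perf_df.mean_aupr.max()
# liblinear convention: the maximum parameter is the largest (least
# regularized) model; for SGD's alpha the convention flips to the minimum
largest_aupr = perf_df.loc[perf_df.lasso_param.idxmax(), 'mean_aupr']

# positive values mean performance drops as regularization is relaxed,
# i.e. the largest model overfits relative to the best one
print(best_aupr - largest_aupr)  # ~0.16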
