In [1]:
import os
import sys
# Resolve the directory two levels above the working directory and register it
# on sys.path exactly once (presumably so the `lmmvae` imports below resolve).
module_path = os.path.abspath('../..')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmvae.dim_reduction import run_dim_reduction
from lmmvae.simulation import Count

In [3]:
# Read the news table and discard the columns listed in cols_to_drop.
cols_to_drop = ['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate', 'GooglePlus', 'LinkedIn']
news = pd.read_csv('../../data/news_df2.csv').drop(columns=cols_to_drop)

In [4]:
# Quick look at the loaded frame: dimensions first, then the leading rows.
print(news.shape)
news.head(n=5)

(81634, 179)


Unnamed: 0,SentimentTitle,SentimentHeadline,Facebook,hour,day,month,title_id,source_id,w_palestine,w_president,...,w_congress,w_final,w_white,w_saturday,w_islamic,w_palestinian,topic_economy,topic_microsoft,topic_obama,topic_palestine
0,0.0,-0.005906,0,14,1,2,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.048546,0,9,3,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,-0.243068,0.048546,0,0,3,3,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0.0,-0.132812,0,5,1,11,2,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.082022,0.205537,0,5,1,11,3,2,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# Count the distinct values of each grouping column (used further down as `qs`).
n_cats_source = news['source_id'].unique().size
n_cats_title = news['title_id'].unique().size
print(f'no. of sources: {n_cats_source}')
print(f'no. of titles: {n_cats_title}')

no. of sources: 5475
no. of titles: 72663


In [6]:
# Rename the categorical (grouping) columns to the z<k> names used from here on.
news = news.rename(columns={'source_id': 'z0', 'title_id': 'z1'})

In [7]:
# Lower-case every column name except the reserved special ones:
# t (longitudinal) and D1 / D2 (spatial: longitude, latitude).
news.columns = [col if col in ('D1', 'D2', 't') else col.lower() for col in news.columns]
# List the columns that now start with 't' and with 'z'.
print(list(filter(lambda col: col.startswith('t'), news.columns)))
print(list(filter(lambda col: col.startswith('z'), news.columns)))

['topic_economy', 'topic_microsoft', 'topic_obama', 'topic_palestine']
['z1', 'z0']


In [8]:
# Set the 'facebook' column aside as y, then rebuild the frame without it
# (and without D1 / D2 / t, should they exist), placing z0 and z1 last.
y = news['facebook']
new_cols = [col for col in news.columns if col not in ('D1', 'D2', 'z0', 't', 'z1', 'facebook')]
new_cols += ['z0', 'z1']
news = news.loc[:, new_cols]
news.columns

Index(['sentimenttitle', 'sentimentheadline', 'hour', 'day', 'month',
       'w_palestine', 'w_president', 'w_thursday', 'w_visit', 'w_government',
       ...
       'w_white', 'w_saturday', 'w_islamic', 'w_palestinian', 'topic_economy',
       'topic_microsoft', 'topic_obama', 'topic_palestine', 'z0', 'z1'],
      dtype='object', length=178)

In [9]:
# Final X matrix: cast every column to float64; X and news name the same frame.
X = news = news.astype(np.float64)

In [10]:
# Settings forwarded to run_dim_reduction for LMMVAE and the other methods.
# Options that do not apply to the current use-case are left as None.

# Random-effects configuration.
mode = 'categorical'
RE_cols_prefix = 'z'
qs = [n_cats_source, n_cats_title]
n_sig2bs, n_sig2bs_spatial = 2, 0
est_cors = []
pred_unknown_clusters = False
max_spatial_locs = 100

# Architecture / optimisation settings (as the names suggest; see run_dim_reduction).
n_neurons = [1000, 500]
n_neurons_re = n_neurons  # same list object as n_neurons, not a copy
activation = 'relu'
epochs, batch_size = 200, 1000

# Not used here.
dropout = thresh = patience = None
q_spatial = kernel_root = time2measure_dict = None
U = B_list = None

In [11]:
# Empty results table; iterate_reg_types appends one row per method run.
res = pd.DataFrame(columns=[
    'd', 'beta', 're_prior', 'experiment', 'exp_type',
    'mse_X', 'sigma_b0_est', 'sigma_b1_est', 'n_epoch', 'time',
])
# Shuffled 5-fold splitter with a fixed random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=40)
# Iterator consumed with next(counter) to obtain the row label for each new res row.
counter = Count().gen()
# Feature columns: everything except z0 / z1; the scaled subset also skips D1 / D2 / t.
x_cols = list(filter(lambda col: col not in ('z0', 'z1'), X.columns))
x_cols_to_scale = list(filter(lambda col: col not in ('D1', 'D2', 't'), x_cols))

In [12]:
def iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose):
    """Run every dimension-reduction method on one train/test split and log the results.

    The methods 'lmmvae', 'pca-ignore', 'vae-ignore' and 'vae-embed' are run in
    that order through run_dim_reduction, using the notebook-level settings
    (x_cols, RE_cols_prefix, thresh, epochs, qs, ...) for all remaining arguments.

    Parameters
    ----------
    X_train, X_test
        Train / test data handed unchanged to run_dim_reduction.
    counter
        Iterator; each next(counter) supplies the label of a new row in `res`.
    d, beta, re_prior
        Forwarded to run_dim_reduction and stored in the result rows.
    i
        Split number, stored in the 'experiment' column.
    verbose
        Forwarded to run_dim_reduction.

    Side effects
    ------------
    Prints one progress line per method, appends four rows to the global `res`
    (only after all four methods have finished) and rewrites 'res_news.csv'.
    Returns None.
    """
    methods = ['lmmvae', 'pca-ignore', 'vae-ignore', 'vae-embed']
    rows = []
    for method in methods:
        # A single call site keeps the long positional argument list in one place
        # instead of four hand-synchronised copies.
        mse, sigmas, _, n_epochs, runtime = run_dim_reduction(
            X_train, X_test, x_cols, RE_cols_prefix, d, method,
            thresh, epochs, qs, q_spatial, n_sig2bs, n_sig2bs_spatial, est_cors, batch_size,
            patience, n_neurons, n_neurons_re, dropout, activation, mode, beta, re_prior,
            kernel_root, pred_unknown_clusters, max_spatial_locs, time2measure_dict,
            verbose, U, B_list)
        print('   finished %s, mse: %.3f' % (method, mse))
        # Only the lmmvae run's sigma estimates are recorded; the others get NaN.
        if method == 'lmmvae':
            sigma_b0_est, sigma_b1_est = sigmas[1][0], sigmas[1][1]
        else:
            sigma_b0_est, sigma_b1_est = np.nan, np.nan
        rows.append([d, beta, re_prior, i, method, mse, sigma_b0_est, sigma_b1_est, n_epochs, runtime])
    # Record rows only once every method has completed, then checkpoint to disk.
    for row in rows:
        res.loc[next(counter)] = row
    res.to_csv('res_news.csv')

In [13]:
# Hyper-parameter grid: every (beta, d, re_prior) combination is evaluated.
betas, ds, re_priors = [0.001], [1, 2, 5], [0.1]

In [15]:
# Evaluate every (beta, d, re_prior) combination with 5-fold cross-validation.
for beta in betas:
    for d in ds:
        for re_prior in re_priors:
            print(f'beta: {beta}, d: {d}, re_prior: {re_prior}:')
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                print('  iteration %d' % i)
                # KFold.split yields positional row indices, so rows are selected with
                # .iloc; label-based .loc is only equivalent for a default RangeIndex.
                X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
                # Fit the scaler on the training fold only, then apply it to the test fold.
                scaler = StandardScaler()
                X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
                X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
                iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose=False)