# CaFe relation for giants

In [None]:
from matplotlib import pyplot as plt

import numpy as np 

import pandas as pd

from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from utils.prepare_data import clean_data, filter_rows_by_std, get_magnitude_diffs, join_rows_from_raw, load_data


# Plot styling shared by every figure in the notebook.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so take the first name this installation knows
# and fall back to matplotlib's default style if neither is available.
_darkgrid_styles = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next((s for s in _darkgrid_styles if s in plt.style.available), 'default'))
plt.rcParams["figure.figsize"] = (10, 10)

## Prepare the data for the modelling

In [None]:
# Column of the loaded table that the models below are trained to predict.
target_variable = 'cafe'
# Load the table through the project helper.
df = load_data()

In [None]:
# Basic cleaning of the loaded table.
df = clean_data(df)

# Magnitude columns, in the order handed to get_magnitude_diffs, plus the
# explicit bpmag - rpmag difference.
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'bpmag', 'gmag', 'rpmag', 'jmag', 'kmag']
df_diffs = get_magnitude_diffs(df, ordered_mag_columns)
df_diffs['bpmag_rpmag'] = df['bpmag'] - df['rpmag']

# The same 0.05 limit for every standard-deviation column passed to
# filter_rows_by_std; rows that still contain Na's afterwards are dropped.
std_thresholds = dict.fromkeys(['sigcr3', 'sigbr3', 'sigar3', 'ejmag', 'ekmag'], 0.05)
df_diffs_filtered = filter_rows_by_std(df_diffs, df, std_thresholds=std_thresholds)
df_diffs_filtered = df_diffs_filtered.dropna()

In [None]:
# Shapes of the cleaned table and of the filtered magnitude-difference table.
df.shape, df_diffs_filtered.shape

### Field shifts from Bayes model

In [None]:
# Per-field shift that is later subtracted from the magcr3 - magbr3 difference.
# The value is looked up from the 'field' column.
# NOTE(review): fields that are not listed here (e.g. 'Field-7') presumably map
# to NaN through Series.map - confirm that no such rows reach the model.
df['cb_field_shift'] = df['field'].map({
    'Field-1': 0.3319922760263798,
    'Field-2': 0.41525421695893566,
    'Field-3': 0.4374036431066466,
    'Field-4': 0.41137730050813004,
    'Field-5': 0.35876049518667247,
    'Field-6': 0.42670825326670114,
    'Field-8': 0.4220114380659433,
    'Field-9': 0.38748489552795107,
    'Field-10': 0.38493277215437155,
    'Field-11': 0.3910498880361436,
    'Field-15': 0.37523785738068216,
    'Field-16': 0.37873686102483073,
    'Field-17': 0.37748938994827175,
    'Field-18': 0.3650478096668249,
    'Field-19': 0.39866733152517186,
    'Field-20': 0.39821298519891424,
    'Field-24': 0.42353001068319074,
    'Field-25': 0.432486767263635,
    'Field-27': 0.40814162530040954,
    'Field-28': 0.37364547999152314,
    'Field-29': 0.4262784749766956,
    'Field-32': 0.4809766006917441,
    'Field-33': 0.42479356859998646,
    'Field-34': 0.4177341235655455,
    'Field-35': 0.45922559449923883,
    'Field-38': 0.5008953767354531,
    'Field-39': 0.3607422231174212,
    'Field-40': 0.3808902958085871,
    'Field-41': 0.3588893561248595,
    'Field-42': 0.36663408584866286,
    'Field-45': 0.3572882366819113,
    'Field-46': 0.36898328437951294,
    'Field-47': 0.40957565746657026,
})

# Per-field shift that is later subtracted from the magbr3 - magar3 difference;
# same lookup on the 'field' column and the same set of fields as above.
df['ba_field_shift'] = df['field'].map({
    'Field-1': 0.18080835515053956,
    'Field-2': 0.19971150768603052,
    'Field-3': 0.16673795188561255,
    'Field-4': 0.22764694756560977,
    'Field-5': 0.16393258009286565,
    'Field-6': 0.2218356785159324,
    'Field-8': 0.21095491325215732,
    'Field-9': 0.19161996881699248,
    'Field-10': 0.1837026412942269,
    'Field-11': 0.18115561005808317,
    'Field-15': 0.15200887755485093,
    'Field-16': 0.1656647564454164,
    'Field-17': 0.1667862329753091,
    'Field-18': 0.13141335282697575,
    'Field-19': 0.12365755419411407,
    'Field-20': 0.23729289979478213,
    'Field-24': 0.15642818357320198,
    'Field-25': 0.1840500586940485,
    'Field-27': 0.22263591110330505,
    'Field-28': 0.14583983959304825,
    'Field-29': 0.1633080989641097,
    'Field-32': 0.10157314049954563,
    'Field-33': 0.1320147960009749,
    'Field-34': 0.14638373988925385,
    'Field-35': 0.1275999413428845,
    'Field-38': 0.12160791831422073,
    'Field-39': 0.17665871089194196,
    'Field-40': 0.1635198872846553,
    'Field-41': 0.2009430095532145,
    'Field-42': 0.1866283299753808,
    'Field-45': 0.1722000325556393,
    'Field-46': 0.15581685616524635,
    'Field-47': 0.22921627154498236,
})

In [None]:
# Predictors are the filtered magnitude differences; every target below is
# pulled from df for exactly those row labels.
X = df_diffs_filtered
kept_rows = X.index
y = df.loc[kept_rows, target_variable]
y_sum = df.loc[kept_rows, 'cafe'] + df.loc[kept_rows, 'feh']

# giant indicator: True where logg is below 3.5
giant_indicator = df.loc[kept_rows, 'logg'] < 3.5

## Modelling

### Model selection

In [None]:
# Four magnitude differences, joined with logg and teff from the raw table and
# restricted to the rows flagged by giant_indicator.
selection_columns = ['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']
X = join_rows_from_raw(df_diffs_filtered[selection_columns], df, ['logg', 'teff'])
X = X[giant_indicator]
# Targets aligned to the remaining rows.
giant_rows = X.index
y = df.loc[giant_rows, target_variable]
y_sum = df.loc[giant_rows, 'cafe'] + df.loc[giant_rows, 'feh']

In [None]:
# Repeated K-fold cross-validation (8 splits, 4 repeats) with a fixed seed so the
# folds are the same on every run.
cv = RepeatedKFold(n_splits=8, n_repeats=4, random_state=316)

# define the pipeline to evaluate
pipeline = Pipeline(steps=[
    ('normalizer', StandardScaler()),
    # NOTE(review): mutual_info_regression is left unseeded here, so the feature
    # scores may still vary slightly between runs - consider seeding it as well.
    ('feature_selector', SelectKBest(score_func=mutual_info_regression)),
    # Seeded like the final models further down; without random_state the weight
    # initialisation differs on every run and the search is not reproducible.
    ('model', MLPRegressor(max_iter=4000, random_state=316))
])
# With target transform pipeline
# pipeline = Pipeline(steps=[
#     ('normalizer', StandardScaler()),
#     ('feature_selector', SelectKBest(score_func=mutual_info_regression)),
#     ('model', TransformedTargetRegressor(regressor=MLPRegressor(max_iter=4000), transformer=PowerTransformer()))
# ])
# define the grid : keep between 2 and 6 features (X is built from four magnitude
# differences joined with logg and teff)
grid = {
    'feature_selector__k': list(range(2, 7)),
    'model__hidden_layer_sizes': [(9,), (12,), (16,)],
    'model__activation': ['logistic', 'tanh', 'relu'],
    'model__solver': [
        'lbfgs',
        # 'sgd',
        # 'adam'
    ],
    # 'model__regressor__hidden_layer_sizes': [(9,), (12,), (16,)],
    # 'model__regressor__activation': ['logistic', 'tanh', 'relu'],
    # 'model__regressor__solver': ['lbfgs', 'sgd', 'adam'],
}
# define the grid search (scored by negative mean absolute error, all cores)
search = GridSearchCV(
    pipeline,
    grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    # verbose=2
)
# perform the search
results = search.fit(X, y)
# summarize best
print(f'Best nMAE: {results.best_score_:.6f}')
print(f'Best Config: {results.best_params_}')

# Feature selector of the best pipeline: per-feature scores and the kept columns.
best_selector = search.best_estimator_.named_steps['feature_selector']
print('Feature scores:')
for f, sc in zip(X.columns, best_selector.scores_):
    print(f'{f:16s} {sc:.6f}')
print(f'Selected features: {list(X.columns[best_selector.get_support()])}')

In [None]:
# Predictions of the fitted search on the very data it was tuned on, plotted
# against the true values together with a red reference diagonal.
y_predicted = search.predict(X)
plt.scatter(y, y_predicted)
reference_line = [-.2, .5]
plt.plot(reference_line, reference_line, color='r')
in_sample_r2 = r2_score(y, y_predicted)
in_sample_mae = mean_absolute_error(y, y_predicted)
print(in_sample_r2, in_sample_mae)

### Prepare train & test set

In [None]:
# Predictors: four magnitude differences. .copy() gives an independent frame, so
# the column transformations below do not raise SettingWithCopyWarning and can
# never write through to df_diffs_filtered.
X_use = df_diffs_filtered[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']].copy()
# X_use = join_rows_from_raw(X_use, df, ['teff', 'logg'])

# Subtract the per-field shifts and apply the root transforms.
# NOTE(review): the fractional powers return NaN for negative bases - this assumes
# the shifted magcr3_magbr3 + .75, jmag_kmag and bpmag_rpmag are non-negative; verify.
X_use['magcr3_magbr3'] = (X_use['magcr3_magbr3'] - df['cb_field_shift'][X_use.index] + .75)**(1/3)
X_use['magbr3_magar3'] = X_use['magbr3_magar3'] - df['ba_field_shift'][X_use.index]
X_use['jmag_kmag'] = (X_use['jmag_kmag'])**(1/2)
X_use['bpmag_rpmag'] = (X_use['bpmag_rpmag'])**(1/2)

# No standardisation here: the scaler is fitted on the training split only, after
# the predicted teff column has been added to it.

# Keep only the rows flagged by giant_indicator.
X_use = X_use[giant_indicator]

# logg and teff are well predictable from X_use, while CaFe and FeH are pure :/
# The target is taken straight from df for the rows of X_use, so this cell does
# not depend on which earlier cell last assigned `y`.
y_use = df.loc[X_use.index, target_variable]
# y_use = np.cbrt(y_use)
# y_use = y_use**3

X_train, X_test, y_train, y_test = train_test_split(X_use, y_use, test_size=0.25, random_state=314)

### Fit models for teff & logg

In [None]:
# loggflag = df['loggflag'][X_train.index]>.3*10**8

# Linear models that predict teff and logg from the training predictors.
# NOTE: this cell replaces X_train (adds 'teff', then standardises), so it is not
# idempotent - re-create the train/test split before running it again.
teff = df['teff'][X_train.index]
m_teff = LinearRegression()
m_teff.fit(X_train, teff)
teff_calc = m_teff.predict(X_train)
logg = df['logg'][X_train.index]
m_logg = LinearRegression()
m_logg.fit(X_train, logg)
# m_logg.fit(X_train[~loggflag], logg[~loggflag])
logg_calc = m_logg.predict(X_train)

# Add the predicted teff as an extra feature. assign() builds a new frame; X_train
# is a slice produced by train_test_split and assigning a column on it in place
# triggers SettingWithCopyWarning.
X_train = X_train.assign(teff=teff_calc)
# X_train['logg'] = logg_calc

# Standardise the training predictors; the fitted normalizer is reused for the
# test set later on. Index and column labels are kept.
normalizer = StandardScaler()
X_train = pd.DataFrame(
    data=normalizer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

### Inspect models for teff & logg

In [None]:
def _plot_true_vs_predicted(ax, true, predicted, flagged, title, limits, alpha):
    """Scatter predicted against true values and overlay the flagged rows in red.

    ax        -- matplotlib axes to draw on
    true      -- pandas Series of true values
    predicted -- array of predictions, in the same row order as ``true``
    flagged   -- boolean mask (same row order) of the points to highlight
    title     -- axes title
    limits    -- [min, max] end points of the red identity line
    alpha     -- opacity of the highlighted points
    """
    flagged = np.asarray(flagged, dtype=bool)
    predicted = np.asarray(predicted)
    ax.scatter(true, predicted)
    ax.scatter(true[flagged], predicted[flagged], color='r', alpha=alpha)
    ax.set_title(title)
    ax.set_xlabel('true')
    ax.set_ylabel('predicted')
    ax.plot(limits, limits, color='r')


alpha = .4

# mask = (df.magbr3 < df.magar3)*(df.magbr3 > df.magcr3)
# alpha = 1

fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))

# One row of panels per quantity: (column, fitted model, train predictions, flag mask).
# X_test must still hold only the columns the linear models were fitted on, i.e.
# this cell has to run before the predicted teff is appended to X_test.
panels = [
    ('logg', m_logg, logg_calc, df.loggflag > .6*10**8),
    ('teff', m_teff, teff_calc, df.teffflag > 100),
]
for row, (quantity, model, train_predicted, mask) in enumerate(panels):
    true_values = df[quantity]
    # Identity-line range: true values of all used rows and the train predictions.
    y_min = min(min(true_values[X_use.index]), min(train_predicted))
    y_max = max(max(true_values[X_use.index]), max(train_predicted))
    _plot_true_vs_predicted(
        axes[row, 0], true_values[X_train.index], train_predicted, mask[X_train.index],
        f'{quantity} from magnitude diffs on the train set', [y_min, y_max], alpha,
    )
    # Predict once per panel (the original predicted twice).
    _plot_true_vs_predicted(
        axes[row, 1], true_values[X_test.index], model.predict(X_test), mask[X_test.index],
        f'{quantity} from magnitude diffs on the test set', [y_min, y_max], alpha,
    )


### Fit & test model for CaFe

In [None]:
def tf(x):
    """Forward target transform: cube root of the value shifted by 0.25.

    np.cbrt is used instead of ``(x + .25)**(1/3)`` because the fractional power
    yields NaN (numpy/pandas) or a complex number (plain Python float) for
    x < -0.25, whereas the real cube root is defined there and keeps
    ``tf_inv(tf(x)) == x`` for every real x.

    Accepts scalars, numpy arrays and pandas Series.
    """
    return np.cbrt(x + .25)


def tf_inv(x):
    """Inverse of ``tf``: cube the value and remove the 0.25 shift."""
    return x**3 - .25

In [None]:
# Extend test set predictors with predicted teff, logg
# NOTE: this cell replaces X_test (adds 'teff', then standardises), so it is not
# idempotent - re-create the train/test split before running it again.
teff_test = m_teff.predict(X_test)
logg_test = m_logg.predict(X_test)
# assign() builds a new frame; X_test is a slice produced by train_test_split and
# assigning a column on it in place triggers SettingWithCopyWarning.
X_test = X_test.assign(teff=teff_test)
# X_test['logg'] = logg_test

# Apply the scaler fitted on the training set; index and column labels are kept.
X_test = pd.DataFrame(
    data=normalizer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Switch for fitting on the tf-transformed target instead of the raw one.
transform = False
if transform:
    y_train_transformed = tf(y_train)
else:
    y_train_transformed = y_train

# Both splits are reduced to the same four predictor columns.
cafe_columns = ['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'teff']
X_train = X_train[cafe_columns]
X_test = X_test[cafe_columns]

# nn_model = MLPRegressor(hidden_layer_sizes=(16,), activation='relu', solver='lbfgs', max_iter=4000, random_state=316)
# Ca/Fe directly
nn_model = MLPRegressor(
    hidden_layer_sizes=(12,),
    activation='tanh',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
)
nn_model.fit(X_train, y_train_transformed)

In [None]:
# Network output for both splits, mapped back through tf_inv only when the model
# was fitted on the transformed target.
if transform:
    y_train_predicted = tf_inv(nn_model.predict(X_train))
    y_test_predicted = tf_inv(nn_model.predict(X_test))
else:
    y_train_predicted = nn_model.predict(X_train)
    y_test_predicted = nn_model.predict(X_test)

# Common axis range spanning every true and predicted value.
plotted_values = (y_train, y_test, y_train_predicted, y_test_predicted)
y_min = min(min(values) for values in plotted_values)
y_max = max(max(values) for values in plotted_values)
# y_min, y_max = -.1, .4

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# axes.grid(True)  # plt.rc('grid', linestyle=':', color='red', linewidth=2)

# Train points in blue, test points in red, dotted green identity line.
for true_values, predicted_values, colour, split in (
    (y_train, y_train_predicted, 'b', 'train'),
    (y_test, y_test_predicted, 'r', 'test'),
):
    axes.scatter(true_values, predicted_values, color=colour, alpha=.6, label=split)
axis_range = [y_min, y_max]
axes.plot(axis_range, axis_range, color='g', linestyle=':')
axes.set_xlim(axis_range)
axes.set_ylim(axis_range)
axes.set_title('True vs. predicted CaFe on train and test set')
axes.set_xlabel('true')
axes.set_ylabel('predicted')
axes.legend()

print(f'Mean absolute error on train set: {mean_absolute_error(y_true=y_train, y_pred=y_train_predicted)}')
print(f'r2 score on train set: {r2_score(y_true=y_train, y_pred=y_train_predicted)}')

print(f'Mean absolute error on test set: {mean_absolute_error(y_true=y_test, y_pred=y_test_predicted)}')
print(f'r2 score on test set: {r2_score(y_true=y_test, y_pred=y_test_predicted)}')

### Statistics

In [None]:
# Error distribution: signed error (predicted - true), one histogram per split.
train_errors = y_train_predicted - y_train
test_errors = y_test_predicted - y_test

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
train_axis, test_axis = axes

train_axis.hist(train_errors, color='b', bins=25)
train_axis.set_title('Error distribution on train set')
test_axis.hist(test_errors, color='r', bins=25)
test_axis.set_title('Error distribution on test set')

# 16th and 84th percentile of the signed error on each split.
print('Train 16-84 percentile: ', np.percentile(train_errors, [16, 84]))
print('Test 16-84 percentile: ', np.percentile(test_errors, [16, 84]))

In [None]:
# Edges of the bins (in predicted CaFe) in which the error percentiles are computed:
# 10 edges give 9 bins.
steps = np.linspace(start=-0.05, stop=.175, num=10)

y_all = pd.concat([y_train, y_test])
y_all_predicted = np.concatenate([y_train_predicted, y_test_predicted])
# Signed error (predicted - true), in the same row order as y_all_predicted.
errors_all = y_all_predicted - y_all.to_numpy()

xx, yy16, yy84 = [], [], []
for lower, upper in zip(steps[:-1], steps[1:]):
    xx.append((lower + upper) / 2)
    # Alternative: bin by the true value, i.e. (y_all > lower) & (y_all <= upper)
    in_bin = (y_all_predicted > lower) & (y_all_predicted <= upper)
    if in_bin.any():
        p16, p84 = np.percentile(errors_all[in_bin], [16, 84])
    else:
        # An empty bin has no percentiles; NaN leaves a gap in the plotted line.
        p16 = p84 = np.nan
    yy16.append(p16)
    yy84.append(p84)

# Fixed axis range for this figure.
y_min, y_max = -.1, .4

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

axes.scatter(y_train, y_train_predicted, color='b', alpha=.6, label='train')
axes.scatter(y_test, y_test_predicted, color='r', alpha=.6, label='test')
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])

# Errors
# axes.plot(xx, [x0+y0 for x0, y0 in zip(xx, yy16)], color='magenta', label='16th percentile', linewidth=3)
# axes.plot(xx, [x0+y0 for x0, y0 in zip(xx, yy84)], color='orange', label='84th percentile', linewidth=3)
# NOTE(review): the bins are taken over the predicted value and the error is
# predicted - true, so the true value at a given percentile would be xx - error;
# the curves below plot xx + error, which mirrors the band about the diagonal -
# confirm which side is intended.
axes.plot([x0+y0 for x0, y0 in zip(xx, yy16)], xx, color='magenta', label='16th percentile', linewidth=3)
axes.plot([x0+y0 for x0, y0 in zip(xx, yy84)], xx, color='orange', label='84th percentile', linewidth=3)

# The scatter shows the CaFe model (y_train / y_test), not FeH.
axes.set_title('True vs. predicted CaFe on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

print(f'Mean absolute error on train set: {mean_absolute_error(y_true=y_train, y_pred=y_train_predicted)}')
print(f'r2 score on train set: {r2_score(y_true=y_train, y_pred=y_train_predicted)}')

print(f'Mean absolute error on test set: {mean_absolute_error(y_true=y_test, y_pred=y_test_predicted)}')
print(f'r2 score on test set: {r2_score(y_true=y_test, y_pred=y_test_predicted)}')

print(f'Dataset split into {len(steps) - 1} bins of the predicted value; the 16th and 84th percentile of the error '
      'are calculated in each bin and the percentile lines are plotted to the graph.')

### CaFe + FeH and combined

In [None]:
# CaFe + FeH targets for the rows of the existing train/test split.
train_rows, test_rows = y_train.index, y_test.index
y_sum_train = y_sum[train_rows]
y_sum_test = y_sum[test_rows]

# Switch for fitting on the tf-transformed target instead of the raw one.
transform = False
if transform:
    y_sum_train_transformed = tf(y_sum_train)
else:
    y_sum_train_transformed = y_sum_train

# nn_model = MLPRegressor(hidden_layer_sizes=(16,), activation='relu', solver='lbfgs', max_iter=4000, random_state=316)
# Ca/Fe directly
# nn_model = MLPRegressor(hidden_layer_sizes=(12,), activation='tanh', solver='lbfgs', max_iter=4000, random_state=316)
# Ca/Fe + Fe/H
# (this rebinds nn_model, replacing the model fitted earlier)
nn_model = MLPRegressor(
    hidden_layer_sizes=(9,),
    activation='relu',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
)
# Use only for Ca/Fe directly
# X_train = X_train[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'teff']]
# X_test = X_test[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'teff']]
nn_model.fit(X_train, y_sum_train_transformed)

In [None]:
# Predictions of the CaFe + FeH model on both splits, mapped back through tf_inv
# only when the model was fitted on the transformed target.
y_sum_train_predicted = tf_inv(nn_model.predict(X_train)) if transform else nn_model.predict(X_train)
y_sum_test_predicted = tf_inv(nn_model.predict(X_test)) if transform else nn_model.predict(X_test)
# Common axis range spanning every true and predicted value.
y_min = min(min(y_sum_train), min(y_sum_test), min(y_sum_train_predicted), min(y_sum_test_predicted))
y_max = max(max(y_sum_train), max(y_sum_test), max(y_sum_train_predicted), max(y_sum_test_predicted))
# y_min, y_max = -.1, .4

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# axes.grid(True)  # plt.rc('grid', linestyle=':', color='red', linewidth=2)

axes.scatter(y_sum_train, y_sum_train_predicted, color='b', alpha=.6, label='train')
axes.scatter(y_sum_test, y_sum_test_predicted, color='r', alpha=.6, label='test')
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
# The plotted target is the sum CaFe + FeH, not CaFe alone.
axes.set_title('True vs. predicted CaFe + FeH on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

print(f'Mean absolute error on train set: {mean_absolute_error(y_true=y_sum_train, y_pred=y_sum_train_predicted)}')
print(f'r2 score on train set: {r2_score(y_true=y_sum_train, y_pred=y_sum_train_predicted)}')

print(f'Mean absolute error on test set: {mean_absolute_error(y_true=y_sum_test, y_pred=y_sum_test_predicted)}')
print(f'r2 score on test set: {r2_score(y_true=y_sum_test, y_pred=y_sum_test_predicted)}')

In [None]:
# FeH target recovered as (CaFe + FeH) - CaFe on each split.
sum_on_train = y_sum[y_train.index]
sum_on_test = y_sum[y_test.index]
y_feh_train = sum_on_train - y_train
y_feh_test = sum_on_test - y_test

# (this rebinds nn_model, replacing the model fitted earlier)
nn_model = MLPRegressor(
    hidden_layer_sizes=(9,),
    activation='logistic',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
)
#X_train = X_train[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag', 'teff']]
#X_test = X_test[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag', 'teff']]
nn_model.fit(X_train, y_feh_train)

In [None]:
# Predictions of the FeH model on both splits.
y_feh_train_predicted = nn_model.predict(X_train)
y_feh_test_predicted = nn_model.predict(X_test)

# Common axis range spanning every true and predicted value.
feh_values = (y_feh_train, y_feh_test, y_feh_train_predicted, y_feh_test_predicted)
y_min = min(min(values) for values in feh_values)
# y_min = -1
y_max = max(max(values) for values in feh_values)

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# Train points in blue, test points in red, dotted green identity line.
for true_values, predicted_values, colour, split in (
    (y_feh_train, y_feh_train_predicted, 'b', 'train'),
    (y_feh_test, y_feh_test_predicted, 'r', 'test'),
):
    axes.scatter(true_values, predicted_values, color=colour, alpha=.6, label=split)
axis_range = [y_min, y_max]
axes.plot(axis_range, axis_range, color='g', linestyle=':')
axes.set_xlim(axis_range)
axes.set_ylim(axis_range)
axes.set_title('True vs. predicted FeH on train and test set')
axes.set_xlabel('true')
axes.set_ylabel('predicted')
axes.legend()

print(f'Mean absolute error on train set: {mean_absolute_error(y_true=y_feh_train, y_pred=y_feh_train_predicted)}')
print(f'r2 score on train set: {r2_score(y_true=y_feh_train, y_pred=y_feh_train_predicted)}')

print(f'Mean absolute error on test set: {mean_absolute_error(y_true=y_feh_test, y_pred=y_feh_test_predicted)}')
print(f'r2 score on test set: {r2_score(y_true=y_feh_test, y_pred=y_feh_test_predicted)}')

#### Combined -- bad :(

In [None]:
# CaFe recovered as the difference of the two targets, (CaFe + FeH) - FeH, once
# from the true values and once from the two models' predictions.
cafe_train_true = y_sum_train - y_feh_train
cafe_train_combined = y_sum_train_predicted - y_feh_train_predicted
cafe_test_true = y_sum_test - y_feh_test
cafe_test_combined = y_sum_test_predicted - y_feh_test_predicted

# Train points in the default colour, test points in red, green reference diagonal.
plt.scatter(cafe_train_true, cafe_train_combined)
plt.scatter(cafe_test_true, cafe_test_combined, color='r')
plt.plot([-.2, .4], [-.2, .4], color='g')

# r2 and mean absolute error, first on the train set ...
print(r2_score(cafe_train_true, cafe_train_combined))
print(mean_absolute_error(cafe_train_true, cafe_train_combined))

# ... then on the test set.
print(r2_score(cafe_test_true, cafe_test_combined))
print(mean_absolute_error(cafe_test_true, cafe_test_combined))