# FeH relation for giants

In [None]:
from matplotlib import pyplot as plt

import numpy as np

import pandas as pd

from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RepeatedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from utils.prepare_data import clean_data, filter_rows_by_std, get_magnitude_diffs, join_rows_from_raw, load_data


# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
# Default to square 10x10 inch figures for every plot in the notebook.
plt.rcParams["figure.figsize"] = (10, 10)

## Prepare the data for the modelling

In [None]:
# Column to predict and the magnitude differences used as base predictors.
target_variable = 'feh'
target_predictors_base = [
    'magbr3_magar3',
    'magcr3_magbr3',
    'jmag_kmag',
    'bpmag_rpmag',
]
# teff and logg act as further predictors later on; they are derived from the base predictors.

# Read the input table through the project helper.
df = load_data()

In [None]:
# Basic cleaning via the project helper (rebinds df to the cleaned table).
df = clean_data(df)

# Magnitude differences are built from this ordered column list; Bp-Rp is added explicitly.
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'bpmag', 'gmag', 'rpmag', 'jmag', 'kmag']
df_diffs = get_magnitude_diffs(df, ordered_mag_columns)
df_diffs['bpmag_rpmag'] = df['bpmag'].sub(df['rpmag'])

# Apply the same 0.05 threshold to each listed uncertainty column, then drop rows with NaNs.
df_diffs_filtered = filter_rows_by_std(
    df_diffs,
    df,
    std_thresholds=dict.fromkeys(['sigcr3', 'sigbr3', 'sigar3', 'ejmag', 'ekmag'], 0.05),
).dropna()

In [None]:
# Table dimensions before and after the quality filter.
(df.shape, df_diffs_filtered.shape)

### Field shifts from Bayes model

In [None]:
# Per-field offsets taken from the Bayes model (see the section heading), stored as
# new columns of df by mapping each row's 'field' label to a constant.
# Rows whose field label is missing from a mapping receive NaN in that column.

# Offset later subtracted from the magcr3_magbr3 difference.
df['cb_field_shift'] = df['field'].map({
    'Field-1': 0.3319922760263798,
    'Field-2': 0.41525421695893566,
    'Field-3': 0.4374036431066466,
    'Field-4': 0.41137730050813004,
    'Field-5': 0.35876049518667247,
    'Field-6': 0.42670825326670114,
    'Field-8': 0.4220114380659433,
    'Field-9': 0.38748489552795107,
    'Field-10': 0.38493277215437155,
    'Field-11': 0.3910498880361436,
    'Field-15': 0.37523785738068216,
    'Field-16': 0.37873686102483073,
    'Field-17': 0.37748938994827175,
    'Field-18': 0.3650478096668249,
    'Field-19': 0.39866733152517186,
    'Field-20': 0.39821298519891424,
    'Field-24': 0.42353001068319074,
    'Field-25': 0.432486767263635,
    'Field-27': 0.40814162530040954,
    'Field-28': 0.37364547999152314,
    'Field-29': 0.4262784749766956,
    'Field-32': 0.4809766006917441,
    'Field-33': 0.42479356859998646,
    'Field-34': 0.4177341235655455,
    'Field-35': 0.45922559449923883,
    'Field-38': 0.5008953767354531,
    'Field-39': 0.3607422231174212,
    'Field-40': 0.3808902958085871,
    'Field-41': 0.3588893561248595,
    'Field-42': 0.36663408584866286,
    'Field-45': 0.3572882366819113,
    'Field-46': 0.36898328437951294,
    'Field-47': 0.40957565746657026,
})

# Offset later subtracted from the magbr3_magar3 difference.
df['ba_field_shift'] = df['field'].map({
    'Field-1': 0.18080835515053956,
    'Field-2': 0.19971150768603052,
    'Field-3': 0.16673795188561255,
    'Field-4': 0.22764694756560977,
    'Field-5': 0.16393258009286565,
    'Field-6': 0.2218356785159324,
    'Field-8': 0.21095491325215732,
    'Field-9': 0.19161996881699248,
    'Field-10': 0.1837026412942269,
    'Field-11': 0.18115561005808317,
    'Field-15': 0.15200887755485093,
    'Field-16': 0.1656647564454164,
    'Field-17': 0.1667862329753091,
    'Field-18': 0.13141335282697575,
    'Field-19': 0.12365755419411407,
    'Field-20': 0.23729289979478213,
    'Field-24': 0.15642818357320198,
    'Field-25': 0.1840500586940485,
    'Field-27': 0.22263591110330505,
    'Field-28': 0.14583983959304825,
    'Field-29': 0.1633080989641097,
    'Field-32': 0.10157314049954563,
    'Field-33': 0.1320147960009749,
    'Field-34': 0.14638373988925385,
    'Field-35': 0.1275999413428845,
    'Field-38': 0.12160791831422073,
    'Field-39': 0.17665871089194196,
    'Field-40': 0.1635198872846553,
    'Field-41': 0.2009430095532145,
    'Field-42': 0.1866283299753808,
    'Field-45': 0.1722000325556393,
    'Field-46': 0.15581685616524635,
    'Field-47': 0.22921627154498236,
})

In [None]:
# All filtered magnitude differences as predictors, target aligned on the same rows.
X = df_diffs_filtered
y = df[target_variable][X.index]

# Boolean flag per retained row: logg below 3.5 marks a giant.
giant_indicator = df['logg'][X.index].lt(3.5)

## Modelling

### Model selection with original data

In [None]:
# Base magnitude differences, extended through join_rows_from_raw with 'logg' and 'teff'
# from df, then restricted to the giants; the target is aligned on the surviving rows.
base_predictors = df_diffs_filtered[target_predictors_base]
X = join_rows_from_raw(base_predictors, df, ['logg', 'teff'])[giant_indicator]
y = df[target_variable][X.index]

In [None]:
# Repeated 8-fold cross-validation (4 repeats) with a fixed seed.
cv = RepeatedKFold(n_splits=8, n_repeats=4, random_state=314)

# Pipeline under evaluation: standardise -> keep the k best features -> MLP regressor.
normalizer = StandardScaler()
fs = SelectKBest(score_func=mutual_info_regression)
pipeline = Pipeline(steps=[
    ('normalizer', normalizer),
    ('feature_selector', fs),
    ('neural_network', MLPRegressor(max_iter=4000)),
])

# Hyper-parameter grid: k runs over 2..6, three hidden-layer widths, three activations.
grid = {
    'feature_selector__k': list(range(2, 7)),
    'neural_network__hidden_layer_sizes': [(9,), (12,), (16,)],
    'neural_network__activation': ['logistic', 'tanh', 'relu'],
    'neural_network__solver': ['lbfgs'],  # 'sgd' and 'adam' are left out
}

# Exhaustive search scored by negative mean absolute error, using all cores.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)
results = search.fit(X, y)

# Report the winning configuration.
print(f'Best nMAE: {results.best_score_:.6f}')
print(f'Best Config: {results.best_params_}:')

# Mutual-information score of every input column and the columns kept by the best pipeline.
best_selector = search.best_estimator_.named_steps['feature_selector']
print('Feature scores:')
for feature_name, score in zip(X.columns, best_selector.scores_):
    print(f'{feature_name:16s} {score:.6f}')
print(f'Selected features: {list(X.columns[best_selector.get_support()])}')

In [None]:
# Predictions of the fitted search on the very data it was fitted on, against the true values.
y_predicted = search.predict(X)
diagonal = [-1.5, .5]
plt.scatter(y, y_predicted)
plt.plot(diagonal, diagonal, color='r')

# (R^2, mean absolute error) on the same data.
(r2_score(y, y_predicted), mean_absolute_error(y, y_predicted))

### Prepare train & test set

In [None]:
# Work on an explicit copy: the column assignments below would otherwise target a
# selection of df_diffs_filtered and trigger pandas' SettingWithCopyWarning.
X_use = df_diffs_filtered[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']].copy()
# X_use = join_rows_from_raw(X_use, df, ['teff', 'logg'])

# C-B: subtract the per-field shift, add 0.75, take the cube root.
# NOTE(review): `**(1/3)` gives NaN for a negative base; the 0.75 offset presumably
# keeps the base positive — confirm against the data.
X_use['magcr3_magbr3'] = (X_use['magcr3_magbr3']
                          - df['cb_field_shift'][X_use.index]
                          + .75)**(1/3)
# B-A: subtract the per-field shift only.
X_use['magbr3_magar3'] = X_use['magbr3_magar3'] - df['ba_field_shift'][X_use.index]
# Square-root transform of the remaining two differences.
X_use['jmag_kmag'] = (X_use['jmag_kmag'])**(1/2)
X_use['bpmag_rpmag'] = (X_use['bpmag_rpmag'])**(1/2)

# logg and teff are well predictable from X_use, while CaFe and FeH are pure :/
y_use = y

# Keep giants only (boolean flag computed earlier from logg < 3.5).
X_use = X_use[giant_indicator]
y_use = y_use[giant_indicator]

# Fixed-seed 75/25 split so that results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_use, y_use, test_size=0.25, random_state=314)

In [None]:
# Overlay the distributions of the four transformed predictors.
for column in ('magbr3_magar3', 'jmag_kmag', 'bpmag_rpmag', 'magcr3_magbr3'):
    plt.hist(X_use[column], bins=40, label=column)
plt.legend()

### Fit models for teff & logg

In [None]:
# Fit linear models that estimate teff and logg from the four transformed
# magnitude differences of the training rows.
teff = df['teff'][X_train.index]
m_teff = LinearRegression()
m_teff.fit(X_train, teff)
teff_calc = m_teff.predict(X_train)

logg = df['logg'][X_train.index]
m_logg = LinearRegression()
m_logg.fit(X_train, logg)
logg_calc = m_logg.predict(X_train)

# Append the estimates as extra predictors. `assign` builds a new frame, so nothing
# is written into the object returned by train_test_split (item assignment on it
# may raise pandas' SettingWithCopyWarning).
X_train = X_train.assign(teff=teff_calc, logg=logg_calc)

# Standardise all six columns and keep the result as a DataFrame with the original
# index/columns; the fitted scaler is reused for the test set later.
# NOTE: this cell rebinds X_train, so it must run exactly once after the split.
ind = X_train.index
cols = X_train.columns
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_train = pd.DataFrame(
    data=X_train,
    index=ind,
    columns=cols
)

In [None]:
# Train/test MAE and R^2 of the two auxiliary linear models (logg first, then teff).
# X_test must still hold only the four magnitude-difference columns when this runs.
for quantity, regressor, train_true, train_fit in (
    ('logg', m_logg, logg, logg_calc),
    ('teff', m_teff, teff, teff_calc),
):
    test_true = df[quantity][X_test.index]
    test_fit = regressor.predict(X_test)
    print(f"{quantity}: mean_absolute_error: train: {mean_absolute_error(train_true, train_fit)}; "
          f"test: {mean_absolute_error(test_true, test_fit)}")
    print(f"{quantity}: r2_score: train: {r2_score(train_true, train_fit)}; "
          f"test: {r2_score(test_true, test_fit)}")

### Inspect models for teff & logg

In [None]:
def _plot_true_vs_predicted(ax, true_values, predicted_values, title, limits):
    """Scatter predicted against true values on `ax` and overlay the identity line.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    true_values, predicted_values : equally long sequences of reference and model values.
    title : panel title.
    limits : two-element list [low, high] spanned by the red identity line.
    """
    ax.scatter(true_values, predicted_values)
    ax.set_title(title)
    ax.set_xlabel('true')
    ax.set_ylabel('predicted')
    ax.plot(limits, limits, color='r')


# Row selection and opacity for optionally highlighting a subset of points;
# no active plotting code uses them at the moment.
mask = (df.magbr3 < df.magar3)*(df.magbr3 > df.magcr3)
alpha = 1

fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))

# Top row: logg. The identity line spans the range of the true values over all of
# X_use and of the train-set predictions.
y_min = min(min(df['logg'][X_use.index]), min(logg_calc))
y_max = max(max(df['logg'][X_use.index]), max(logg_calc))
_plot_true_vs_predicted(axes[0, 0], df['logg'][X_train.index], logg_calc,
                        'logg from magnitude diffs on the train set', [y_min, y_max])
_plot_true_vs_predicted(axes[0, 1], df['logg'][X_test.index], m_logg.predict(X_test),
                        'logg from magnitude diffs on the test set', [y_min, y_max])

# Bottom row: teff, same layout.
y_min = min(min(df['teff'][X_use.index]), min(teff_calc))
y_max = max(max(df['teff'][X_use.index]), max(teff_calc))
_plot_true_vs_predicted(axes[1, 0], df['teff'][X_train.index], teff_calc,
                        'teff from magnitude diffs on the train set', [y_min, y_max])
_plot_true_vs_predicted(axes[1, 1], df['teff'][X_test.index], m_teff.predict(X_test),
                        'teff from magnitude diffs on the test set', [y_min, y_max])


### Fit & test model for FeH

In [None]:
# Extend test set predictors with predicted teff, logg 
teff_test = m_teff.predict(X_test)
logg_test = m_logg.predict(X_test)
X_test['teff'] = teff_test
X_test['logg'] = logg_test
ind = X_test.index
cols = X_test.columns
X_test = normalizer.transform(X_test)
X_test = pd.DataFrame(
    data=X_test,
    index=ind,
    columns=cols
)

In [None]:
# One hidden layer of 9 logistic units, trained with L-BFGS under a fixed seed.
# (Optional variant: restrict X_train / X_test to the four magnitude differences plus 'teff'.)
nn_model = MLPRegressor(
    hidden_layer_sizes=(9,),
    activation='logistic',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
).fit(X_train, y_train)
nn_model

In [None]:
y_train_predicted = nn_model.predict(X_train)
y_test_predicted = nn_model.predict(X_test)

# Shared axis range covering every true and predicted value.
plotted_values = (y_train, y_test, y_train_predicted, y_test_predicted)
y_min = min(map(min, plotted_values))
y_max = max(map(max, plotted_values))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# Train in blue, test in red, dotted green identity line.
subsets = (
    ('train', 'b', y_train, y_train_predicted),
    ('test', 'r', y_test, y_test_predicted),
)
for subset, colour, truth, prediction in subsets:
    axes.scatter(truth, prediction, color=colour, alpha=.6, label=subset)
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
axes.set_title('True vs. predicted FeH on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

# Numeric summary per subset.
for subset, _, truth, prediction in subsets:
    print(f'Mean absolute error on {subset} set: {mean_absolute_error(y_true=truth, y_pred=prediction)}')
    print(f'r2 score on {subset} set: {r2_score(y_true=truth, y_pred=prediction)}')

### Statistics

In [None]:
# Error distribution
fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))

axes[0].hist(y_train_predicted-y_train, color='b', bins=25)
axes[0].set_title('Error distribution on train set')
axes[1].hist(y_test_predicted-y_test, color='r', bins=25)
axes[1].set_title('Error distribution on test set')
print('Train 16-84 percentile: ', np.percentile(y_train_predicted-y_train, [16, 84]))
print('Test 16-84 percentile: ', np.percentile(y_test_predicted-y_test, [16, 84]))

In [None]:
# Edges of 10 equal-width bins between -1 and 0.5.
steps = np.linspace(start=-1, stop=.5, num=11)

y_all = pd.concat([y_train, y_test])
y_all_predicted = np.concatenate([y_train_predicted, y_test_predicted])

# Per bin (rows selected on their *predicted* value): bin centre plus the 16th and
# 84th percentile of the error predicted - true.
xx, yy16, yy84 = [], [], []
for lower, upper in zip(steps[:-1], steps[1:]):
    in_bin = (y_all_predicted > lower) & (y_all_predicted <= upper)
    bin_errors = y_all_predicted[in_bin] - y_all[in_bin]
    xx.append((lower + upper) / 2)
    yy16.append(np.percentile(bin_errors, 16))
    yy84.append(np.percentile(bin_errors, 84))

# Shared axis range covering every true and predicted value.
plotted_values = (y_train, y_test, y_train_predicted, y_test_predicted)
y_min = min(map(min, plotted_values))
y_max = max(map(max, plotted_values))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

axes.scatter(y_train, y_train_predicted, color='b', alpha=.6, label='train')
axes.scatter(y_test, y_test_predicted, color='r', alpha=.6, label='test')
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])

# Percentile lines: bin centre on the vertical (predicted) axis, centre + percentile
# on the horizontal axis.
lower_band = [centre + offset for centre, offset in zip(xx, yy16)]
upper_band = [centre + offset for centre, offset in zip(xx, yy84)]
axes.plot(lower_band, xx, color='magenta', label='16th percentile', linewidth=3)
axes.plot(upper_band, xx, color='orange', label='84th percentile', linewidth=3)

axes.set_title('True vs. predicted FeH on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

for subset, truth, prediction in (
    ('train', y_train, y_train_predicted),
    ('test', y_test, y_test_predicted),
):
    print(f'Mean absolute error on {subset} set: {mean_absolute_error(y_true=truth, y_pred=prediction)}')
    print(f'r2 score on {subset} set: {r2_score(y_true=truth, y_pred=prediction)}')

print('Dataset splitted to 10 chunks, 16-th and 84-th percentile of the error calculated in each chunk and the percentile lines plotted to the graph.')

### Removing the estimated teff and logg; trying the original values instead

No teff/logg at all - larger mean absolute errors.

In [None]:
# Untransformed base predictors and target for every filtered row.
X = df_diffs_filtered[target_predictors_base]
y = df[target_variable][X.index]

# Re-split the transformed sample with the same seed, so X_train / X_test are
# rebuilt from X_use.
# NOTE(review): the split uses X_use / y_use, whereas X / y defined above are not
# restricted by giant_indicator and carry no field-shift correction — confirm
# that this difference is intended wherever X / y are used afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    X_use,
    y_use,
    test_size=0.25,
    random_state=314,
)

In [None]:
# Repeated 8-fold cross-validation (4 repeats) with a fixed seed.
cv = RepeatedKFold(n_splits=8, n_repeats=4, random_state=314)

# Pipeline under evaluation: standardise -> keep the k best features -> MLP regressor.
normalizer = StandardScaler()
fs = SelectKBest(score_func=mutual_info_regression)
pipeline = Pipeline(steps=[
    ('normalizer', normalizer),
    ('feature_selector', fs),
    ('neural_network', MLPRegressor(max_iter=4000)),
])

# Hyper-parameter grid: k runs over 2..4, four hidden-layer widths, logistic + lbfgs only.
grid = {
    'feature_selector__k': list(range(2, 5)),
    'neural_network__hidden_layer_sizes': [(5,), (7,), (9,), (12,)],
    'neural_network__activation': ['logistic'],
    'neural_network__solver': ['lbfgs'],
}

# Exhaustive search scored by negative mean absolute error, using all cores.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)
results = search.fit(X, y)

# Report the winning configuration.
print(f'Best nMAE: {results.best_score_:.6f}')
print(f'Best Config: {results.best_params_}:')

# Mutual-information score of every input column and the columns kept by the best pipeline.
best_selector = search.best_estimator_.named_steps['feature_selector']
print('Feature scores:')
for feature_name, score in zip(X.columns, best_selector.scores_):
    print(f'{feature_name:16s} {score:.6f}')
print(f'Selected features: {list(X.columns[best_selector.get_support()])}')

In [None]:
# Keep only the four magnitude differences in both subsets.
X_train = X_train[target_predictors_base]
X_test = X_test[target_predictors_base]

# One hidden layer of 7 logistic units, trained with L-BFGS under a fixed seed.
nn_model = MLPRegressor(
    hidden_layer_sizes=(7,),
    activation='logistic',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
).fit(X_train, y_train)
nn_model

In [None]:
y_train_predicted = nn_model.predict(X_train)
y_test_predicted = nn_model.predict(X_test)

# Shared axis range covering every true and predicted value.
plotted_values = (y_train, y_test, y_train_predicted, y_test_predicted)
y_min = min(map(min, plotted_values))
y_max = max(map(max, plotted_values))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# Train in blue, test in red, dotted green identity line.
subsets = (
    ('train', 'b', y_train, y_train_predicted),
    ('test', 'r', y_test, y_test_predicted),
)
for subset, colour, truth, prediction in subsets:
    axes.scatter(truth, prediction, color=colour, alpha=.6, label=subset)
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
axes.set_title('True vs. predicted FeH on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

# Numeric summary per subset.
for subset, _, truth, prediction in subsets:
    print(f'Mean absolute error on {subset} set: {mean_absolute_error(y_true=truth, y_pred=prediction)}')
    print(f'r2 score on {subset} set: {r2_score(y_true=truth, y_pred=prediction)}')

With 'original' teff - better results

In [None]:
# Four magnitude differences plus 'teff' pulled from df via join_rows_from_raw, for both subsets.
X_train = join_rows_from_raw(X_train[target_predictors_base], df, ['teff'])
X_test = join_rows_from_raw(X_test[target_predictors_base], df, ['teff'])

# Scale with statistics learned from the training subset only.
# Note: this rebinds `normalizer` (and `nn_model` below) for every cell that runs afterwards.
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

# One hidden layer of 7 logistic units, trained with L-BFGS under a fixed seed.
nn_model = MLPRegressor(
    hidden_layer_sizes=(7,),
    activation='logistic',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
).fit(X_train, y_train)
nn_model

In [None]:
y_train_predicted = nn_model.predict(X_train)
y_test_predicted = nn_model.predict(X_test)

# Shared axis range covering every true and predicted value.
plotted_values = (y_train, y_test, y_train_predicted, y_test_predicted)
y_min = min(map(min, plotted_values))
y_max = max(map(max, plotted_values))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

# Train in blue, test in red, dotted green identity line.
subsets = (
    ('train', 'b', y_train, y_train_predicted),
    ('test', 'r', y_test, y_test_predicted),
)
for subset, colour, truth, prediction in subsets:
    axes.scatter(truth, prediction, color=colour, alpha=.6, label=subset)
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
axes.set_title('True vs. predicted FeH on train and test set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

# Numeric summary per subset.
for subset, _, truth, prediction in subsets:
    print(f'Mean absolute error on {subset} set: {mean_absolute_error(y_true=truth, y_pred=prediction)}')
    print(f'r2 score on {subset} set: {r2_score(y_true=truth, y_pred=prediction)}')

## Appendix: additional apogee data from Mexico

### Load and clean

In [None]:
# load the data
df_mex = load_data(filename='mexico_labeled.csv')
df_mex.shape

In [None]:
# Basic cleaning via the project helper (rebinds df_mex to the cleaned table).
df_mex = clean_data(df_mex)

# Magnitude differences are built from this ordered column list; Bp-Rp is added explicitly.
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'bpmag', 'gmag', 'rpmag', 'jmag', 'kmag']
df_diffs_mex = get_magnitude_diffs(df_mex, ordered_mag_columns)
df_diffs_mex['bpmag_rpmag'] = df_mex['bpmag'].sub(df_mex['rpmag'])

# Apply the same 0.05 threshold to each listed uncertainty column, then drop rows with NaNs.
df_diffs_filtered_mex = filter_rows_by_std(
    df_diffs_mex,
    df_mex,
    std_thresholds=dict.fromkeys(['sigcr3', 'sigbr3', 'sigar3', 'ejmag', 'ekmag'], 0.05),
).dropna()

# Additional cleaning: keep rows with -50 < C-B < 50 and B-A < 50.
keep_rows = (
    df_diffs_filtered_mex['magcr3_magbr3'].gt(-50)
    & df_diffs_filtered_mex['magcr3_magbr3'].lt(50)
    & df_diffs_filtered_mex['magbr3_magar3'].lt(50)
)
df_diffs_filtered_mex = df_diffs_filtered_mex[keep_rows]

# Remaining table dimensions and the number of distinct field labels in df_mex.
(df_diffs_filtered_mex.shape, len(set(df_mex.field)))

In [None]:
# Giants
giant_indicator_mex = df_mex['logg'] < 3.5
print(sum(giant_indicator_mex))
giant_indicator_mex = giant_indicator_mex[df_diffs_filtered_mex.index]
print(sum(giant_indicator_mex))

### Additional data inspection

In [None]:
# 2x2 grid of histograms, one per magnitude difference (25 bins each).
fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))

cb_diff = df_diffs_filtered_mex['magcr3_magbr3']
ba_diff = df_diffs_filtered_mex['magbr3_magar3']

# The first two panels repeat the +-50 cuts on the plotted values.
axes[0, 0].hist(cb_diff[(cb_diff > -50) & (cb_diff < 50)], bins=25)
axes[0, 1].hist(ba_diff[ba_diff < 50], bins=25)
axes[1, 0].hist(df_diffs_filtered_mex['jmag_kmag'], bins=25)
axes[1, 1].hist(df_diffs_filtered_mex['bpmag_rpmag'], bins=25)

In [None]:
# Number of filtered rows with -50 < C-B < 50.
int((df_diffs_filtered_mex['magcr3_magbr3'].gt(-50) & df_diffs_filtered_mex['magcr3_magbr3'].lt(50)).sum())

### Re-calculate shifts

In [None]:
# Newly evaluated C-B offsets per field label, covering both 'Field-*' and 'Zwitter-*' labels.
cb_field_shifts = {
    'Field-1': 0.3085008050927565,
    'Zwitter-001': 0.6378044929094755,
    'Field-2': 0.396353091141992,
    'Zwitter-002': 0.4215366270069197,
    'Field-3': 0.4152001059345346,
    'Zwitter-003': 0.5812455601267135,
    'Field-4': 0.39102849917065374,
    'Zwitter-005': 0.6859590774257387,
    'Field-5': 0.3381333019903701,
    'Zwitter-006': 0.5816987303403534,
    'Field-6': 0.4018856301296543,
    'Zwitter-007': 0.5167182753341542,
    'Field-8': 0.4009829871482615,
    'Zwitter-009': 0.641563228662612,
    'Field-9': 0.3665552321735659,
    'Field-10': 0.3618080992571904,
    'Zwitter-010': 0.45525077676062653,
    'Zwitter-011': 0.34446556171800496,
    'Field-11': 0.36901642992659495,
    'Zwitter-012': 0.34486052340161555,
    'Zwitter-015': 0.21789292038826585,
    'Field-15': 0.35060912849958054,
    'Zwitter-016': 0.4030459240865568,
    'Field-16': 0.3576766782298215,
    'Field-17': 0.3571674257141912,
    'Field-18': 0.343843886478273,
    'Field-19': 0.3749770823363773,
    'Field-20': 0.37708072235370926,
    'Field-24': 0.4013473838938863,
    'Zwitter-025': 0.2092596585716291,
    'Field-25': 0.40710509402356093,
    'Field-27': 0.38718069146760475,
    'Field-28': 0.3534016314562236,
    'Field-29': 0.39954940660111826,
    'Field-32': 0.47857202060102744,
    'Zwitter-032': 0.4166755173518369,
    'Field-33': 0.4056788129018717,
    'Field-34': 0.39452144481929746,
    'Field-35': 0.43844880825302496,
    'Zwitter-036': 0.6620846227384977,
    'Field-38': 0.48422762415745146,
    'Field-39': 0.3369308910397964,
    'Zwitter-039': 0.2680534043876005,
    'Field-40': 0.3605332450260206,
    'Field-41': 0.33635976377216725,
    'Field-42': 0.3461977364695828,
    'Zwitter-043': 0.5266649059608399,
    'Field-45': 0.3318460360531659,
    'Field-46': 0.3426564246286824,
    'Field-47': 0.3884440829920037,
    'Zwitter-049': 0.3589795658960031,
    'Zwitter-070': 0.4132670259681095,
    'Zwitter-074': 0.27306509583711114,
    'Zwitter-075': 0.27229892579794224,
    'Zwitter-078': 0.24886548410501647,
    'Zwitter-081': 0.37012522355791355,
    'Zwitter-104': 0.24425816627665173,
    'Zwitter-108': 0.3377743159741208,
}

# Offsets previously attached to df, collected as one field -> shift lookup.
previous_cb_shifts = dict(zip(df['field'].to_numpy(), df['cb_field_shift'].to_numpy()))

fields = previous_cb_shifts.keys()
previous_values = [previous_cb_shifts[name] for name in fields]

# Blue points: previous against new offsets, before any adjustment.
plt.scatter(previous_values, [cb_field_shifts[name] for name in fields])
plt.title('Previous vs. newly evaluated C-B offsets')

# Mean difference (previous - new) over the fields known from df; adding it to every
# new offset moves the new set onto the level of the previous one.
shifts_shifts = [previous_cb_shifts[name] - cb_field_shifts[name] for name in fields]
move = sum(shifts_shifts) / len(shifts_shifts)
cb_field_shifts = {name: value + move for name, value in cb_field_shifts.items()}

# Red points: previous against adjusted offsets, with a green identity line for reference.
plt.scatter(previous_values, [cb_field_shifts[name] for name in fields], color='r')
plt.plot([.3, .5], [.3, .5], color='g')

# Attach the adjusted offsets to the additional table.
df_mex['cb_field_shift'] = df_mex['field'].map(cb_field_shifts)

In [None]:
# Newly evaluated B-A offsets per field label, covering both 'Field-*' and 'Zwitter-*' labels.
ba_field_shifts = {
    'Field-1': 0.17056282745873208,
    'Zwitter-001': 0.2519492828443053,
    'Field-2': 0.1900727308563549,
    'Zwitter-002': 0.12946751731162473,
    'Field-3': 0.15676561857692517,
    'Zwitter-003': 0.21470618204140082,
    'Field-4': 0.21750784155802386,
    'Zwitter-005': 0.2610466354756117,
    'Field-5': 0.15399322053015518,
    'Zwitter-006': 0.2533334316382239,
    'Field-6': 0.21176879523660022,
    'Zwitter-007': 0.23628915342725104,
    'Field-8': 0.20139997325647813,
    'Zwitter-009': 0.2620847989640088,
    'Field-9': 0.18197254591648596,
    'Field-10': 0.17349931724220602,
    'Zwitter-010': 0.2131233051958813,
    'Zwitter-011': 0.08267338895080018,
    'Field-11': 0.1709213692783184,
    'Zwitter-012': 0.11896434975367143,
    'Zwitter-015': 0.007262233491898352,
    'Field-15': 0.1417001417914365,
    'Zwitter-016': 0.13863982706869876,
    'Field-16': 0.15561561874994098,
    'Field-17': 0.1567116625352581,
    'Field-18': 0.1220567170105989,
    'Field-19': 0.1137210096360391,
    'Field-20': 0.2270200786744885,
    'Field-24': 0.1467555244352789,
    'Zwitter-025': 0.007255019319873782,
    'Field-25': 0.17422667634636751,
    'Field-27': 0.21237181561536725,
    'Field-28': 0.13593421935329522,
    'Field-29': 0.15294902159566798,
    'Field-32': 0.09153140645579726,
    'Zwitter-032': 0.19890711997208396,
    'Field-33': 0.12238108640800323,
    'Field-34': 0.13612453979442343,
    'Field-35': 0.11757561558882221,
    'Zwitter-036': 0.32083030921177264,
    'Field-38': 0.11246401690293864,
    'Field-39': 0.166571980720964,
    'Zwitter-039': 0.0583772140261829,
    'Field-40': 0.1531174942512961,
    'Field-41': 0.19097475960673183,
    'Field-42': 0.1763849240689791,
    'Zwitter-043': 0.22625650096649264,
    'Field-45': 0.1621252746229151,
    'Field-46': 0.1456851385293077,
    'Field-47': 0.21890106877144494,
    'Zwitter-049': 0.1887868019297796,
    'Zwitter-070': 0.20278115163597124,
    'Zwitter-074': 0.06778247337143434,
    'Zwitter-075': 0.106845488301897,
    'Zwitter-078': 0.005240474083212578,
    'Zwitter-081': 0.21894648708151357,
    'Zwitter-104': 0.07371169413303105,
    'Zwitter-108': 0.23038539632655616,
}

# Offsets previously attached to df, collected as one field -> shift lookup.
previous_ba_shifts = dict(zip(df['field'].to_numpy(), df['ba_field_shift'].to_numpy()))

fields = previous_ba_shifts.keys()
previous_values = [previous_ba_shifts[name] for name in fields]

# Blue points: previous against new offsets, before any adjustment.
plt.scatter(previous_values, [ba_field_shifts[name] for name in fields])
plt.title('Previous vs. newly evaluated B-A offsets')

# Mean difference (previous - new) over the fields known from df; adding it to every
# new offset moves the new set onto the level of the previous one.
shifts_shifts = [previous_ba_shifts[name] - ba_field_shifts[name] for name in previous_ba_shifts.keys()]
move = sum(shifts_shifts) / len(shifts_shifts)
ba_field_shifts = {name: value + move for name, value in ba_field_shifts.items()}

# Red points: previous against adjusted offsets, with a green identity line for reference.
plt.scatter(previous_values, [ba_field_shifts[name] for name in fields], color='r')
plt.plot([.1, .25], [.1, .25], color='g')

# Attach the adjusted offsets to the additional table.
df_mex['ba_field_shift'] = df_mex['field'].map(ba_field_shifts)

### Try trained model

In [None]:
# Work on an explicit copy: the column assignments below would otherwise target a
# selection of df_diffs_filtered_mex and trigger pandas' SettingWithCopyWarning.
X_use = df_diffs_filtered_mex[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']].copy()
# X_use = join_rows_from_raw(X_use, df, ['teff', 'logg'])

# Same transformation as for the original sample, with the offsets stored on df_mex.
# C-B: subtract the per-field shift, add 0.75, take the cube root.
X_use['magcr3_magbr3'] = (X_use['magcr3_magbr3']
                          - df_mex['cb_field_shift'][X_use.index]
                          + .75)**(1/3)
# B-A: subtract the per-field shift only.
X_use['magbr3_magar3'] = X_use['magbr3_magar3'] - df_mex['ba_field_shift'][X_use.index]
# Square-root transform of the remaining two differences.
X_use['jmag_kmag'] = (X_use['jmag_kmag'])**(1/2)
X_use['bpmag_rpmag'] = (X_use['bpmag_rpmag'])**(1/2)

# logg and teff are well predictable from X_use, while CaFe and FeH are pure :/
y_use = df_mex[target_variable][X_use.index]

# Keep giants only.
X_use = X_use[giant_indicator_mex]
y_use = y_use[giant_indicator_mex]

In [None]:
# Dimensions of the transformed giant sample from the additional data.
X_use.shape

In [None]:
# Apply the previously fitted linear teff/logg models to the additional giants and
# compare with the values stored in df_mex; the red line is the identity.
fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))

for ax, column, regressor, span in (
    (axes[0], 'logg', m_logg, [0, 3.5]),
    (axes[1], 'teff', m_teff, [3500, 5500]),
):
    ax.scatter(df_mex[column][X_use.index], regressor.predict(X_use), color='g')
    ax.plot(span, span, color='r')

In [None]:
# Extend test set predictors with predicted teff, logg 
teff_test = m_teff.predict(X_use)
logg_test = m_logg.predict(X_use)
X_use['teff'] = teff_test
X_use['logg'] = logg_test
ind = X_use.index
cols = X_use.columns
X_use = normalizer.transform(X_use)
X_use = pd.DataFrame(
    data=X_use,
    index=ind,
    columns=cols
)

In [None]:
# Predict FeH for the additional giants with the model currently bound to `nn_model`.
y_use_predicted = nn_model.predict(X_use)

# Shared axis range covering true and predicted values.
plotted_values = (y_use_predicted, y_use)
y_min = min(map(min, plotted_values))
y_max = max(map(max, plotted_values))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

axes.scatter(y_use, y_use_predicted, color='g', alpha=.6, label='additional')
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
axes.set_title('True vs. predicted FeH on additional data set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

additional_mae = mean_absolute_error(y_true=y_use, y_pred=y_use_predicted)
additional_r2 = r2_score(y_true=y_use, y_pred=y_use_predicted)
print(f'Mean absolute error on additional data set: {additional_mae}')
print(f'r2 score on additional data set: {additional_r2}')

### Re-model

In [None]:
# Switch for the re-modelling below:
#   True  - use the teff and logg measurements while modelling Fe/H;
#   False - predict teff and logg from the magnitude differences with linear regression.
use_apogee = False

In [None]:
# Explicit copy of the four magnitude differences, so the column assignments below
# do not target a selection of df_diffs_filtered_mex (SettingWithCopyWarning).
X_mex = df_diffs_filtered_mex[['magcr3_magbr3', 'magbr3_magar3', 'jmag_kmag', 'bpmag_rpmag']].copy()

# Transform as Crni vrh
# C-B: subtract the per-field shift, add 0.75, take the cube root.
X_mex['magcr3_magbr3'] = (X_mex['magcr3_magbr3']
                          - df_mex['cb_field_shift'][X_mex.index]
                          + .75)**(1/3)
# B-A: subtract the per-field shift. The shift must come from df_mex: X_mex.index holds
# df_mex row labels, so looking them up in df would fetch unrelated rows (or fail).
X_mex['magbr3_magar3'] = X_mex['magbr3_magar3'] - df_mex['ba_field_shift'][X_mex.index]
# Square-root transform of the remaining two differences.
X_mex['jmag_kmag'] = (X_mex['jmag_kmag'])**(1/2)
X_mex['bpmag_rpmag'] = (X_mex['bpmag_rpmag'])**(1/2)

# Keep giants only and align the target on the surviving rows.
X_mex = X_mex[giant_indicator_mex]

y_mex = df_mex['feh'][X_mex.index]
# Fixed-seed 75/25 split so that results are reproducible.
X_mex_train, X_mex_test, y_mex_train, y_mex_test = train_test_split(X_mex, y_mex, test_size=0.25, random_state=314)

In [None]:
# Dimensions of the transformed giant sample used for re-modelling.
X_mex.shape

In [None]:
if use_apogee:
    # Use apogee's T and log(g)
    # NOTE(review): this branch rebinds X_mex only; X_mex_train / X_mex_test from the
    # earlier split stay untouched, and the frame is built from df_diffs_filtered_mex
    # without the transformation or giant selection — confirm before enabling it.
    X_mex = join_rows_from_raw(df_diffs_filtered_mex, df_mex, ['teff', 'logg'])
    # Normalize
    ind = X_mex.index
    cols = X_mex.columns
    normalizer = StandardScaler()
    X_mex = normalizer.fit_transform(X_mex)
    X_mex = pd.DataFrame(
        data=X_mex,
        index=ind,
        columns=cols
    )
else:
    # Model T and log(g) from the magnitude differences of the training rows ...
    teff = df_mex['teff'][X_mex_train.index]
    m_teff = LinearRegression()
    m_teff.fit(X_mex_train, teff)
    teff_calc = m_teff.predict(X_mex_train)
    # logg must come from df_mex as well: X_mex_train.index holds df_mex row labels.
    logg = df_mex['logg'][X_mex_train.index]
    m_logg = LinearRegression()
    m_logg.fit(X_mex_train, logg)
    logg_calc = m_logg.predict(X_mex_train)
    X_mex_train['teff'] = teff_calc
    X_mex_train['logg'] = logg_calc

    # Standardise the extended training predictors, keeping index and column labels.
    ind = X_mex_train.index
    cols = X_mex_train.columns
    normalizer = StandardScaler()
    X_mex_train = normalizer.fit_transform(X_mex_train)
    X_mex_train = pd.DataFrame(
        data=X_mex_train,
        index=ind,
        columns=cols
    )
    # ... and extend test set predictors with predicted teff, logg,
    # scaled with the scaler fitted on the training rows.
    teff_test = m_teff.predict(X_mex_test)
    logg_test = m_logg.predict(X_mex_test)
    X_mex_test['teff'] = teff_test
    X_mex_test['logg'] = logg_test
    ind = X_mex_test.index
    cols = X_mex_test.columns
    X_mex_test = normalizer.transform(X_mex_test)
    X_mex_test = pd.DataFrame(
        data=X_mex_test,
        index=ind,
        columns=cols
    )


In [None]:
# One hidden layer of 9 logistic units, trained with L-BFGS under a fixed seed,
# fitted on the training part of the additional data.
nn_model = MLPRegressor(
    hidden_layer_sizes=(9,),
    activation='logistic',
    solver='lbfgs',
    max_iter=4000,
    random_state=316,
).fit(X_mex_train, y_mex_train)
nn_model

In [None]:
# Predictions of the fitted network on both partitions.
y_train_predicted = nn_model.predict(X_mex_train)
y_test_predicted = nn_model.predict(X_mex_test)

# Common axis range covering every true and predicted value, so that the
# 1:1 reference line spans the whole square plot.
_all_values = (y_mex_train, y_mex_test, y_train_predicted, y_test_predicted)
y_min = min(min(_values) for _values in _all_values)
# y_min = -1
y_max = max(max(_values) for _values in _all_values)

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

_partitions = (
    ('train', 'b', y_mex_train, y_train_predicted),
    ('test', 'r', y_mex_test, y_test_predicted),
)
for _part, _colour, _true, _pred in _partitions:
    axes.scatter(_true, _pred, color=_colour, alpha=.6, label=_part)
# Dotted diagonal marks perfect predictions.
axes.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')
axes.set(
    xlim=[y_min, y_max],
    ylim=[y_min, y_max],
    title='True vs. predicted FeH on train and test set',
    xlabel='true',
    ylabel='predicted',
)
axes.legend()

# Report MAE and R^2 for the train partition first, then for the test partition.
for _part, _colour, _true, _pred in _partitions:
    print(f'Mean absolute error on {_part} set: {mean_absolute_error(y_true=_true, y_pred=_pred)}')
    print(f'r2 score on {_part} set: {r2_score(y_true=_true, y_pred=_pred)}')

#### Try remodeled Fe/H on allMex with Gaia and 2MASS

In [None]:
# Read the Mexico catalogue matched with Gaia and 2MASS (per the file name).
df_allmex = load_data(filename='masterallmatchwithGaiaand2MASS.csv')
# Trim spaces, '|' and '#' from both ends of every column name.
df_allmex.columns = [name.strip(' |#') for name in df_allmex.columns]

In [None]:
f = None  # field to analyse; None keeps every row whose field is known

# Take all the data from the selected field (or from every field when f is None)
# where the J, K, Bp and Rp magnitudes are given
field_selected = df_allmex['field'].notna() if f is None else df_allmex['field'] == f
field_all_data_filter = (
    field_selected
    & df_allmex['j_m'].notna()
    & df_allmex['ks_m'].notna()
    & df_allmex['phot_bp_mean_mag'].notna()
    & df_allmex['phot_rp_mean_mag'].notna()
)
print(f'Field {f} has {field_all_data_filter.sum()} points with all the data.')

# Copy the selection so that the new columns are added to an independent frame
# instead of to a slice of df_allmex (avoids SettingWithCopyWarning).
df_filtered = df_allmex[field_all_data_filter].copy()
df_filtered['cb_field_shift'] = df_filtered['field'].map(cb_field_shifts)
df_filtered['ba_field_shift'] = df_filtered['field'].map(ba_field_shifts)
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'phot_bp_mean_mag', 'phot_rp_mean_mag', 'j_m', 'ks_m']
# Rename by label, not by position: the C-B and B-A columns keep their own
# names and only the Gaia / 2MASS colours get the short names used for the model.
# NOTE(review): the previous positional rename labelled C-B as 'magbr3_magar3'
# and B-A as 'magcr3_magbr3' — confirm the training predictors were not built
# with the same swap.
df_diffs = get_magnitude_diffs(df_filtered, ordered_mag_columns)[
    ['magcr3_magbr3', 'magbr3_magar3', 'phot_bp_mean_mag_phot_rp_mean_mag', 'j_m_ks_m']
].rename(columns={
    'phot_bp_mean_mag_phot_rp_mean_mag': 'bpmag_rpmag',
    'j_m_ks_m': 'jmag_kmag',
})
# Drop also the rows with out-of-range B-A, C-B magnitude diffs
diffs_in_range = (
    (df_diffs['magcr3_magbr3'] > -4)
    & (df_diffs['magcr3_magbr3'] < 4)
    & (df_diffs['magbr3_magar3'] < 4)
)
df_filtered = df_filtered[diffs_in_range]
df_diffs = df_diffs[diffs_in_range]
# Independent copy in the column order expected by the models; later cells
# overwrite these columns.
X_mex = df_diffs[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']].copy()
print(f'After C-B, B-A out-of-range filtering, there remains {X_mex.shape[0]} observations from field {f}.')

In [None]:
# Inspect the shape and the column labels of X_mex
X_mex.shape, X_mex.columns

In [None]:
# Disabled alternatives: take teff / logg from the catalogue's gspspec columns
# instead of predicting them.
# df_allmex['teff'] = (df_allmex['teff_gspspec_lower'] + df_allmex['teff_gspspec_upper'])/2
# df_allmex['logg'] = (df_allmex['logg_gspspec_lower'] + df_allmex['logg_gspspec_upper'])/2
# X_mex = join_rows_from_raw(X_mex, df_allmex, ['teff', 'logg'])

# X_mex = join_rows_from_raw(X_mex, df_filtered, ['teff_gspspec', 'logg_gspspec'])
# X_mex.columns = ['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag', 'teff', 'logg']

# Transform as Crni vrh: subtract the per-field shifts, then take the cube /
# square roots. All four columns are built in one assign() on a new frame, so
# nothing is written into a slice and no in-place dropna is needed.
# Rows with a missing field shift or a negative root argument become NaN and
# are removed by dropna().
X_mex = X_mex.assign(
    magcr3_magbr3=(X_mex['magcr3_magbr3']
                   - df_filtered.loc[X_mex.index, 'cb_field_shift']
                   + .75)**(1/3),
    magbr3_magar3=X_mex['magbr3_magar3'] - df_filtered.loc[X_mex.index, 'ba_field_shift'],
    jmag_kmag=X_mex['jmag_kmag']**(1/2),
    bpmag_rpmag=X_mex['bpmag_rpmag']**(1/2),
).dropna()
print(X_mex.shape)

In [None]:
# Predict T_eff and log(g) from the colour predictors. Both predictions are
# made before either column is appended, because the linear models expect
# only the original predictor columns.
teff_mex, logg_mex = (model.predict(X_mex) for model in (m_teff, m_logg))
X_mex = X_mex.assign(teff=teff_mex, logg=logg_mex)

# Scale with the already-fitted normalizer and restore the labels, since
# transform() returns a bare array.
ind, cols = X_mex.index, X_mex.columns
X_mex = pd.DataFrame(normalizer.transform(X_mex), index=ind, columns=cols)

# Reference metallicity (column 'mh_gspspec') for the same rows.
y_mex = df_allmex.loc[ind, 'mh_gspspec']

In [None]:
# Shapes of predictors and target, plus the number of rows whose target is missing
X_mex.shape, y_mex.shape, sum(y_mex.isnull())

In [None]:
# Keep only the rows that have a reference metallicity; the mask is computed
# once and applied to predictors and target alike.
has_target = y_mex.notna()
X_mex, y_mex = X_mex[has_target], y_mex[has_target]

In [None]:
# Re-check that X_mex and y_mex have matching numbers of rows
X_mex.shape, y_mex.shape

In [None]:
# Apply the trained network to the additional Mexico observations.
y_predicted = nn_model.predict(X_mex)
# Common axis range so that the 1:1 reference line spans the whole plot.
y_min = min(min(y_predicted), min(y_mex))
# y_min = -1
y_max = max(max(y_predicted), max(y_mex))

fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(10, 10))

axes.scatter(y_mex, y_predicted, color='g', label='mex')
# Dotted diagonal marks perfect predictions.
axes.plot([y_min, y_max], [y_min, y_max], color='k', linestyle=':')
axes.set_xlim([y_min, y_max])
axes.set_ylim([y_min, y_max])
axes.set_title('True vs. predicted FeH on additional Mexico data set')
axes.legend()
axes.set_xlabel('true')
axes.set_ylabel('predicted')

# These metrics are computed on the additional Mexico data, not on the
# training set, so the labels say so.
print(f'Mean absolute error on additional Mexico data set: {mean_absolute_error(y_true=y_mex, y_pred=y_predicted)}')
print(f'r2 score on additional Mexico data set: {r2_score(y_true=y_mex, y_pred=y_predicted)}')

## Appendix: predictors' correlations

In [None]:
# Pairwise scatter plots of the four colour predictors, one pair per panel.
# The panels are driven by a single list, so every title is derived from the
# very columns that are plotted and cannot drift out of sync with them.
fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(20, 30))
predictor_pairs = [
    ('magcr3_magbr3', 'magbr3_magar3'),
    ('magcr3_magbr3', 'jmag_kmag'),
    ('magcr3_magbr3', 'bpmag_rpmag'),
    ('magbr3_magar3', 'jmag_kmag'),
    ('bpmag_rpmag', 'jmag_kmag'),
    ('magbr3_magar3', 'bpmag_rpmag'),
]
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], ...
for panel, (x_name, y_name) in zip(axes.flat, predictor_pairs):
    panel.scatter(X[x_name], X[y_name])
    panel.set_title(f'{x_name} vs. {y_name}')

In [None]:
# Correlation matrix between the columns of X_use: np.corrcoef treats each row
# of its argument as one variable, hence the transpose
# (assumes X_use holds one observation per row — TODO confirm).
np.corrcoef(np.transpose(X_use))

## Appendix: clusters

In [None]:
# Read the full Mexico catalogue.
df_allmex = load_data(filename='mexico_all.csv')
# The last column is empty when loaded, so remove it.
trailing_column = df_allmex.columns[-1]
df_allmex = df_allmex.drop(columns=trailing_column)
# Trim spaces, '|' and '#' from both ends of every column name.
df_allmex.columns = [name.strip(' |#') for name in df_allmex.columns]


In [None]:
# plt.scatter(df_allmex['xx'], df_allmex['yy'])
# Sky positions coloured by field. The fields are visited in sorted order, so
# colours and legend entries are the same on every run (iterating over a set
# gives an arbitrary order), and rows without a field add no empty entry.
for field_name in sorted(df_allmex['field'].dropna().unique()):
    in_field = df_allmex['field'] == field_name
    plt.scatter(df_allmex.loc[in_field, 'alfa'], df_allmex.loc[in_field, 'delta'], label=field_name)
plt.xlim([25, 300])
plt.xlabel('alpha')
plt.ylabel('delta')
plt.title('alpha vs. delta by fields')
plt.legend()

In [None]:
# One alfa/delta panel per field, two panels per row. The grid is sized from
# the number of fields instead of a fixed 13 rows, so any number of fields
# fits, and the fields are sorted so the layout is the same on every run.
field_names = sorted(df_allmex['field'].dropna().unique())
n_cols = 2
n_rows = max(1, -(-len(field_names) // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 10 * n_rows), squeeze=False)
for e, field_name in enumerate(field_names):
    row, col = divmod(e, n_cols)
    in_field = df_allmex['field'] == field_name
    axes[row, col].scatter(df_allmex.loc[in_field, 'alfa'], df_allmex.loc[in_field, 'delta'])
    axes[row, col].set_title(field_name)
# Hide the panels left over when the number of fields does not fill the grid.
for unused_panel in axes.flat[len(field_names):]:
    unused_panel.set_visible(False)

### Extended with Gaia and 2MASS (Bp, Rp, J & K magnitudes)

In [None]:
# Read the Mexico catalogue matched with Gaia and 2MASS (per the file name).
df_allmex = load_data(filename='masterallmatchwithGaiaand2MASS.csv')
# Trim spaces, '|' and '#' from both ends of every column name.
df_allmex.columns = [name.strip(' |#') for name in df_allmex.columns]

In [None]:
# List the column labels of the loaded catalogue
df_allmex.columns

In [None]:
f = 'Zwitter-007'  # In this field, we expect cluster

# Take all the data from the selected field (or from every field when f is None)
# where J, K, Bp, Rp magnitudes as well as parallax are given
field_selected = df_allmex['field'].notna() if f is None else df_allmex['field'] == f
field_all_data_filter = (
    field_selected
    & df_allmex['parallax'].notna()
    & df_allmex['j_m'].notna()
    & df_allmex['ks_m'].notna()
    & df_allmex['phot_bp_mean_mag'].notna()
    & df_allmex['phot_rp_mean_mag'].notna()
)
print(f'Field {f} has {field_all_data_filter.sum()} points with all the data.')

# Copy the selection so that the new columns are added to an independent frame
# instead of to a slice of df_allmex (avoids SettingWithCopyWarning).
df_filtered = df_allmex[field_all_data_filter].copy()
df_filtered['cb_field_shift'] = df_filtered['field'].map(cb_field_shifts)
df_filtered['ba_field_shift'] = df_filtered['field'].map(ba_field_shifts)
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'phot_bp_mean_mag', 'phot_rp_mean_mag', 'j_m', 'ks_m']
# Rename by label, not by position: the C-B and B-A columns keep their own
# names and only the Gaia / 2MASS colours get the short names used for the model.
# NOTE(review): the previous positional rename labelled C-B as 'magbr3_magar3'
# and B-A as 'magcr3_magbr3' — confirm the training predictors were not built
# with the same swap.
df_diffs = get_magnitude_diffs(df_filtered, ordered_mag_columns)[
    ['magcr3_magbr3', 'magbr3_magar3', 'phot_bp_mean_mag_phot_rp_mean_mag', 'j_m_ks_m']
].rename(columns={
    'phot_bp_mean_mag_phot_rp_mean_mag': 'bpmag_rpmag',
    'j_m_ks_m': 'jmag_kmag',
})
# Drop also the rows with out-of-range B-A, C-B magnitude diffs
diffs_in_range = (
    (df_diffs['magcr3_magbr3'] > -4)
    & (df_diffs['magcr3_magbr3'] < 4)
    & (df_diffs['magbr3_magar3'] < 4)
)
df_filtered = df_filtered[diffs_in_range]
df_diffs = df_diffs[diffs_in_range]
# Independent copy in the column order expected by the models; later cells
# overwrite these columns.
X_mex = df_diffs[['magbr3_magar3', 'magcr3_magbr3', 'jmag_kmag', 'bpmag_rpmag']].copy()
print(f'After C-B, B-A out-of-range filtering, there remains {X_mex.shape[0]} observations from field {f}.')

In [None]:
# 3D view of the field: alfa, delta and 1/parallax, restricted to rows whose
# 1/parallax lies strictly between 0 and 10.
fig = plt.figure(figsize=(10, 10))
inverse_parallax = 1/df_filtered['parallax']
filter_paralax = (0 < inverse_parallax) & (inverse_parallax < 10)

axes = fig.add_subplot(1, 1, 1, projection='3d')
axes.scatter(
    df_filtered.loc[filter_paralax, 'alfa'],
    df_filtered.loc[filter_paralax, 'delta'],
    inverse_parallax[filter_paralax],
)
axes.set_title(f)

In [None]:
# Distribution of 1/parallax for the rows whose parallax exceeds 0.1
parallax_above_limit = df_filtered['parallax'] > 1/10
plt.hist(1/df_filtered.loc[parallax_above_limit, 'parallax'], bins=40)

In [None]:
# Overlaid histograms of the four colour predictors (could also loop over
# X_use.columns), shown on a common x-range.
predictor_names = ('magbr3_magar3', 'jmag_kmag', 'bpmag_rpmag', 'magcr3_magbr3')
for c in predictor_names:
    plt.hist(X_mex[c], label=c, bins=40)
plt.xlim([-1, 2])
plt.legend()

In [None]:
# Same transformation as for the training predictors: subtract the per-field
# shifts, then take the cube / square roots. All four columns are built in one
# assign() on a new frame, so nothing is written into a slice of df_diffs.
# Rows with a missing field shift or a negative root argument become NaN and
# are removed by dropna().
X_mex = X_mex.assign(
    magcr3_magbr3=(X_mex['magcr3_magbr3'] - df_filtered['cb_field_shift'] + .75)**(1/3),
    magbr3_magar3=X_mex['magbr3_magar3'] - df_filtered['ba_field_shift'],
    jmag_kmag=X_mex['jmag_kmag']**(1/2),
    bpmag_rpmag=X_mex['bpmag_rpmag']**(1/2),
).dropna()
# Keep the catalogue rows in step with the surviving predictor rows.
df_filtered = df_filtered.loc[X_mex.index]
X_mex.shape

In [None]:
# Overlaid histograms of the four transformed predictors (could also loop over
# X_mex.columns), shown on a common x-range.
predictor_names = ('magbr3_magar3', 'jmag_kmag', 'bpmag_rpmag', 'magcr3_magbr3')
for c in predictor_names:
    plt.hist(X_mex[c], label=c, bins=40)
plt.xlim([-1, 2])
plt.legend()

In [None]:
# Predict T_eff and log(g) for this field with the linear models m_teff / m_logg
# fitted in an earlier cell. This must run before 'teff' / 'logg' columns are
# appended to X_mex, because the models were fitted without those columns.
teff_mex = m_teff.predict(X_mex)
logg_mex = m_logg.predict(X_mex)

In [None]:
# Rows whose 1/parallax lies strictly between 1.9 and 2.1 are marked as
# 'cluster'; rows with logg_gspphot below 3.5 are marked as 'giants'.
inverse_parallax = 1/df_filtered['parallax']
dist_filter = (inverse_parallax < 2.1) & (inverse_parallax > 1.9)
giants = df_filtered['logg_gspphot'] < 3.5

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
# One panel per quantity: catalogue (gspphot) value on x, value predicted from
# the magnitude differences on y, with a green 1:1 line over the given range.
comparison_panels = (
    (axes[0], 'teff', df_filtered['teff_gspphot'], teff_mex, [4000, 8000]),
    (axes[1], 'logg', df_filtered['logg_gspphot'], logg_mex, [1.5, 5]),
)
for ax_panel, quantity_name, catalogue_values, model_values, axis_limits in comparison_panels:
    ax_panel.scatter(catalogue_values, model_values, label='all')
    ax_panel.scatter(catalogue_values[giants], model_values[giants], color='r', label='giants')
    ax_panel.scatter(catalogue_values[dist_filter], model_values[dist_filter],
                     color='k', marker='+', label='cluster')
    ax_panel.plot(axis_limits, axis_limits, color='g')
    ax_panel.set_title(f'{quantity_name} from magnitude diffs on Mexico observations field {f}')
    ax_panel.set_xlabel('true')
    ax_panel.set_xlim(axis_limits)
    ax_panel.set_ylabel('predicted')
    ax_panel.legend()

# Mean absolute T_eff error over the giants only.
print(mean_absolute_error(df_filtered['teff_gspphot'][giants], teff_mex[giants]))

In [None]:
# Catalogue metallicity (mh_gspphot): every row in blue, rows selected by
# dist_filter in red.
mh_catalogue = df_filtered['mh_gspphot']
plt.hist(mh_catalogue, bins=40, color='b')
plt.hist(mh_catalogue[dist_filter], bins=40, color='r')


In [None]:
# Append the predicted T_eff / log(g), then scale with the already-fitted
# normalizer. transform() returns a bare array, so the index and column labels
# are re-attached afterwards.
X_mex = X_mex.assign(teff=teff_mex, logg=logg_mex)
ind, cols = X_mex.index, X_mex.columns
X_mex = pd.DataFrame(normalizer.transform(X_mex), index=ind, columns=cols)

In [None]:
# Predicted [Fe/H]: every row in red, rows selected by dist_filter in green.
shared_hist_options = dict(bins=40, alpha=.5)
plt.hist(nn_model.predict(X_mex), color='r', **shared_hist_options)
plt.hist(nn_model.predict(X_mex[dist_filter]), color='g', **shared_hist_options)