In [None]:
from matplotlib import pyplot as plt

import numpy as np

import pandas as pd

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler

from utils.prepare_data import clean_data, filter_rows_by_std, get_magnitude_diffs, load_data


# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and the
# old names were removed in a later release. Use whichever name this installation
# provides; fall back to the legacy name so old installations behave as before.
_darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if name in plt.style.available),
    'seaborn-darkgrid',
)
plt.style.use(_darkgrid_style)
plt.rcParams["figure.figsize"] = (10, 10)


# Data preparation

In [None]:
# Load the dataset and pass it straight through the project's cleaning helper
df = clean_data(load_data())

In [None]:
# Report, per column, how many values are missing.
# Non-numeric columns are only listed: the NaN count is meaningful for numeric data.
# (The dtype test replaces a bare `except:` that silently swallowed every error.)
for x in df.columns:
    if not pd.api.types.is_numeric_dtype(df[x]):
        print('non-numeric column: ', x)
        continue
    s = int(df[x].isna().sum())
    if s:
        print(x, 'have', s, 'missing data')

# Modeling

## Magnitude diffs & excessive cleanup

In [None]:
# Basic cleaning (applied again before deriving the features)
df = clean_data(df)

# Magnitude columns, in the order handed to get_magnitude_diffs
ordered_mag_columns = ['magcr3', 'magbr3', 'magar3', 'bpmag', 'gmag', 'rpmag', 'jmag', 'kmag']
df_diffs = get_magnitude_diffs(df, ordered_mag_columns)
# Bp-Rp is added explicitly, on top of whatever get_magnitude_diffs produced
df_diffs['bpmag_rpmag'] = df['bpmag'].sub(df['rpmag'])

# Every listed uncertainty column gets the same 0.05 threshold
std_thresholds = dict.fromkeys(['sigcr3', 'sigbr3', 'sigar3', 'ejmag', 'ekmag'], 0.05)
rows_within_std = filter_rows_by_std(df_diffs, df, std_thresholds=std_thresholds)
# Finally drop rows that still contain missing values
df_diffs_filtered = rows_within_std.dropna()

In [None]:
# Row counts: cleaned data, magnitude differences, rows remaining after filtering
len(df), len(df_diffs), len(df_diffs_filtered)

## Predicting magnitudes from gravity, temperature and chemistry

### Field shifts from Bayesian MCMC approach

In [None]:
# Per-field C-B shift values (see the section heading for their origin)
cb_shift_by_field = {
    'Field-1': 0.3319922760263798,
    'Field-2': 0.41525421695893566,
    'Field-3': 0.4374036431066466,
    'Field-4': 0.41137730050813004,
    'Field-5': 0.35876049518667247,
    'Field-6': 0.42670825326670114,
    'Field-8': 0.4220114380659433,
    'Field-9': 0.38748489552795107,
    'Field-10': 0.38493277215437155,
    'Field-11': 0.3910498880361436,
    'Field-15': 0.37523785738068216,
    'Field-16': 0.37873686102483073,
    'Field-17': 0.37748938994827175,
    'Field-18': 0.3650478096668249,
    'Field-19': 0.39866733152517186,
    'Field-20': 0.39821298519891424,
    'Field-24': 0.42353001068319074,
    'Field-25': 0.432486767263635,
    'Field-27': 0.40814162530040954,
    'Field-28': 0.37364547999152314,
    'Field-29': 0.4262784749766956,
    'Field-32': 0.4809766006917441,
    'Field-33': 0.42479356859998646,
    'Field-34': 0.4177341235655455,
    'Field-35': 0.45922559449923883,
    'Field-38': 0.5008953767354531,
    'Field-39': 0.3607422231174212,
    'Field-40': 0.3808902958085871,
    'Field-41': 0.3588893561248595,
    'Field-42': 0.36663408584866286,
    'Field-45': 0.3572882366819113,
    'Field-46': 0.36898328437951294,
    'Field-47': 0.40957565746657026,
}

# Per-field B-A shift values
ba_shift_by_field = {
    'Field-1': 0.18080835515053956,
    'Field-2': 0.19971150768603052,
    'Field-3': 0.16673795188561255,
    'Field-4': 0.22764694756560977,
    'Field-5': 0.16393258009286565,
    'Field-6': 0.2218356785159324,
    'Field-8': 0.21095491325215732,
    'Field-9': 0.19161996881699248,
    'Field-10': 0.1837026412942269,
    'Field-11': 0.18115561005808317,
    'Field-15': 0.15200887755485093,
    'Field-16': 0.1656647564454164,
    'Field-17': 0.1667862329753091,
    'Field-18': 0.13141335282697575,
    'Field-19': 0.12365755419411407,
    'Field-20': 0.23729289979478213,
    'Field-24': 0.15642818357320198,
    'Field-25': 0.1840500586940485,
    'Field-27': 0.22263591110330505,
    'Field-28': 0.14583983959304825,
    'Field-29': 0.1633080989641097,
    'Field-32': 0.10157314049954563,
    'Field-33': 0.1320147960009749,
    'Field-34': 0.14638373988925385,
    'Field-35': 0.1275999413428845,
    'Field-38': 0.12160791831422073,
    'Field-39': 0.17665871089194196,
    'Field-40': 0.1635198872846553,
    'Field-41': 0.2009430095532145,
    'Field-42': 0.1866283299753808,
    'Field-45': 0.1722000325556393,
    'Field-46': 0.15581685616524635,
    'Field-47': 0.22921627154498236,
}

# Attach one shift column per colour; fields absent from a mapping end up as NaN
for shift_column, shift_by_field in (
    ('cb_field_shift', cb_shift_by_field),
    ('ba_field_shift', ba_shift_by_field),
):
    df[shift_column] = df['field'].map(shift_by_field)


### Modeling C-B magnitude diffs

In [None]:
# Features: chemistry, temperature and gravity for the rows that survived filtering
X_use = df.loc[df_diffs_filtered.index, ['cafe', 'feh', 'teff', 'logg']]

# Target: C-B magnitude difference
y_use = df_diffs_filtered['magcr3_magbr3']

# Fixed seed so the 75/25 split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_use,
    y_use,
    test_size=0.25,
    random_state=314,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and its
# statistics are reused for the test split.
# The original row index and column labels are carried over, so X_train/X_test keep
# the same labels as y_train/y_test instead of being renumbered from zero.
normalizer = StandardScaler()
X_train = pd.DataFrame(
    data=normalizer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test = pd.DataFrame(
    data=normalizer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Ordinary least squares on the standardised features (fit returns the estimator itself)
m_cb = LinearRegression().fit(X_train, y_train)
m_cb

In [None]:
# Model predictions on both splits
y_train_predict = m_cb.predict(X_train)
y_test_predict = m_cb.predict(X_test)

# Common range for the identity line: extremes over targets and predictions
y_min = min(map(min, (y_use, y_train_predict, y_test_predict)))
y_max = max(map(max, (y_use, y_train_predict, y_test_predict)))

# Predicted vs. true values, with the dotted line marking a perfect prediction
plt.scatter(y_train, y_train_predict, label='train', alpha=.6)
plt.scatter(y_test, y_test_predict, label='test', color='r', alpha=.6)
plt.plot([y_min, y_max], [y_min, y_max], linestyle=':', color='g')
plt.xlabel('True C-B value')
plt.ylabel('Predicted C-B value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test)
scores = (
    mean_absolute_error(y_train, y_train_predict),
    mean_absolute_error(y_test, y_test_predict),
    r2_score(y_train, y_train_predict),
    r2_score(y_test, y_test_predict),
)
print('Mean absolute error on train/test set and R2 score on train/test set:', scores)

In [None]:
# Remove the per-field C-B shift from the targets.
# The shifted values are recomputed from the unshifted `y_use`, so re-running this
# cell cannot subtract the shift a second time.
y_train = y_use.loc[y_train.index] - df.loc[y_train.index, 'cb_field_shift']
y_test = y_use.loc[y_test.index] - df.loc[y_test.index, 'cb_field_shift']

# A field missing from the shift mapping yields NaN here; fail with a clear message
# instead of a less specific error during the later fit/scoring.
if y_train.isna().any() or y_test.isna().any():
    raise ValueError("cb_field_shift is missing for at least one row; check the field-to-shift mapping")

In [None]:
# Refit the linear model, now on the shift-corrected C-B targets
m_cb = LinearRegression().fit(X_train, y_train)
m_cb

In [None]:
y_train_predict = m_cb.predict(X_train)
y_test_predict = m_cb.predict(X_test)

# Range of the identity line, taken from the values that are actually plotted.
# `y_use` still holds the unshifted C-B values, so including it would stretch the
# line (and the axes) beyond the shift-corrected data.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train, y_test, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True C-B value')
plt.ylabel('Predicted C-B value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test)
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train, y_train_predict),
       mean_absolute_error(y_test, y_test_predict),
       r2_score(y_train, y_train_predict),
       r2_score(y_test, y_test_predict)))

In [None]:
# Linear regression on a power-transformed target (PowerTransformer with default
# settings); predictions come back on the original target scale.
target_transformer = PowerTransformer()
m_cb = TransformedTargetRegressor(
    regressor=LinearRegression(),
    transformer=target_transformer,
).fit(X_train, y_train)
m_cb


In [None]:
y_train_predict = m_cb.predict(X_train)
y_test_predict = m_cb.predict(X_test)

# Range of the identity line, taken from the values that are actually plotted.
# `y_use` still holds the unshifted C-B values, so including it would stretch the
# line (and the axes) beyond the shift-corrected data.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train, y_test, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True C-B value')
plt.ylabel('Predicted C-B value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test)
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train, y_train_predict),
       mean_absolute_error(y_test, y_test_predict),
       r2_score(y_train, y_train_predict),
       r2_score(y_test, y_test_predict)))

In [None]:
# Cube-root transform of the C-B targets after adding a 0.75 offset.
# np.cbrt returns the real root for negative inputs, whereas `x ** (1/3)` yields NaN
# there; for non-negative inputs both give the same value.
# NOTE: this cell overwrites y_train/y_test, so run it once per split.
y_train = np.cbrt(y_train + .75)
y_test = np.cbrt(y_test + .75)

In [None]:
# Linear model fitted on the cube-root-transformed C-B targets
m_cb = LinearRegression().fit(X_train, y_train)
m_cb

In [None]:
# Map targets and predictions back from cube-root space to C-B values.
# Fresh names are used for the inverted targets so that re-running this cell does not
# apply the inverse to `y_train`/`y_test` a second time.
y_train_true = y_train**3 - .75
y_test_true = y_test**3 - .75
y_train_predict = m_cb.predict(X_train)**3 - .75
y_test_predict = m_cb.predict(X_test)**3 - .75

# Range of the identity line, taken from the values that are actually plotted.
# `y_use` holds the unshifted C-B values and would put the limits on a different scale.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train_true, y_test_true, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train_true, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test_true, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True C-B value')
plt.ylabel('Predicted C-B value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test), all on the back-transformed scale
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train_true, y_train_predict),
       mean_absolute_error(y_test_true, y_test_predict),
       r2_score(y_train_true, y_train_predict),
       r2_score(y_test_true, y_test_predict)))

# Negative B-A values look suspicious ...
# plt.scatter(y_train[df_diffs_filtered['magbr3_magar3'][y_train.index]<0]**3-.75,
#             m.predict(X_train)[df_diffs_filtered['magbr3_magar3'][y_train.index]<0]**3-.75, color='k')
# plt.scatter(y_test[df_diffs_filtered['magbr3_magar3'][y_test.index]<0]**3-.75,
#             m.predict(X_test)[df_diffs_filtered['magbr3_magar3'][y_test.index]<0]**3-.75, color='k')


### Modeling B-A magnitude diffs

In [None]:
# Features: chemistry, temperature and gravity for the rows that survived filtering
X_use = df.loc[df_diffs_filtered.index, ['cafe', 'feh', 'teff', 'logg']]

# Target: B-A magnitude difference
y_use = df_diffs_filtered['magbr3_magar3']

# Fixed seed so the 75/25 split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_use,
    y_use,
    test_size=0.25,
    random_state=314,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and its
# statistics are reused for the test split.
# The original row index and column labels are carried over, so X_train/X_test keep
# the same labels as y_train/y_test instead of being renumbered from zero.
normalizer = StandardScaler()
X_train = pd.DataFrame(
    data=normalizer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test = pd.DataFrame(
    data=normalizer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Ordinary least squares for the B-A target (fit returns the estimator itself)
m_ba = LinearRegression().fit(X_train, y_train)
m_ba

In [None]:
# Model predictions on both splits
y_train_predict = m_ba.predict(X_train)
y_test_predict = m_ba.predict(X_test)

# Common range for the identity line: extremes over targets and predictions
y_min = min(map(min, (y_use, y_train_predict, y_test_predict)))
y_max = max(map(max, (y_use, y_train_predict, y_test_predict)))

# Predicted vs. true values, with the dotted line marking a perfect prediction
plt.scatter(y_train, y_train_predict, label='train', alpha=.6)
plt.scatter(y_test, y_test_predict, label='test', color='r', alpha=.6)
plt.plot([y_min, y_max], [y_min, y_max], linestyle=':', color='g')
plt.xlabel('True B-A value')
plt.ylabel('Predicted B-A value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test)
scores = (
    mean_absolute_error(y_train, y_train_predict),
    mean_absolute_error(y_test, y_test_predict),
    r2_score(y_train, y_train_predict),
    r2_score(y_test, y_test_predict),
)
print('Mean absolute error on train/test set and R2 score on train/test set:', scores)

In [None]:
# Remove the per-field B-A shift from the targets.
# The shifted values are recomputed from the unshifted `y_use`, so re-running this
# cell cannot subtract the shift a second time.
y_train = y_use.loc[y_train.index] - df.loc[y_train.index, 'ba_field_shift']
y_test = y_use.loc[y_test.index] - df.loc[y_test.index, 'ba_field_shift']

# A field missing from the shift mapping yields NaN here; fail with a clear message
# instead of a less specific error during the later fit/scoring.
if y_train.isna().any() or y_test.isna().any():
    raise ValueError("ba_field_shift is missing for at least one row; check the field-to-shift mapping")

In [None]:
# Refit the linear model, now on the shift-corrected B-A targets
m_ba = LinearRegression().fit(X_train, y_train)
m_ba

In [None]:
y_train_predict = m_ba.predict(X_train)
y_test_predict = m_ba.predict(X_test)

# Range of the identity line, taken from the values that are actually plotted.
# `y_use` still holds the unshifted B-A values, so including it would stretch the
# line (and the axes) beyond the shift-corrected data.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train, y_test, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True B-A value')
plt.ylabel('Predicted B-A value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test)
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train, y_train_predict),
       mean_absolute_error(y_test, y_test_predict),
       r2_score(y_train, y_train_predict),
       r2_score(y_test, y_test_predict)))

### Modeling J-K magnitude diffs

In [None]:
# Features: chemistry, temperature and gravity for the rows that survived filtering
X_use = df.loc[df_diffs_filtered.index, ['cafe', 'feh', 'teff', 'logg']]

# Target: square root of the J-K magnitude difference
# (negative differences, if any, become NaN under this transform)
y_use = df_diffs_filtered['jmag_kmag'] ** 0.5

# Fixed seed so the 75/25 split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_use,
    y_use,
    test_size=0.25,
    random_state=314,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and its
# statistics are reused for the test split.
# The original row index and column labels are carried over, so X_train/X_test keep
# the same labels as y_train/y_test instead of being renumbered from zero.
normalizer = StandardScaler()
X_train = pd.DataFrame(
    data=normalizer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test = pd.DataFrame(
    data=normalizer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Linear model fitted on the square-root-transformed J-K targets
m_jk = LinearRegression().fit(X_train, y_train)
m_jk

In [None]:
# Undo the square-root transform for targets and predictions.
# Fresh names are used for the inverted targets so that re-running this cell does not
# square `y_train`/`y_test` a second time.
y_train_true = y_train**2
y_test_true = y_test**2
y_train_predict = m_jk.predict(X_train)**2
y_test_predict = m_jk.predict(X_test)**2

# Range of the identity line, taken from the back-transformed values that are plotted.
# `y_use` is still on the square-root scale and would put the limits on the wrong scale.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train_true, y_test_true, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train_true, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test_true, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True J-K value')
plt.ylabel('Predicted J-K value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test), all on the back-transformed scale
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train_true, y_train_predict),
       mean_absolute_error(y_test_true, y_test_predict),
       r2_score(y_train_true, y_train_predict),
       r2_score(y_test_true, y_test_predict)))

### Modeling Bp-Rp magnitude diffs

In [None]:
# Features: chemistry, temperature and gravity for the rows that survived filtering
X_use = df.loc[df_diffs_filtered.index, ['cafe', 'feh', 'teff', 'logg']]

# Target: square root of the Bp-Rp magnitude difference
# (negative differences, if any, become NaN under this transform)
y_use = df_diffs_filtered['bpmag_rpmag'] ** 0.5

# Fixed seed so the 75/25 split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_use,
    y_use,
    test_size=0.25,
    random_state=314,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and its
# statistics are reused for the test split.
# The original row index and column labels are carried over, so X_train/X_test keep
# the same labels as y_train/y_test instead of being renumbered from zero.
normalizer = StandardScaler()
X_train = pd.DataFrame(
    data=normalizer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test = pd.DataFrame(
    data=normalizer.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Linear model fitted on the square-root-transformed Bp-Rp targets
m_br = LinearRegression().fit(X_train, y_train)
m_br

In [None]:
# Undo the square-root transform for targets and predictions.
# Fresh names are used for the inverted targets so that re-running this cell does not
# square `y_train`/`y_test` a second time.
y_train_true = y_train**2
y_test_true = y_test**2
y_train_predict = m_br.predict(X_train)**2
y_test_predict = m_br.predict(X_test)**2

# Range of the identity line, taken from the back-transformed values that are plotted.
# `y_use` is still on the square-root scale and would put the limits on the wrong scale.
plotted_values = np.concatenate(
    [np.ravel(values) for values in (y_train_true, y_test_true, y_train_predict, y_test_predict)]
)
y_min, y_max = plotted_values.min(), plotted_values.max()

plt.scatter(y_train_true, y_train_predict, alpha=.6, label='train')
plt.scatter(y_test_true, y_test_predict, alpha=.6, color='r', label='test')
plt.plot([y_min, y_max], [y_min, y_max], color='g', linestyle=':')  # perfect-prediction line
plt.xlabel('True Bp-Rp value')
plt.ylabel('Predicted Bp-Rp value')
plt.legend()

# (MAE train, MAE test, R2 train, R2 test), all on the back-transformed scale
print('Mean absolute error on train/test set and R2 score on train/test set:',
      (mean_absolute_error(y_train_true, y_train_predict),
       mean_absolute_error(y_test_true, y_test_predict),
       r2_score(y_train_true, y_train_predict),
       r2_score(y_test_true, y_test_predict)))


In [None]:
# Fitted coefficient vectors, one row per model in the order C-B, B-A, J-K, Bp-Rp.
# Note the targets differ in scale: m_cb was last fitted on cube-root targets,
# m_jk and m_br on square-root targets.
np.array([model.coef_ for model in (m_cb, m_ba, m_jk, m_br)])

In [None]:
# Hand-entered 4x4 table of coefficient vectors, one vector per row.
# NOTE(review): presumably rounded copies of the fitted model coefficients — confirm
# against the model output before relying on them.
c = [
    [-0.0018, 0.030, -0.097, -0.034],
    [-0.0036, 0.021, -0.15, -0],
    [0.0006, 0.001, -0.10, -0.023],
    [0.0017, 0.0095, -0.12, -0.022],
]

In [None]:
# Angles (in degrees) between every pair of distinct coefficient vectors in `c`.
# Pairs are visited as (1,0), (2,0), (2,1), ... so the print order matches the row order.
coef_vectors = np.asarray(c, dtype=float)
for i in range(len(coef_vectors)):
    for j in range(i):
        u, v = coef_vectors[i], coef_vectors[j]
        cos_angle = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
        # Rounding can push the cosine marginally outside [-1, 1] for (anti)parallel
        # vectors; clipping keeps arccos inside its domain.
        print(np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0))))