## Boosting with applications in astronomy

This is a short introduction to boosting algorithms, with an application in astronomy: classifying SDSS objects as stars or galaxies.

In [1]:
# Basic packages and plotting
import numpy as np 
import scipy.io as sio
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
import pandas as pd
#import seaborn as sns
rcParams['font.family'] = 'serif'

# Adjust rc parameters to make plots pretty
def plot_pretty(dpi=200, fontsize=9):
    """Apply the notebook's matplotlib rc settings.

    Parameters
    ----------
    dpi : int, optional
        Resolution used for saved image files (``savefig.dpi``).
    fontsize : int, optional
        Base font size (``font.size``).

    Returns
    -------
    None
        The function only updates matplotlib's global rc configuration.
    """
    import matplotlib.pyplot as plt

    # (rc group, settings) pairs, applied in the listed order.
    rc_settings = (
        ("savefig", {"dpi": dpi}),                       # dpi resolution of saved image files
        ('text', {"usetex": True}),                      # use LaTeX to process labels
        ('font', {"size": fontsize}),                    # fontsize
        ('xtick', {"direction": 'in'}),                  # make axes ticks point inward
        ('ytick', {"direction": 'in'}),
        ('xtick.major', {"pad": 10}),
        ('xtick.minor', {"pad": 5}),
        ('ytick.major', {"pad": 10}),
        ('ytick.minor', {"pad": 5}),
        ('lines', {"dotted_pattern": [0.5, 1.1]}),       # fix dotted lines
    )
    for group, options in rc_settings:
        plt.rc(group, **options)

plot_pretty()

In [2]:
# Scikit-learn and XGBoost
from sklearn.ensemble import AdaBoostClassifier #Adaptive Boosting
from sklearn.ensemble import GradientBoostingClassifier #Gradient Boosting
#from xgboost import XGBClassifier #Extreme Gradient Boosting
from sklearn.linear_model import LogisticRegression # Logistic regression for comparison

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### Astronomy-related Example

Import a catalog of SDSS objects (Stars, Galaxies, QSOs).

Specifically, a feature matrix (`X_feat`) that contains magnitudes $u,g,r,i,z$, colors (all the possible differences of magnitudes), and ratios of magnitudes.

We also import the corresponding class labels (`y_lab`), encoded as follows:

- Stars $\to y = 0$

- Galaxies $\to y = 1$

- Quasars $\to y = 2$

In [3]:
# Import feature matrix
# Import feature matrix (path is relative to the notebook's working directory)
X_feat = np.load("Feature_matrix.npy")
# Import labels (0 = star, 1 = galaxy, 2 = quasar);
# presumably one label per row of X_feat -- TODO confirm
y_lab = np.load("Labels.npy")

Now, keep only stars and galaxies.

In [4]:
# Binary problem: drop quasars (label 2), keeping stars (0) and galaxies (1).
y_bn = y_lab[y_lab != 2]        # labels for binary classification
X_feat_bn = X_feat[y_lab != 2]  # matching feature rows

**Split into Training-Validation-Test sets**

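Using `sklearn`'s `train_test_split`, this can be done in two steps: first hold out 30% of the data, then split that held-out part in half, giving 70% training, 15% validation, and 15% test.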

In [5]:
# Step 1: hold out 30% of the binary sample; the remaining 70% is the training set.
X_train_bn, X_valtest_bn, y_train_bn, y_valtest_bn = train_test_split(
    X_feat_bn,
    y_bn,
    test_size=0.3,
    random_state=42,
)
# Step 2: cut the held-out part in half -> validation and test sets.
X_val_bn, X_test_bn, y_val_bn, y_test_bn = train_test_split(
    X_valtest_bn,
    y_valtest_bn,
    test_size=0.5,
    random_state=42,
)

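**Rescale features**: We rescale all the features so that each one has mean 0 and standard deviation 1 on the training set. The scaler is fit on the training set only, and the same transformation is then applied to the validation and test sets.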

In [6]:
# Fit the scaler on the training split only, so that no validation/test
# statistics enter the preprocessing.
scaler = StandardScaler().fit(X_train_bn)

# Rescale train, validation and test with the training-set statistics.
X_train_bn, X_val_bn, X_test_bn = (
    scaler.transform(split) for split in (X_train_bn, X_val_bn, X_test_bn)
)

In [7]:
# Number of objects (rows) in the training set
print(X_train_bn.shape[0])

14000


### **Baseline: Logistic Regression**

In [8]:
# Unregularised logistic regression, used as the baseline classifier.
# NOTE(review): the string penalty='none' was deprecated in scikit-learn 1.2 and
# removed in 1.4; on those versions use penalty=None instead -- confirm against
# the installed scikit-learn version.
LR = LogisticRegression(penalty='none', max_iter=10000).fit(X_train_bn, y_train_bn)

# Predictions on the held-out test set
y_pred_LR = LR.predict(X_test_bn)

In [9]:
# Test-set scores of the logistic-regression baseline.
# precision_score / recall_score are called with their default settings.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(metric_label, metric_fn(y_test_bn, y_pred_LR))

Accuracy: 0.8156666666666667
Precision: 0.8235294117647058
Recall: 0.8133159268929504


### AdaBoost

Adaptive Boosting

![Adaboost](Adaboost.png)
(not mine)

![Boost_example](Boost_example.png)

In [10]:
# AdaBoost with scikit-learn's default settings; random_state fixed for reproducibility.
ADA = AdaBoostClassifier(random_state=0).fit(X_train_bn, y_train_bn)

# Predictions on the held-out test set
y_pred_ADA = ADA.predict(X_test_bn)

In [11]:
# Test-set scores of the AdaBoost classifier.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(metric_label, metric_fn(y_test_bn, y_pred_ADA))

Accuracy: 0.8923333333333333
Precision: 0.8847867600254615
Recall: 0.9073107049608355


### Gradient Boosting (classifier)

![Boost_1](Boost_1.png)
![Boost_2](Boost_2.png)
![Boost_3](Boost_3.png)
![Boost_4](Boost_4.png)

In [28]:
# Gradient boosting: 100 trees of depth 5 with learning rate 1.0;
# random_state fixed for reproducibility.
GRAD = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.,
    max_depth=5,
    random_state=0,
).fit(X_train_bn, y_train_bn)

# Predictions on the held-out test set
y_pred_GRAD = GRAD.predict(X_test_bn)

In [29]:
# Test-set scores of the gradient-boosting classifier.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(metric_label, metric_fn(y_test_bn, y_pred_GRAD))

Accuracy: 0.916
Precision: 0.91343669250646
Recall: 0.922976501305483


### XGBoost (Extreme Gradient Boosting)

![XGBoost](XGboost.png)

In [14]:
from xgboost import XGBClassifier #Extreme Gradient Boosting

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ['dlopen(/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [15]:
# XGBoost with decision stumps (max_depth=1), 100 boosting rounds, learning rate 1.0.
# NOTE(review): this cell needs `from xgboost import XGBClassifier` to have
# succeeded; if that import fails (e.g. missing OpenMP runtime), XGBClassifier
# is undefined here and this cell raises NameError.
XGB = XGBClassifier(
    max_depth=1,
    learning_rate=1.0,
    n_estimators=100,
    random_state=0,
).fit(X_train_bn, y_train_bn)

# Predictions on the held-out test set
y_pred_XGB = XGB.predict(X_test_bn)

NameError: name 'XGBClassifier' is not defined