# Gamma and Proton Classification Baseline



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

import sys
import pathlib

# Put the directory two levels up on the import path (no import that relies on
# it is visible in this part of the notebook).
sys.path.append("../..")

# Shared seed: passed to train_test_split and to every model constructor below.
random_state = 420

In [2]:
# Root that contains the "data" directory, relative to the notebook location.
data_path = pathlib.Path("../../..")

# One parquet file per event class.
gammas = pd.read_parquet(data_path.joinpath("data", "magic-gammas-new-1.parquet"))
protons = pd.read_parquet(data_path.joinpath("data", "magic-protons.parquet"))

In [3]:
# Attach the target label (0 = proton, 1 = gamma) and discard every row that
# still contains a missing value. "class" is a Python keyword, hence the dict.
protons = protons.assign(**{"class": 0}).dropna()
gammas = gammas.assign(**{"class": 1}).dropna()

In [4]:
# Stack both classes row-wise, gammas first; each frame keeps its own row index.
dataset = pd.concat(objs=[gammas, protons], axis=0)

In [5]:
def combine(*features):
    """Join several 2-D feature blocks side by side.

    Every positional argument must have the same number of rows; the blocks
    are concatenated along axis 1 (columns) in the order given and returned
    as a single NumPy array.
    """
    # `features` is already a tuple, so it can be handed to NumPy directly.
    return np.concatenate(features, axis=1)

In [6]:
FEATURE_SIZE = 67

# The first FEATURE_SIZE columns are kept as candidate features.
X = dataset.iloc[:, 0:FEATURE_SIZE]
# Pick the label by name instead of by position, so the target stays correct
# even if "class" is not the last column of the frame.
y = dataset["class"]

# 80/20 split, seeded and stratified so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=random_state, stratify=y
)

# Turn each image column into one array per split. Presumably every cell holds
# one flattened image of equal length, giving a (rows, pixels) matrix -- verify.
X_train_image_m1 = np.array(X_train["clean_image_m1"].tolist())
X_test_image_m1 = np.array(X_test["clean_image_m1"].tolist())
X_train_image_m2 = np.array(X_train["clean_image_m2"].tolist())
X_test_image_m2 = np.array(X_test["clean_image_m2"].tolist())

# Names of the `hillas_*` columns: the same eight stems, first with the "m1"
# suffix and then with the "m2" suffix.
hillas_params = [
    f"hillas_{stem}_{suffix}"
    for suffix in ("m1", "m2")
    for stem in (
        "length",
        "width",
        "delta",
        "size",
        "cog_x",
        "cog_y",
        "sin_delta",
        "cos_delta",
    )
]

# Names of the `stereo_*` columns, in the order they are fed to the models.
stereo_params = [
    f"stereo_{stem}"
    for stem in (
        "direction_x",
        "direction_y",
        "zenith",
        "azimuth",
        "dec",
        "ra",
        "theta2",
        "core_x",
        "core_y",
        "impact_m1",
        "impact_m2",
        "impact_azimuth_m1",
        "impact_azimuth_m2",
        "shower_max_height",
        "xmax",
        "cherenkov_radius",
        "cherenkov_density",
        "baseline_phi_m1",
        "baseline_phi_m2",
        "image_angle",
        "cos_between_shower",
    )
]

# Per-split column subsets, selected by name from the lists defined above.
X_train_hillas = X_train.loc[:, hillas_params]
X_test_hillas = X_test.loc[:, hillas_params]

X_train_stereo = X_train.loc[:, stereo_params]
X_test_stereo = X_test.loc[:, stereo_params]

In [7]:
# Class fractions (index 0 = protons, index 1 = gammas) for the full label
# vector, then the test split, then the training split.
for labels in (y, y_test, y_train):
    print(np.bincount(labels) / len(labels))

[0.34867095 0.65132905]
[0.34867095 0.65132905]
[0.34867095 0.65132905]


In [8]:
def optimise_model(model, data_name, X_train, X_test, y_train, y_test, hyperparams, cv=2, n_iter=10):
    """Tune one classifier with a randomized search and report its accuracy.

    The inputs are standardised with ``StandardScaler`` and then handed to a
    ``RandomizedSearchCV`` that refits the best candidate on all of the
    (scaled) training data.

    Parameters
    ----------
    model : estimator class
        Class (not instance) to tune. It is instantiated as
        ``model(random_state=random_state)``, so every other constructor
        argument keeps its default unless it is listed in ``hyperparams``.
    data_name : str
        Label for the feature set; only used in the progress message.
    X_train, y_train
        Data used for the scaler, the cross-validated search and the refit.
    X_test, y_test
        Held-out data scored after fitting (printed as "Validation accuracy").
    hyperparams : dict or list of dict
        Search space, passed unchanged to ``RandomizedSearchCV``.
    cv : int, default 2
        Number of cross-validation folds used by the search.
    n_iter : int, default 10
        Number of candidates sampled by the search.

    Returns
    -------
    estimator
        ``best_estimator_`` of the search. It was fitted on *scaled* inputs
        and does not contain the scaler, so new data must be scaled the same
        way before calling it.
    """
    print(f"Optimizing model {model.__name__} with data {data_name}")

    search = RandomizedSearchCV(
        model(random_state=random_state),
        hyperparams,
        n_iter=n_iter,
        cv=cv,
        refit=True,
        # Seed the candidate sampling too; without this, two runs of the same
        # cell can pick different hyperparameter combinations.
        random_state=random_state,
    )
    pipeline = make_pipeline(StandardScaler(), search)

    pipeline.fit(X_train, y_train)
    print(f"Training accuracy: {pipeline.score(X_train, y_train)}")
    print(f"Validation accuracy: {pipeline.score(X_test, y_test)}")

    # The search is the last pipeline step; fetch its refitted winner.
    best_estimator = pipeline[-1].best_estimator_

    print(f"Best params are: {best_estimator.get_params()}\n")

    return best_estimator



# Using cleaned images concatenated

In [9]:
# Regularisation strengths tried for both linear models.
c_values = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

# Model class -> search space, iterated in insertion order.
search_spaces = {
    LogisticRegression: {"C": c_values},
    RandomForestClassifier: {
        "max_depth": [3, 5, 10, 20, 40, 75],
        "n_estimators": [10, 20, 50, 100, 200],
    },
    LinearSVC: {"C": c_values},
}

for model, params in search_spaces.items():
    try:
        # Features: the two cleaned image blocks placed side by side.
        optimise_model(
            model,
            "m1, m2",
            combine(X_train_image_m1, X_train_image_m2),
            combine(X_test_image_m1, X_test_image_m2),
            y_train,
            y_test,
            params,
        )
    except Exception as e:
        # Report the failure and carry on with the next model.
        print(f"Could not optimize {model.__name__} model\n{e}")

Optimizing model LogisticRegression with data m1, m2


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training accuracy: 0.6502659799752656
Validation accuracy: 0.648465956257306
Best params are: {'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 420, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Optimizing model RandomForestClassifier with data m1, m2
Training accuracy: 0.7782667253968523
Validation accuracy: 0.6829586460433361
Best params are: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 75, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 420, 'verbose': 0, 'warm_start': False}

Optimizing model LinearSVC with data m1, m2




Training accuracy: 0.6503252748742101
Validation accuracy: 0.6476527690717807
Best params are: {'C': 0.001, 'class_weight': None, 'dual': 'auto', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 420, 'tol': 0.0001, 'verbose': 0}



# Using cleaned images and hillas parameters

In [None]:
# Tune each model family on the cleaned images plus the Hillas columns.
for model, params in [
    (LogisticRegression, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}),
    (RandomForestClassifier, {
        "max_depth": [3, 5, 10, 20, 40, 75],
        "n_estimators": [10, 20, 50, 100, 200]
    }),
    (LinearSVC, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]})
]:
    try:
        optimise_model(
            model,
            # The log label names every feature group passed below.
            "m1, m2, hillas",
            combine(X_train_image_m1, X_train_image_m2, X_train_hillas),
            combine(X_test_image_m1, X_test_image_m2, X_test_hillas),
            y_train,
            y_test,
            params
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"Could not optimize {model.__name__} model\n{e}")

Optimizing model LogisticRegression with data m1, m2


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training accuracy: 0.7434436783167025
Validation accuracy: 0.7401358700255816
Best params are: {'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 420, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Optimizing model RandomForestClassifier with data m1, m2
Training accuracy: 0.879542412794145
Validation accuracy: 0.8029037559083132
Best params are: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 40, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': 420, 'verbose': 0, 'warm_start': False}

Optimizing model LinearSVC with data m1, m2




In [None]:
# LinearSVC on a reduced sample of images + Hillas columns (presumably because
# the full-size run above produced no result -- confirm).
n_rows = 10000

optimise_model(
    LinearSVC,
    "m1 m2 hillas",
    # Slice each block before concatenating so only n_rows rows are copied.
    combine(X_train_image_m1[:n_rows], X_train_image_m2[:n_rows], X_train_hillas.iloc[:n_rows]),
    combine(X_test_image_m1[:n_rows], X_test_image_m2[:n_rows], X_test_hillas.iloc[:n_rows]),
    # .iloc keeps the label slice positional, matching the feature rows.
    y_train.iloc[:n_rows],
    y_test.iloc[:n_rows],
    # The search space is a required argument; use the same C grid as the
    # other LinearSVC runs in this notebook.
    {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
)

# Using cleaned images and stereo parameters

In [None]:
# Tune each model family on the cleaned images plus the stereo columns.
for model, params in [
    (LogisticRegression, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}),
    (RandomForestClassifier, {
        "max_depth": [3, 5, 10, 20, 40, 75],
        "n_estimators": [10, 20, 50, 100, 200]
    }),
    (LinearSVC, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]})
]:
    try:
        optimise_model(
            model,
            # The log label names every feature group passed below.
            "m1, m2, stereo",
            combine(X_train_image_m1, X_train_image_m2, X_train_stereo),
            combine(X_test_image_m1, X_test_image_m2, X_test_stereo),
            y_train,
            y_test,
            params
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"Could not optimize {model.__name__} model\n{e}")

# Using cleaned images, hillas and stereo parameters

In [None]:
# Tune each model family on the cleaned images plus both the stereo and the
# Hillas columns.
for model, params in [
    (LogisticRegression, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}),
    (RandomForestClassifier, {
        "max_depth": [3, 5, 10, 20, 40, 75],
        "n_estimators": [10, 20, 50, 100, 200]
    }),
    (LinearSVC, {"C": [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]})
]:
    try:
        optimise_model(
            model,
            # The log label names every feature group passed below.
            "m1, m2, stereo, hillas",
            combine(X_train_image_m1, X_train_image_m2, X_train_stereo, X_train_hillas),
            combine(X_test_image_m1, X_test_image_m2, X_test_stereo, X_test_hillas),
            y_train,
            y_test,
            params
        )
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"Could not optimize {model.__name__} model\n{e}")

Optimizing model LogisticRegression with data m1, m2


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training accuracy: 0.7994900638690768
Validation accuracy: 0.7957206024361733
Best params are: {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 420, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Optimizing model RandomForestClassifier with data m1, m2
Training accuracy: 0.9967430159079743
Validation accuracy: 0.8892201873718807
Best params are: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 75, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 420, 'verbose': 0, 'warm_start': False}

Optimizing model LinearSVC with data m1, m2


