In [40]:
#AML project 3, by Luke Gegick, Dylan Miller, Jackson Dockerty

#first version, using Linear Regression

import warnings
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import entropy
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# read in the .csv file
eclipse_df = pd.read_csv("Eclipse_Pred\Eclipse_Train.csv")

# get the numbers of the types of eclipses
total_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 'T']
annular_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 'A']
hybrid_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 'H']
partial_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 'P']
partial_eclipse_of_sun = eclipse_df[eclipse_df['Eclipse Type'] == 'Pb']

# normalize the totals for each type to reduce overfitting or other logic errors
min_count = min(len(total_eclipse), len(annular_eclipse), len(hybrid_eclipse), len(partial_eclipse), len(partial_eclipse_of_sun))
if min_count < 150:
    min_count = 150
sample_total_eclipse = total_eclipse.sample(n = min_count, replace = True)
sample_annular_eclipse = annular_eclipse.sample(n = min_count, replace = True)
sample_hybrid_eclipse = hybrid_eclipse.sample(n = min_count, replace = True)
sample_partial_eclipse = partial_eclipse.sample(n = min_count, replace = True)
sample_partial_eclipse_of_sun = partial_eclipse_of_sun.sample(n = min_count, replace = True)

# reread the new balanced data
eclipse_df = pd.concat([sample_total_eclipse, sample_annular_eclipse, sample_hybrid_eclipse,
                       sample_partial_eclipse, sample_partial_eclipse_of_sun], ignore_index = True)

# drop all features that have no or negative impact on results
eclipse_df = eclipse_df.drop(["Catalog Number", "Calendar Date", "Latitude", "Longitude",
                              "Date Time", "Geographical Hemisphere", "Daytime/Nighttime","Normalized Duration",
                              "Visibility Score", "Eclipse Classification", "Duration in Seconds",
                              "Moon Distance (km)", "Sun Distance (km)", "Moon Angular Diameter (degrees)",
                              "Sun Angular Diameter (degrees)", "Normalized Path Width", "Year Modulus", 
                              "HEAS", "Decade", "Localized ESC", "ESC Moving Average", "ESC Wide-Scale Moving Average", 
                              "Cluster", "Cluster 6"], axis=1)


# set the X (features) and Y (target variables)
X = eclipse_df[['Eclipse Time', 'Delta T (s)', 'Lunation Number', 'Saros Number',
               'Gamma', 'Eclipse Magnitude', 'Sun Altitude', 'Sun Azimuth', 'Path Width (km)',
               'Central Duration', 'Year', 'Month', 'Day', 'Visibility', 'Eclipse Latitude',
               'Eclipse Longitude', 'obliquity', 'Inter-Eclipse Duration', 'Central Duration Seconds',
               'EII', 'Eclipse Interval']]

y = eclipse_df['Eclipse Type']

# split the data between training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

# set up the pipeline for use by the param_grid
clf2 = Pipeline(steps = [('imputer', SimpleImputer(strategy = "mean")), ('lr', LogisticRegression())])

# Define the parameter grid
param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'lr__penalty': ['l1', 'l2'],  # Regularization penalty
    'lr__solver': ['liblinear', 'saga'],  # Solver for optimization problem
    'lr__max_iter': [10000]  # Maximum number of iterations
}

# Initialize GridSearchCV, cv is the number of folds, verbose tells it not to state what its implementing
#and the n_jobs allows the program to utilize all of the cores of the cpu for faster times
grid = GridSearchCV(clf2, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
grid.fit(X_train, y_train)

# Get the best parameters for logistic regression for the given data
best_params = grid.best_params_

#set the new parameters to the pipeline
clf.set_params(**best_params)

#fit the pipeline to the data
clf.fit(X_train, y_train)

print("accuracy:", clf.score(X_test, y_test)) 

ValueError: 
All the 72 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\impute\_base.py", line 366, in fit
    X = self._validate_input(X, in_fit=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\impute\_base.py", line 327, in _validate_input
    raise new_ve from None
ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '13:53:47'

--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\impute\_base.py", line 366, in fit
    X = self._validate_input(X, in_fit=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LRex4\anaconda3\Lib\site-packages\sklearn\impute\_base.py", line 327, in _validate_input
    raise new_ve from None
ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '23:17:35'
