In [9]:
#AML project 3, by Luke Gegick, Dylan Miller, Jackson Dockerty

# The below code is to help parse the data to see the trend the data follows, below this code is the different models
# used to predict on this media

import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import entropy
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# read in the .csv file
eclipse_df = pd.read_csv("Eclipse_Pred\Eclipse_Train.csv")

# get the numbers of the types of eclipses
total_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 0]
Tm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 1]
Ts_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 2]
T_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 3]
T_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 4]
Tn_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 5]
annular_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 6]
As_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 7]
Am_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 8]
A_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 9]
A_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 10]
An_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 11]
partial_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 12]
Pb_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 13]
Pe_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 14]
hybrid_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 15]
Hm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 16]
H2_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 17]
H3_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 18]

# normalize the totals for each type to reduce overfitting or other logic errors
min_count = min(len(total_eclipse), len(annular_eclipse), len(hybrid_eclipse), len(partial_eclipse), 
                len(Tm_eclipse), len(Ts_eclipse), len(T_plus_eclipse), len(T_minus_eclipse), 
                len(Tn_eclipse), len(As_eclipse), len(Am_eclipse), len(A_plus_eclipse), 
                len(A_minus_eclipse), len(An_eclipse), len(Pb_eclipse), len(Pe_eclipse), 
                len(Hm_eclipse), len(H2_eclipse), len(H3_eclipse))

print(min_count)


6


In [26]:
#first version, using Linear Regression

import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import entropy
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# read in the .csv file
eclipse_df = pd.read_csv("Eclipse_Pred\Eclipse_Train.csv")

# get the numbers of the types of eclipses
total_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 0]
Tm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 1]
Ts_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 2]
T_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 3]
T_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 4]
Tn_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 5]
annular_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 6]
As_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 7]
Am_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 8]
A_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 9]
A_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 10]
An_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 11]
partial_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 12]
Pb_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 13]
Pe_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 14]
hybrid_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 15]
Hm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 16]
H2_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 17]
H3_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 18]

# normalize the totals for each type to reduce overfitting or other logic errors
min_count = min(len(total_eclipse), len(annular_eclipse), len(hybrid_eclipse), len(partial_eclipse), 
                len(Tm_eclipse), len(Ts_eclipse), len(T_plus_eclipse), len(T_minus_eclipse), 
                len(Tn_eclipse), len(As_eclipse), len(Am_eclipse), len(A_plus_eclipse), 
                len(A_minus_eclipse), len(An_eclipse), len(Pb_eclipse), len(Pe_eclipse), 
                len(Hm_eclipse), len(H2_eclipse), len(H3_eclipse))

if min_count < 20:
    min_count = 20
sample_total_eclipse = total_eclipse.sample(n = min_count, replace = True)
sample_Tm_eclipse = Tm_eclipse.sample(n = min_count, replace = True)
sample_Ts_eclipse = Ts_eclipse.sample(n = min_count, replace = True)
sample_T_plus_eclipse = T_plus_eclipse.sample(n = min_count, replace = True)
sample_T_minus_eclipse = T_minus_eclipse.sample(n = min_count, replace = True)
sample_Tn_eclipse = Tn_eclipse.sample(n = min_count, replace = True)
sample_annular_eclipse = annular_eclipse.sample(n = min_count, replace = True)
sample_As_eclipse = As_eclipse.sample(n = min_count, replace = True)
sample_Am_eclipse = Am_eclipse.sample(n = min_count, replace = True)
sample_A_plus_eclipse = A_plus_eclipse.sample(n = min_count, replace = True)
sample_A_minus_eclipse = A_minus_eclipse.sample(n = min_count, replace = True)
sample_An_eclipse = An_eclipse.sample(n = min_count, replace = True)
sample_partial_eclipse = partial_eclipse.sample(n = min_count, replace = True)
sample_Pb_eclipse = Pb_eclipse.sample(n = min_count, replace = True)
sample_Pe_eclipse = Pe_eclipse.sample(n = min_count, replace = True)
sample_hybrid_eclipse = hybrid_eclipse.sample(n = min_count, replace = True)
sample_Hm_eclipse = Hm_eclipse.sample(n = min_count, replace = True)
sample_H2_eclipse = H2_eclipse.sample(n = min_count, replace = True)
sample_H3_eclipse = H3_eclipse.sample(n = min_count, replace = True)

# reread the new balanced data
eclipse_df = pd.concat([sample_total_eclipse, sample_Tm_eclipse, sample_Ts_eclipse,
                        sample_T_plus_eclipse, sample_T_minus_eclipse, sample_Tn_eclipse,
                        sample_annular_eclipse, sample_As_eclipse, sample_Am_eclipse,
                        sample_A_plus_eclipse, sample_A_minus_eclipse, sample_An_eclipse,
                        sample_partial_eclipse, sample_Pb_eclipse, sample_Pe_eclipse, 
                        sample_hybrid_eclipse, sample_Hm_eclipse, sample_H2_eclipse,
                        sample_H3_eclipse], ignore_index = True)

# Drop columns with missing data in "Central Duration Seconds"
eclipse_df.dropna(subset=['Central Duration Seconds'], inplace=True)

# Replace non-numeric values in the DataFrame with NaN
eclipse_df.replace('-', np.nan, inplace=True)

# drop all features that have no or negative impact on results
eclipse_df = eclipse_df.drop(["Catalog Number", "Calendar Date", "Eclipse Time", "Latitude",
                              "Longitude", "Central Duration", "Date Time", "Visibility",
                              "Geographical Hemisphere", "Daytime/Nighttime", "Sun Constellation",
                              "Eclipse Classification", "Duration in Seconds", "Year Modulus",
                              "Decade", "ESC Moving Average", "ESC Wide-Scale Moving Average",
                              "Cluster", "Cluster 6"], axis=1)


# set the X (features) and Y (target variables)
X = eclipse_df[["Delta T (s)", "Lunation Number", "Saros Number", "Gamma",
               "Eclipse Magnitude", "Sun Altitude", "Sun Azimuth", "Path Width (km)",
               "Year", "Month", "Day", "Eclipse Latitude", "Eclipse Longitude",
               "obliquity", "Inter-Eclipse Duration", "Visibility Score",
               "Moon Distance (km)", "Sun Distance (km)", "Moon Angular Diameter (degrees)",
               "Sun Angular Diameter (degrees)", "Central Duration Seconds",
               "Normalized Duration", "Normalized Path Width", "EII", "HEAS",
               "Localized ESC", "Eclipse Interval"]]

y = eclipse_df['Eclipse Type']

# split the data between training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

# set up the pipeline for use by the param_grid
clf2 = Pipeline(steps=[('imputer', SimpleImputer(strategy="mean", missing_values = np.nan, fill_value = None)),
                       ('scaler', StandardScaler()),
                       ('lr', LogisticRegression())])

# Define the parameter grid
param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'lr__penalty': ['l1', 'l2'],  # Regularization penalty
    'lr__solver': ['liblinear', 'saga'],  # Solver for optimization problem
    'lr__max_iter': [10000]  # Maximum number of iterations
}

# Initialize GridSearchCV, cv is the number of folds, verbose tells it not to state what its implementing
#and the n_jobs allows the program to utilize all of the cores of the cpu for faster times
grid = GridSearchCV(clf2, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
grid.fit(X_train, y_train)

# Get the best parameters for logistic regression for the given data
best_params = grid.best_params_

# set the new parameters to the pipeline
clf2.set_params(**best_params)

# fit the pipeline to the data
clf2.fit(X_train, y_train)

# Get the predicted labels and set up confustion matrix
y_pred = clf2.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# output all relevant accuracies
print("Train accuracy:", clf2.score(X_train, y_train))
print("Test accuracy:", clf2.score(X_test, y_test)) 
print(cm)

Train accuracy: 0.9635416666666666
Test accuracy: 0.75
[[3 0 0 0 0 0 0 0 1 0 0 0]
 [0 3 0 0 0 0 0 0 1 0 0 0]
 [0 0 4 0 0 0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 2 0 0 0 0 0]
 [0 0 0 0 0 4 0 0 0 0 0 0]
 [0 0 0 0 0 0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 2 1]
 [0 0 0 0 0 0 0 0 0 4 0 0]
 [0 0 0 0 0 0 0 0 0 1 2 1]
 [0 0 0 0 0 0 0 0 0 1 1 2]]


In [35]:
# Model 2 - Knn Learning Model

import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import entropy
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# read in the .csv file
eclipse_df = pd.read_csv("Eclipse_Pred\Eclipse_Train.csv")

# get the numbers of the types of eclipses
total_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 0]
Tm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 1]
Ts_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 2]
T_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 3]
T_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 4]
Tn_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 5]
annular_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 6]
As_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 7]
Am_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 8]
A_plus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 9]
A_minus_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 10]
An_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 11]
partial_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 12]
Pb_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 13]
Pe_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 14]
hybrid_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 15]
Hm_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 16]
H2_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 17]
H3_eclipse = eclipse_df[eclipse_df['Eclipse Type'] == 18]

# normalize the totals for each type to reduce overfitting or other logic errors
min_count = min(len(total_eclipse), len(annular_eclipse), len(hybrid_eclipse), len(partial_eclipse), 
                len(Tm_eclipse), len(Ts_eclipse), len(T_plus_eclipse), len(T_minus_eclipse), 
                len(Tn_eclipse), len(As_eclipse), len(Am_eclipse), len(A_plus_eclipse), 
                len(A_minus_eclipse), len(An_eclipse), len(Pb_eclipse), len(Pe_eclipse), 
                len(Hm_eclipse), len(H2_eclipse), len(H3_eclipse))

if min_count < 20:
    min_count = 20
sample_total_eclipse = total_eclipse.sample(n = min_count, replace = True)
sample_Tm_eclipse = Tm_eclipse.sample(n = min_count, replace = True)
sample_Ts_eclipse = Ts_eclipse.sample(n = min_count, replace = True)
sample_T_plus_eclipse = T_plus_eclipse.sample(n = min_count, replace = True)
sample_T_minus_eclipse = T_minus_eclipse.sample(n = min_count, replace = True)
sample_Tn_eclipse = Tn_eclipse.sample(n = min_count, replace = True)
sample_annular_eclipse = annular_eclipse.sample(n = min_count, replace = True)
sample_As_eclipse = As_eclipse.sample(n = min_count, replace = True)
sample_Am_eclipse = Am_eclipse.sample(n = min_count, replace = True)
sample_A_plus_eclipse = A_plus_eclipse.sample(n = min_count, replace = True)
sample_A_minus_eclipse = A_minus_eclipse.sample(n = min_count, replace = True)
sample_An_eclipse = An_eclipse.sample(n = min_count, replace = True)
sample_partial_eclipse = partial_eclipse.sample(n = min_count, replace = True)
sample_Pb_eclipse = Pb_eclipse.sample(n = min_count, replace = True)
sample_Pe_eclipse = Pe_eclipse.sample(n = min_count, replace = True)
sample_hybrid_eclipse = hybrid_eclipse.sample(n = min_count, replace = True)
sample_Hm_eclipse = Hm_eclipse.sample(n = min_count, replace = True)
sample_H2_eclipse = H2_eclipse.sample(n = min_count, replace = True)
sample_H3_eclipse = H3_eclipse.sample(n = min_count, replace = True)

# reread the new balanced data
eclipse_df = pd.concat([sample_total_eclipse, sample_Tm_eclipse, sample_Ts_eclipse,
                        sample_T_plus_eclipse, sample_T_minus_eclipse, sample_Tn_eclipse,
                        sample_annular_eclipse, sample_As_eclipse, sample_Am_eclipse,
                        sample_A_plus_eclipse, sample_A_minus_eclipse, sample_An_eclipse,
                        sample_partial_eclipse, sample_Pb_eclipse, sample_Pe_eclipse, 
                        sample_hybrid_eclipse, sample_Hm_eclipse, sample_H2_eclipse,
                        sample_H3_eclipse], ignore_index = True)

# Drop columns with missing data in "Central Duration Seconds"
eclipse_df.dropna(subset=['Central Duration Seconds'], inplace=True)

# Replace non-numeric values in the DataFrame with NaN
eclipse_df.replace('-', np.nan, inplace=True)

# drop all features that have no or negative impact on results
eclipse_df = eclipse_df.drop(["Catalog Number", "Calendar Date", "Eclipse Time", "Latitude",
                              "Longitude", "Central Duration", "Date Time", "Visibility",
                              "Geographical Hemisphere", "Daytime/Nighttime", "Sun Constellation",
                              "Eclipse Classification", "Duration in Seconds", "Year Modulus",
                              "Decade", "ESC Moving Average", "ESC Wide-Scale Moving Average",
                              "Cluster", "Cluster 6"], axis=1)


# set the X (features) and Y (target variables)
X = eclipse_df[["Delta T (s)", "Lunation Number", "Saros Number", "Gamma",
               "Eclipse Magnitude", "Sun Altitude", "Sun Azimuth", "Path Width (km)",
               "Year", "Month", "Day", "Eclipse Latitude", "Eclipse Longitude",
               "obliquity", "Inter-Eclipse Duration", "Visibility Score",
               "Moon Distance (km)", "Sun Distance (km)", "Moon Angular Diameter (degrees)",
               "Sun Angular Diameter (degrees)", "Central Duration Seconds",
               "Normalized Duration", "Normalized Path Width", "EII", "HEAS",
               "Localized ESC", "Eclipse Interval"]]

y = eclipse_df['Eclipse Type']

# split the data between training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

# set up the pipeline for use by the param_grid
clf2 = Pipeline(steps=[('imputer', SimpleImputer(strategy="mean", missing_values = np.nan, fill_value = None)),
                       ('scaler', StandardScaler()),
                       ('knn', KNeighborsClassifier())])
# Define parameters for grid search
param_grid = {
    'knn__n_neighbors': range(3, 20),  # Test K values from 10 to 17
    'knn__metric': ['manhattan', 'chebyshev', 'minkowski'],
    'knn__weights': ['distance', 'uniform']
}

# Perform grid search, note cv is the number of folds it tests over
grid = GridSearchCV(clf2, param_grid, cv = 5, scoring='accuracy', n_jobs = -1)
grid.fit(X_train, y_train)

# Get the best parameters for KNN from grid search
best_params = grid.best_params_
best_k = best_params['knn__n_neighbors']

# Update the pipeline with the best parameters
clf2.set_params(knn__n_neighbors = best_k)

# Fit the pipeline with updated parameters
clf2.fit(X_train, y_train)

# Get the predicted labels and set up confustion matrix
y_pred = clf2.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# output all relevant accuracies
print("Train accuracy:", clf2.score(X_train, y_train))
print("Test accuracy:", clf2.score(X_test, y_test)) 
print(cm)


Train accuracy: 0.7604166666666666
Test accuracy: 0.6458333333333334
[[1 2 0 1 0 0 0 0 0 0 0 0]
 [2 2 0 0 0 0 0 0 0 0 0 0]
 [1 0 3 0 0 0 0 0 0 0 0 0]
 [0 0 0 3 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 1 0 1 0 0 1 0]
 [0 0 0 0 0 4 0 0 0 0 0 0]
 [0 0 0 0 1 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 0 0 2 0 1 1]
 [0 0 0 0 0 0 0 0 0 4 0 0]
 [0 0 0 0 0 0 1 0 0 0 2 1]
 [0 0 0 0 1 0 1 0 0 0 0 2]]


In [None]:
# model 3 - Linear Regression

In [None]:
# model 4 - K means (unsupervised)