In [15]:
import pandas as pd
import numpy as np
import glob
import os
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics 
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report

In [16]:
import imblearn
from imblearn.over_sampling import SMOTE 

In [17]:
start_time = time.time() 

In [18]:
asteroid_df = pd.read_csv("data/asteroid_normalized_small_df.csv")
asteroid_df.drop(columns = ['diameter', 'albedo', 'a', 'i', 'neo'], inplace = True)
asteroid_df

Unnamed: 0,pha,H,e,q,n,moid,spkid,full_name
0,0,-7.411043,-0.608738,0.289176,-0.009788,0.306452,20000001,1 Ceres (A801 AA)
1,0,-6.932515,0.859223,-0.382876,-0.017945,-0.258065,20000002,2 Pallas (A802 FA)
2,0,-6.282209,1.108738,-0.618740,0.181077,-0.564516,20000003,3 Juno (A804 RA)
3,0,-7.466258,-0.503883,-0.357027,0.928222,-0.403226,20000004,4 Vesta (A807 FA)
4,0,-5.171779,0.440777,-0.444265,0.384992,-0.451613,20000005,5 Astraea (A845 XA)
...,...,...,...,...,...,...,...,...
137912,0,-3.141104,8.195146,6.101777,-3.502333,6.048387,3547316,(2010 BK118)
137913,0,-0.319018,7.751456,2.129241,-3.470979,2.258065,3516402,(2010 GW64)
137914,0,-1.306748,8.038835,4.859451,-3.497338,5.290323,3517549,(2010 GW147)
137915,0,-0.503067,7.960194,0.447496,-3.476215,0.903226,3523334,(2010 JH124)


## Split data

In [19]:
X = pd.DataFrame(asteroid_df['moid'])
y = asteroid_df.iloc[:, 0]

In [20]:
from sklearn.model_selection import train_test_split
X_train_original, X_test, y_train_original, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [21]:
sm = SMOTE(random_state = 42) 
X_train, y_train = sm.fit_resample(X_train_original, y_train_original.ravel())
  
print("After OverSampling, counts of label 'N': {}".format(sum(y_train == 0))) 
print("After OverSampling, counts of label 'Y': {}".format(sum(y_train == 1))) 

After OverSampling, counts of label 'N': 110091
After OverSampling, counts of label 'Y': 110091


  X_train, y_train = sm.fit_resample(X_train_original, y_train_original.ravel())


## Grid Search

In [22]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

start_time_grid = time.time() 

tuned_parameters = [{'kernel': ['linear'], 'C': [1, 10, 25]},
                    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 25]}]

score = 'accuracy'

print("# Tuning hyper-parameters for %s" % score)
print()

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                   scoring=score, n_jobs = -2)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
results = clf.cv_results_
for i in range(len(results["params"])):
    print("%0.3f (+/-%0.03f) for %r" % (results["mean_test_score"][i], results["std_test_score"][i] * 2, results["params"][i]))

end_time_grid = time.time()
execution_time = end_time_grid - start_time_grid
print(f"Execution time: {execution_time:.2f} seconds")

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 25, 'kernel': 'linear'}

Grid scores on development set:

0.999 (+/-0.000) for {'C': 1, 'kernel': 'linear'}
0.999 (+/-0.000) for {'C': 10, 'kernel': 'linear'}
0.999 (+/-0.000) for {'C': 25, 'kernel': 'linear'}
0.997 (+/-0.000) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.997 (+/-0.000) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.997 (+/-0.000) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.997 (+/-0.000) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.998 (+/-0.000) for {'C': 25, 'gamma': 0.001, 'kernel': 'rbf'}
0.997 (+/-0.000) for {'C': 25, 'gamma': 0.0001, 'kernel': 'rbf'}
Execution time: 804.05 seconds


In [23]:
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'loss': ['hinge', 'log_loss', 'modified_huber'], 
                     'penalty': ['l2', 'elasticnet'],
                     'l1_ratio': [0, 0.2, 0.5, 1],
                     'max_iter': [100, 300]}]

score = 'accuracy'

print("# Tuning hyper-parameters for %s" % score)
print()

clf_SGD = GridSearchCV(SGDClassifier(), tuned_parameters, cv=5,
                   scoring=score, n_jobs = -2)
clf_SGD.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf_SGD.best_params_)
print()
print("Grid scores on development set:")
print()
results = clf_SGD.cv_results_
for i in range(len(results["params"])):
    print("%0.3f (+/-%0.03f) for %r" % (results["mean_test_score"][i], results["std_test_score"][i] * 2, results["params"][i]))

end_time_grid = time.time()
execution_time = end_time_grid - start_time_grid
print(f"Execution time: {execution_time:.2f} seconds")

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'l1_ratio': 1, 'loss': 'modified_huber', 'max_iter': 300, 'penalty': 'elasticnet'}

Grid scores on development set:

0.998 (+/-0.000) for {'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'l2'}
0.998 (+/-0.000) for {'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'elasticnet'}
0.998 (+/-0.000) for {'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 300, 'penalty': 'l2'}
0.998 (+/-0.000) for {'l1_ratio': 0, 'loss': 'hinge', 'max_iter': 300, 'penalty': 'elasticnet'}
0.997 (+/-0.000) for {'l1_ratio': 0, 'loss': 'log_loss', 'max_iter': 100, 'penalty': 'l2'}
0.997 (+/-0.000) for {'l1_ratio': 0, 'loss': 'log_loss', 'max_iter': 100, 'penalty': 'elasticnet'}
0.997 (+/-0.000) for {'l1_ratio': 0, 'loss': 'log_loss', 'max_iter': 300, 'penalty': 'l2'}
0.997 (+/-0.000) for {'l1_ratio': 0, 'loss': 'log_loss', 'max_iter': 300, 'penalty': 'elasticnet'}
0.999 (+/-0.000) for {'l1_ratio': 0, 'loss': 'm

In [24]:
print("Best Parameters:", clf_SGD.best_params_)
print("Best Cross-Validation Score:", clf_SGD.best_score_)

Best Parameters: {'l1_ratio': 1, 'loss': 'modified_huber', 'max_iter': 300, 'penalty': 'elasticnet'}
Best Cross-Validation Score: 0.9993232870368901


In [25]:
print("Best Parameters:", clf.best_params_)
print("Best Cross-Validation Score:", clf.best_score_)

Best Parameters: {'C': 25, 'kernel': 'linear'}
Best Cross-Validation Score: 0.9991098270770346


In [26]:
clf_SGD.predict(X_test)[0:150]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [27]:
y_true, y_pred = y_test, clf_SGD.predict(X_test)
print(classification_report(y_true, y_pred))
np.mean(y_true == y_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     27511
           1       0.68      1.00      0.81        73

    accuracy                           1.00     27584
   macro avg       0.84      1.00      0.91     27584
weighted avg       1.00      1.00      1.00     27584



0.9987674013921114

### Model validation via cross-validation

In [28]:
from sklearn.model_selection import train_test_split
# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

sm = SMOTE(random_state = 42) 
X1, y1 = sm.fit_resample(X1, y1.ravel())
X2, y2 = sm.fit_resample(X2, y2.ravel())

# fit the model on one set of data
clf_SGD.fit(X1, y1)

# evaluate the model on the second set of data
y2_model = clf_SGD.predict(X2)
accuracy_score(y2, y2_model)

  X1, y1 = sm.fit_resample(X1, y1.ravel())
  X2, y2 = sm.fit_resample(X2, y2.ravel())


0.9993096632610055

In [29]:
y2_model = clf_SGD.fit(X1, y1).predict(X2)
y1_model = clf_SGD.fit(X2, y2).predict(X1)
accuracy_score(y1, y1_model), accuracy_score(y2, y2_model)

(0.9994258303655789, 0.9992442629383639)

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf_SGD, X, y, cv=5)

array([0.99891241, 0.99880365, 0.99862234, 0.99905739, 0.97092412])

In [31]:
from imblearn.pipeline import Pipeline  # Use imblearn's Pipeline
from imblearn.over_sampling import SMOTE

pipeline = Pipeline([
    ('smote', SMOTE()),                 
    ('svc', clf_SGD)     
])

scores = cross_val_score(pipeline, X, y, cv=5)
print("Cross-validation scores with SMOTE:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores with SMOTE: [1.         1.         1.         1.         0.97541964]
Mean accuracy: 0.9950839285066889


In [32]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=skf)

res = [clf_SGD.fit(X.iloc[train], y.iloc[train]).score(X.iloc[test], y.iloc[test]) for train, test in skf.split(X, y)]
#res is a score vector within 0,1
np.mean(res) #The average accuracy
print("Cross-validation scores with SMOTE:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores with SMOTE: [0.99858614 0.99880365 0.99844107 0.99883987 0.99840481]
Mean accuracy: 0.9986151079303107


## Cross Validation without KFold

In [33]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)  
scores = cross_val_score(pipeline, X, y, cv=kf)

print("Cross-validation scores with SMOTE:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores with SMOTE: [0.99851363 0.99836862 0.99815104 0.99851358 0.99869485]
Mean accuracy: 0.9984483422234668


## Validation Curve (for a Single Hyperparameter)

In [None]:
from sklearn.model_selection import validation_curve

para = {'C': [0.001, 0.01, 0.1, 1, 10, 25]} 
param_range = para['C']

train_scores, val_scores = validation_curve(
    SVC(),
    X_train, y_train,
    param_name="C",
    param_range=param_range,
    cv=5,
    scoring="accuracy"
)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(param_range, np.mean(train_scores, axis=1), label="Training score", color="blue")
plt.plot(param_range, np.mean(val_scores, axis=1), label="Validation score", color="red")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.title("Validation Curve for SVC")
plt.xscale("log")  # Log scale for C
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import validation_curve

para = {'C': [0.001, 0.01, 0.1, 1, 10, 25]} 
param_range = para['C']

train_scores, val_scores = validation_curve(
    SGDClassifier(),
    X_train, y_train,
    param_name="C",
    param_range=param_range,
    cv=5,
    scoring="accuracy"
)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(param_range, np.mean(train_scores, axis=1), label="Training score", color="blue")
plt.plot(param_range, np.mean(val_scores, axis=1), label="Validation score", color="red")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.title("Validation Curve for SGDClassifier")
plt.xscale("log")  # Log scale for C
plt.legend()
plt.show()

## Learning Curves (Training vs. Cross-Validation Performance)

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    clf_SGD, X_train, y_train, cv=5, scoring="accuracy", train_sizes=np.linspace(0.1, 1.0, 5)
)

# Mean and standard deviation for plotting
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, label="Training score", color="blue")
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="blue", alpha=0.2)
plt.plot(train_sizes, val_mean, label="Cross-validation score", color="red")
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, color="red", alpha=0.2)
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.title("Learning Curve for SVC")
plt.legend()
plt.show()

In [None]:
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")