In [159]:
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt, numpy as np, time
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [160]:
# Load the earthquake dataset into a DataFrame.
# Adjust data_path if the CSV lives somewhere else on your machine.
data_path = 'earthquakes.csv'
data = pd.read_csv(data_path)
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1137 entries, 0 to 1136
Data columns (total 43 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1137 non-null   object 
 1   magnitude        1137 non-null   float64
 2   type             1137 non-null   object 
 3   title            1137 non-null   object 
 4   date             1137 non-null   object 
 5   time             1137 non-null   int64  
 6   updated          1137 non-null   int64  
 7   url              1137 non-null   object 
 8   detailUrl        1137 non-null   object 
 9   felt             1137 non-null   int64  
 10  cdi              1137 non-null   int64  
 11  mmi              1137 non-null   int64  
 12  alert            764 non-null    object 
 13  status           1137 non-null   object 
 14  tsunami          1137 non-null   int64  
 15  sig              1137 non-null   int64  
 16  net              1137 non-null   object 
 17  code          

In [161]:
# Drop identifier, URL, free-text and bookkeeping columns that are not used as
# features for classifying / clustering the earthquake alert level.
# 'date' is dropped too; the numeric 'time' column (int64) is kept instead
# (presumably it encodes the same instant - confirm against the dataset docs).
data = data.drop(columns=['id', 'date', 'title', 'url', 'detailUrl', 'ids', 'sources',
                          'types', 'net', 'code', 'geometryType', 'status',
                          'postcode', 'what3words', 'locationDetails'])

# Replace every missing value (including missing 'alert' labels) with the
# sentinel 'unknown'. This is equivalent to filling 'alert' first and then each
# remaining NA column one by one.
# NOTE(review): a numeric column that contains NaN becomes an object column
# here and is therefore one-hot encoded below - confirm this is intended.
data = data.fillna('unknown')

# Every string-valued column except the target 'alert' is one-hot encoded.
string_columns = [
    column
    for column in data.select_dtypes(include=['object']).columns
    if column != 'alert'
]

# One-hot encode in a single call. Each dummy column is prefixed with the name
# of its source column (pandas' default when `columns=` is given), so values
# coming from different columns can no longer end up under the same
# 'type_<value>' name.
if string_columns:
    data = pd.get_dummies(data, columns=string_columns)

data

Unnamed: 0,magnitude,time,updated,felt,cdi,mmi,alert,tsunami,sig,nst,...,type_Wushi,type_Yakutat,type_Yamgan,type_Yucaipa,type_Yucca Flat,type_Zaybak,type_Zendeh Jan District,type_دهستان جلگه چاه هاشم,type_دهستان مورموری,type_دهستان گهره
0,4.8,1726534182289,1726583895255,1893,6,5,green,0,994,37,...,False,False,False,False,False,False,False,False,False,False
1,5.1,1726534182183,1726672002991,2042,6,5,green,0,1040,24,...,False,False,False,False,False,False,False,False,False,False
2,3.7,1726485728190,1726637414586,1580,4,4,unknown,0,591,135,...,False,False,False,False,False,False,False,False,False,False
3,3.9,1726333266539,1726584426218,5,3,4,green,0,236,38,...,False,False,False,False,False,False,False,False,False,False
4,4.1,1726333266382,1726334616179,4,3,4,green,0,260,28,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132,6.3,1696662769702,1702589215040,11,9,8,orange,0,1010,244,...,False,False,False,False,False,False,True,False,False,False
1133,5.4,1696661381156,1702589215040,0,0,7,yellow,0,650,165,...,False,False,False,False,False,False,True,False,False,False
1134,6.3,1696660863327,1702589215040,19,8,8,orange,0,1014,279,...,False,False,False,False,False,False,True,False,False,False
1135,5.7,1696324863018,1702589211040,131,8,7,green,0,605,226,...,False,False,False,False,False,False,False,False,False,False


In [162]:
# Target labels: the alert level of every earthquake.
y_data = data['alert']

# Feature matrix: every remaining column, standardised column-wise.
feature_frame = data.drop(columns=['alert'])
x_data = StandardScaler().fit_transform(feature_frame)

In [163]:
def split_data(x_data, y_data, test_size, random_state=42):
    """Split samples and labels into a training set and a held-out test set.

    Parameters
    ----------
    x_data : array-like
        Feature matrix, one row per sample.
    y_data : array-like
        Labels aligned with the rows of ``x_data``.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the shuffle is reproducible.
        The default keeps the split used by existing callers.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_data,
        y_data,
        test_size=test_size,
        random_state=random_state,
    )
    return x_train, x_test, y_train, y_test

In [164]:
def evaluate_model(start_time, end_time, y_test_reg, y_pred):
    """Print the fit duration and the regression quality on the test set.

    Parameters
    ----------
    start_time, end_time : float
        Timestamps taken right before and right after fitting; their
        difference is reported as the training time.
    y_test_reg : array-like
        Ground-truth regression targets.
    y_pred : array-like
        Predicted regression targets.
    """
    elapsed = end_time - start_time
    print(f'Training time: {elapsed}s')

    mse = mean_squared_error(y_test_reg, y_pred)
    print(f'Mean Square Error: {mse}')

    r2 = r2_score(y_test_reg, y_pred)
    print(f'R2 Score: {r2}')

### Thực hiện trên dữ liệu gốc

In [165]:
def svm_to_lasso_regression_original(x_data, y_data, test_size):
    """Regress the SVM decision score of the 'green' class with Lasso.

    A linear one-vs-rest SVM is trained on the training split. Its decision
    score for the class "green" is then used as the regression target for a
    Lasso model fitted on the same features. Only the Lasso fit is timed.
    Results are printed through ``evaluate_model``; nothing is returned.

    Parameters
    ----------
    x_data : array-like
        Feature matrix.
    y_data : array-like
        Alert labels; must contain "green" in the training split.
    test_size : float or int
        Forwarded to ``split_data``.

    Raises
    ------
    IndexError
        If "green" is not among the classes the SVM was trained on.
    """
    # Split the data
    x_train, x_test, y_train, _ = split_data(x_data, y_data, test_size)

    # Train the SVM. Probability calibration is not enabled: only
    # decision_function is used below, and calibration would just add an
    # internal cross-validation to the fit.
    svm = SVC(kernel='linear', decision_function_shape='ovr')
    svm.fit(x_train, y_train)

    # Locate the 'green' column using the classes the SVM actually saw during
    # fit; the one-vs-rest decision_function columns follow svm.classes_.
    green_class_index = np.where(svm.classes_ == "green")[0][0]

    # Regression targets: the decision score for the 'green' class
    y_train_reg = svm.decision_function(x_train)[:, green_class_index]
    y_test_reg = svm.decision_function(x_test)[:, green_class_index]

    # Train regression model (only this step is timed)
    start_time = time.perf_counter()
    lasso = Lasso(alpha=0.1, random_state=42)
    lasso.fit(x_train, y_train_reg)
    end_time = time.perf_counter()

    # Predict and evaluate
    y_pred = lasso.predict(x_test)
    evaluate_model(start_time, end_time, y_test_reg, y_pred)

In [166]:
def svm_to_linear_regression_original(x_data, y_data, test_size):
    """Regress the SVM decision score of the 'green' class with OLS.

    A linear one-vs-rest SVM is trained on the training split. Its decision
    score for the class "green" is then used as the regression target for an
    ordinary least-squares ``LinearRegression`` fitted on the same features.
    Only the regression fit is timed. Results are printed through
    ``evaluate_model``; nothing is returned.

    Parameters
    ----------
    x_data : array-like
        Feature matrix.
    y_data : array-like
        Alert labels; must contain "green" in the training split.
    test_size : float or int
        Forwarded to ``split_data``.

    Raises
    ------
    IndexError
        If "green" is not among the classes the SVM was trained on.
    """
    # Split the data
    x_train, x_test, y_train, _ = split_data(x_data, y_data, test_size)

    # Train the SVM. Probability calibration is not enabled: only
    # decision_function is used below, and calibration would just add an
    # internal cross-validation to the fit.
    svm = SVC(kernel='linear', decision_function_shape='ovr')
    svm.fit(x_train, y_train)

    # Locate the 'green' column using the classes the SVM actually saw during
    # fit; the one-vs-rest decision_function columns follow svm.classes_.
    green_class_index = np.where(svm.classes_ == "green")[0][0]

    # Regression targets: the decision score for the 'green' class
    y_train_reg = svm.decision_function(x_train)[:, green_class_index]
    y_test_reg = svm.decision_function(x_test)[:, green_class_index]

    # Train the regression model (only this step is timed)
    start_time = time.perf_counter()
    linear = LinearRegression()
    linear.fit(x_train, y_train_reg)
    end_time = time.perf_counter()

    # Predict using the regression model
    y_pred = linear.predict(x_test)

    # Evaluate the model
    evaluate_model(start_time, end_time, y_test_reg, y_pred)

### Thực hiện trên dữ liệu đã giảm còn $\dfrac{1}{3}$ số chiều 


In [167]:
def svm_to_lasso_regression_pca(x_data, y_data, test_size):
    """PCA-reduce the features, then regress the SVM 'green' score with Lasso.

    The feature matrix is projected onto roughly one third of its original
    number of dimensions with PCA. A linear one-vs-rest SVM is trained on the
    reduced training split, and its decision score for the class "green" is
    used as the regression target for a Lasso model on the same reduced
    features. Only the Lasso fit is timed. Results are printed through
    ``evaluate_model``; nothing is returned.

    Parameters
    ----------
    x_data : numpy.ndarray
        2-D feature matrix (its ``shape`` is read).
    y_data : array-like
        Alert labels; must contain "green" in the training split.
    test_size : float or int
        Forwarded to ``split_data``.

    Raises
    ------
    IndexError
        If "green" is not among the classes the SVM was trained on.
    """
    # -----------------------------------PREPROCESSING-----------------------------------
    # Keep one third of the dimensions, but never fewer than 1 and never more
    # than the number of samples (PCA cannot extract more components than that).
    n_samples, n_features = x_data.shape
    n_components = max(1, min(n_features // 3, n_samples))
    # NOTE(review): PCA is fitted on the full data set before the split, so the
    # projection has seen the test rows - consider fitting on x_train only.
    pca = PCA(n_components=n_components, random_state=42)
    x_data_pca = pca.fit_transform(x_data)

    # -----------------------------------SPLIT DATA-----------------------------------
    x_train, x_test, y_train, _ = split_data(x_data_pca, y_data, test_size)

    # -----------------------------------SVM MODEL TRAINING-----------------------------------
    # Probability calibration is not enabled: only decision_function is used
    # below, and calibration would just add an internal cross-validation.
    svm = SVC(kernel='linear', decision_function_shape='ovr')
    svm.fit(x_train, y_train)

    # -----------------------------------EXTRACT DECISION FUNCTION FOR "green" LABEL-----------------------------------
    # Locate the 'green' column using the classes the SVM actually saw during
    # fit; the one-vs-rest decision_function columns follow svm.classes_.
    green_class_index = np.where(svm.classes_ == "green")[0][0]
    decision_scores_train_green = svm.decision_function(x_train)[:, green_class_index]
    decision_scores_test_green = svm.decision_function(x_test)[:, green_class_index]

    # -----------------------------------REGRESSION MODEL-----------------------------------
    # Train regression model (Lasso); only this step is timed
    start_time = time.perf_counter()
    lasso = Lasso(alpha=0.1, random_state=42)
    lasso.fit(x_train, decision_scores_train_green)
    end_time = time.perf_counter()

    # -----------------------------------PREDICTION AND EVALUATION-----------------------------------
    y_pred = lasso.predict(x_test)
    evaluate_model(start_time, end_time, decision_scores_test_green, y_pred)

In [168]:
def svm_to_linear_regression_pca(x_data, y_data, test_size):
    """PCA-reduce the features, then regress the SVM 'green' score with OLS.

    The feature matrix is projected onto roughly one third of its original
    number of dimensions with PCA. A linear one-vs-rest SVM is trained on the
    reduced training split, and its decision score for the class "green" is
    used as the regression target for a ``LinearRegression`` model on the same
    reduced features. Only the regression fit is timed. Results are printed
    through ``evaluate_model``; nothing is returned.

    Parameters
    ----------
    x_data : numpy.ndarray
        2-D feature matrix (its ``shape`` is read).
    y_data : array-like
        Alert labels; must contain "green" in the training split.
    test_size : float or int
        Forwarded to ``split_data``.

    Raises
    ------
    IndexError
        If "green" is not among the classes the SVM was trained on.
    """
    # -----------------------------------DIMENSIONALITY REDUCTION-----------------------------------
    # Keep one third of the dimensions, but never fewer than 1 and never more
    # than the number of samples (PCA cannot extract more components than that).
    n_samples, n_features = x_data.shape
    n_components = max(1, min(n_features // 3, n_samples))
    # NOTE(review): PCA is fitted on the full data set before the split, so the
    # projection has seen the test rows - consider fitting on x_train only.
    pca = PCA(n_components=n_components, random_state=42)
    x_data_pca = pca.fit_transform(x_data)

    # -----------------------------------SPLIT DATA-----------------------------------
    x_train, x_test, y_train, _ = split_data(x_data_pca, y_data, test_size)

    # -----------------------------------SVM MODEL TRAINING-----------------------------------
    # Probability calibration is not enabled: only decision_function is used
    # below, and calibration would just add an internal cross-validation.
    svm = SVC(kernel='linear', decision_function_shape='ovr')
    svm.fit(x_train, y_train)

    # -----------------------------------EXTRACT DECISION FUNCTION FOR "green" LABEL-----------------------------------
    # Locate the 'green' column using the classes the SVM actually saw during
    # fit; the one-vs-rest decision_function columns follow svm.classes_.
    green_class_index = np.where(svm.classes_ == "green")[0][0]
    decision_scores_train_green = svm.decision_function(x_train)[:, green_class_index]
    decision_scores_test_green = svm.decision_function(x_test)[:, green_class_index]

    # -----------------------------------REGRESSION MODEL-----------------------------------
    # Train regression model (Linear Regression); only this step is timed
    start_time = time.perf_counter()
    linear = LinearRegression()
    linear.fit(x_train, decision_scores_train_green)
    end_time = time.perf_counter()

    # -----------------------------------PREDICTION AND EVALUATION-----------------------------------
    y_pred = linear.predict(x_test)
    evaluate_model(start_time, end_time, decision_scores_test_green, y_pred)

### Dữ liệu gốc

#### Lasso Regression

***Tỷ lệ train:test là 4:1***

In [169]:
# Lasso on the unreduced features, with 20% of the samples held out for testing.
svm_to_lasso_regression_original(x_data, y_data, 0.2)

Training time: 0.0318608999996286s
Mean Square Error: 0.17449948952736555
R2 Score: 0.560323146568056


***Tỷ lệ train:test là 7:3***

In [170]:
# Lasso on the unreduced features, with 30% of the samples held out for testing.
svm_to_lasso_regression_original(x_data, y_data, 0.3)

Training time: 0.027729599999474885s
Mean Square Error: 0.17077160958252133
R2 Score: 0.5456786427886942


***Tỷ lệ train:test là 6:4***

In [171]:
# Lasso on the unreduced features, with 40% of the samples held out for testing.
svm_to_lasso_regression_original(x_data, y_data, 0.4)

Training time: 0.019109500000013213s
Mean Square Error: 0.16666929676887024
R2 Score: 0.49454420194556425


#### Linear Regression

***Tỷ lệ train:test là 4:1***

In [172]:
# Linear regression on the unreduced features, with 20% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e26 and a hugely
# negative R2, i.e. the fit is numerically unstable - possibly a collinear
# one-hot design matrix; confirm before drawing conclusions.
svm_to_linear_regression_original(x_data, y_data, 0.2)

Training time: 4.448074899999483s
Mean Square Error: 7.971724276557289e+26
R2 Score: -2.0085919195735444e+27


***Tỷ lệ train:test là 7:3***

In [173]:
# Linear regression on the unreduced features, with 30% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e26 and a hugely
# negative R2, i.e. the fit is numerically unstable - possibly a collinear
# one-hot design matrix; confirm before drawing conclusions.
svm_to_linear_regression_original(x_data, y_data, 0.3)

Training time: 3.477676800000154s
Mean Square Error: 9.078026752171202e+26
R2 Score: -2.415121251670333e+27


***Tỷ lệ train:test là 6:4***

In [174]:
# Linear regression on the unreduced features, with 40% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e26 and a hugely
# negative R2, i.e. the fit is numerically unstable - possibly a collinear
# one-hot design matrix; confirm before drawing conclusions.
svm_to_linear_regression_original(x_data, y_data, 0.4)

Training time: 2.7720111999997243s
Mean Square Error: 6.373899044849628e+26
R2 Score: -1.9330040330707236e+27


### Dữ liệu đã giảm còn $\dfrac{1}{3}$ số chiều

#### Lasso Regression

***Tỷ lệ train:test là 4:1***

In [175]:
# Lasso on the PCA-reduced features, with 20% of the samples held out for testing.
svm_to_lasso_regression_pca(x_data, y_data, 0.2)

Training time: 0.012448499999663909s
Mean Square Error: 0.2022325134803679
R2 Score: 0.4904356659792637


***Tỷ lệ train:test là 7:3***

In [176]:
# Lasso on the PCA-reduced features, with 30% of the samples held out for testing.
svm_to_lasso_regression_pca(x_data, y_data, 0.3)

Training time: 0.005556799999794748s
Mean Square Error: 0.1864970190563339
R2 Score: 0.5038449005033292


***Tỷ lệ train:test là 6:4***

In [177]:
# Lasso on the PCA-reduced features, with 40% of the samples held out for testing.
svm_to_lasso_regression_pca(x_data, y_data, 0.4)

Training time: 0.0033138999997390783s
Mean Square Error: 0.1801536990310301
R2 Score: 0.4536462661549273


#### Linear Regression

***Tỷ lệ train:test là 4:1***

In [178]:
# Linear regression on the PCA-reduced features, with 20% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e27 and a hugely
# negative R2, i.e. the fit is numerically unstable even after PCA - confirm the
# cause before drawing conclusions.
svm_to_linear_regression_pca(x_data, y_data, 0.2)

Training time: 1.5471462000004976s
Mean Square Error: 6.504820218954588e+27
R2 Score: -1.6390165585901146e+28


***Tỷ lệ train:test là 7:3***

In [179]:
# Linear regression on the PCA-reduced features, with 30% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e28 and a hugely
# negative R2, i.e. the fit is numerically unstable even after PCA - confirm the
# cause before drawing conclusions.
svm_to_linear_regression_pca(x_data, y_data, 0.3)

Training time: 0.4542890000002444s
Mean Square Error: 1.2455371980673353e+28
R2 Score: -3.313616676346094e+28


***Tỷ lệ train:test là 6:4***

In [180]:
# Linear regression on the PCA-reduced features, with 40% of the samples held out.
# NOTE(review): the output recorded for this cell reports MSE ~1e28 and a hugely
# negative R2, i.e. the fit is numerically unstable even after PCA - confirm the
# cause before drawing conclusions.
svm_to_linear_regression_pca(x_data, y_data, 0.4)

Training time: 0.6776749000000564s
Mean Square Error: 1.2261587977416896e+28
R2 Score: -3.7185827492655005e+28
