<a href="https://colab.research.google.com/github/greek-proverb-atlas/proverbs.gr/blob/main/tuning_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tuning ElasticNet
* ElasticNet is the best-performing traditional ML regressor
* The experiments are repeated here after correcting the coordinates of specific locations


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
!pip install optuna
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.linear_model import ElasticNet

import ast



In [2]:
root = 'https://github.com/greek-proverb-atlas/proverbs.gr/raw/refs/heads/main'
balanced_corpus = pd.read_csv(f"{root}/data/balanced_corpus.csv", index_col=0)

# Quality-assurance patch: replace the corpus coordinates with the ones stored
# in geolocs.csv. The lookup reads that file as one column per area and takes
# the first value of the column as latitude and the second as longitude.
geolocs = pd.read_csv(f'{root}/data/geolocs.csv')
# Bracket assignment always writes a DataFrame column; attribute assignment
# (`frame.lat = ...`) would only set a Python attribute if the column were absent.
balanced_corpus["lat"] = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[0])
balanced_corpus["lon"] = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[1])

In [3]:
# Hold out 5% of the corpus as the test split, then take a development split
# with exactly as many rows out of the remainder. Both calls use the same seed.
train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
train, dev = train_test_split(train, test_size=len(test), random_state=2023)

def objective(trial):
    """Optuna objective: mean 5-fold CV score of a TF-IDF + ElasticNet pipeline.

    Samples `alpha` (log scale, 1e-4 to 1.0) and `l1_ratio` (0 to 1) from the
    trial, then cross-validates on the notebook-level `train` frame, predicting
    the `lat` and `lon` columns jointly from the `text` column.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean of the five fold scores. No `scoring` argument is passed, so
        `cross_val_score` falls back to the pipeline's own `score` method.
    """
    # suggest_float supersedes suggest_loguniform / suggest_uniform, which are
    # deprecated since Optuna 3.0; the sampled distributions are the same.
    alpha = trial.suggest_float('alpha', 1e-4, 1.0, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

    # Character n-grams of length 2 to 5, as in the other cells of the notebook.
    vect = TfidfVectorizer(ngram_range=(2,5), analyzer="char", max_df=0.5,
                           min_df=10, lowercase=True)

    # Step is named 'reg' to match the pipelines built in the later cells.
    pipe = Pipeline([('vect', vect), ('reg', regressor)])
    scores = cross_val_score(pipe,
                             train.text.values,
                             train[["lat", "lon"]].values,
                             cv=5,
                             n_jobs=-1)
    return scores.mean()


# Seed the sampler (same seed as the data splits) so that re-running the
# search proposes the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=2023))
study.optimize(objective, n_trials=100)

[I 2024-11-07 18:54:39,522] A new study created in memory with name: no-name-8076c050-91c6-4231-a680-ca2f0a7e774f
[I 2024-11-07 18:54:41,998] Trial 0 finished with value: 0.016401871924643395 and parameters: {'alpha': 0.009853828376668854, 'l1_ratio': 0.9231320185103109}. Best is trial 0 with value: 0.016401871924643395.
[I 2024-11-07 18:54:43,072] Trial 1 finished with value: -0.0003549715765302164 and parameters: {'alpha': 0.07552829761605577, 'l1_ratio': 0.4758591500921362}. Best is trial 0 with value: 0.016401871924643395.
[I 2024-11-07 18:54:50,041] Trial 2 finished with value: 0.2451450786439835 and parameters: {'alpha': 0.00047199352567656313, 'l1_ratio': 0.21533757688499378}. Best is trial 2 with value: 0.2451450786439835.
[I 2024-11-07 18:54:59,882] Trial 3 finished with value: 0.25935091629234486 and parameters: {'alpha': 0.00031996901325033125, 'l1_ratio': 0.42024308448938075}. Best is trial 3 with value: 0.25935091629234486.
[I 2024-11-07 18:55:04,857] Trial 4 finished with

In [4]:
# Report the winning trial and keep its hyper-parameters for the later cells.
best_trial = study.best_trial
best_params = best_trial.params
print('best value:', study.best_value)
print('best trial:', best_trial)
print('best params:', best_params)

best value: 0.26894176756261207
best trial: FrozenTrial(number=91, state=1, values=[0.26894176756261207], datetime_start=datetime.datetime(2024, 11, 7, 19, 12, 30, 381313), datetime_complete=datetime.datetime(2024, 11, 7, 19, 12, 47, 163438), params={'alpha': 0.00019030097880031678, 'l1_ratio': 0.3977607430562909}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'l1_ratio': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=91, value=None)
best params: {'alpha': 0.00019030097880031678, 'l1_ratio': 0.3977607430562909}


In [6]:
# Rebuild the pipeline with the tuned hyper-parameters, fit it on the training
# split and score it on the held-out test split.
vect = TfidfVectorizer(analyzer="char", ngram_range=(2,5), min_df=10, max_df=0.5, lowercase=True)
regressor = ElasticNet(**best_params)
elastic_net_pipe = Pipeline(steps=[('vect', vect), ('reg', regressor)])

elastic_net_pipe.fit(train["text"].to_numpy(), train[["lat", "lon"]].to_numpy())
score = elastic_net_pipe.score(test["text"].to_numpy(), test[["lat", "lon"]].to_numpy())
score

0.29658895769969934

In [None]:
# Column 0 of the predictions is latitude and column 1 is longitude, matching
# the ["lat", "lon"] target order the pipeline was fitted with.
predicted_coords = elastic_net_pipe.predict(test.text.values)

mae_lat = mean_absolute_error(test.lat.values, predicted_coords[:, 0])
mae_lon = mean_absolute_error(test.lon.values, predicted_coords[:, 1])
# Compute the MSE directly instead of squaring the RMSE.
mse_lat = mean_squared_error(test.lat.values, predicted_coords[:, 0])
mse_lon = mean_squared_error(test.lon.values, predicted_coords[:, 1])

print(mae_lat, mae_lon)
print(mse_lat, mse_lon)

1.2984176631176354 2.4990179161814643
2.6319187394082166 12.043434909265438


In [8]:
reg_models = {'el':[]}
scores = {m:{'mse':{'lat':[], 'lon':[]}, 'mae':{'lat':[], 'lon':[]}} for m in reg_models}
for i in range(2, 6):
    balanced_corpus = pd.read_csv(f"{root}/data/balanced_corpus_{i}.csv", index_col=0).reset_index()
    # quality assurance patch
    geolocs = pd.read_csv(f'{root}/data/geolocs.csv')
    balanced_corpus.lat = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[0])
    balanced_corpus.lon = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[1])
    # end of quality assurance
    train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
    train, dev = train_test_split(train, test_size=test.shape[0], random_state=2023)

    vect = TfidfVectorizer(ngram_range=(2,5), 
                           analyzer="char", 
                           max_df=0.5,
                           min_df=10, 
                           lowercase=True)
    # elastic
    el_reg = Pipeline([('vect', vect), ('reg', ElasticNet(**best_params))])
    el_reg.fit(train.text.values, train[["lat", "lon"]].values)
    reg_models['el'] = el_reg

In [None]:
for i in range(2,6):

  for model_name in reg_models:
          regressor = reg_models[model_name]
          preds = regressor.predict(test.text.values)
          mae_lat, mae_lon = (mean_absolute_error(test.lat.values, preds[:, 0]),
                              mean_absolute_error(test.lon.values, preds[:, 1]))
          mse_lat, mse_lon = (
            root_mean_squared_error(test.lat.values, preds[:, 0])**2,
            root_mean_squared_error(test.lon.values, preds[:, 1])**2)
          scores[model_name]['mae']['lat'].append(mae_lat)
          scores[model_name]['mae']['lon'].append(mae_lon)
          scores[model_name]['mse']['lat'].append(mse_lat)
          scores[model_name]['mse']['lon'].append(mse_lon)

In [10]:
# One row per model, one column per metric/coordinate pair; each cell is the
# mean of the values collected in `scores` for that model.
pd.DataFrame({
    f'{metric}-{axis}': pd.DataFrame({m: scores[m][metric][axis] for m in reg_models}).mean()
    for metric in ('mae', 'mse')
    for axis in ('lon', 'lat')
})

Unnamed: 0,mae-lon,mae-lat,mse-lon,mse-lat
el,2.683922,1.271655,12.987285,2.558401


# GrBERT results

In [22]:
# GrBERT metrics, one tuple per run, in the column order given by the names.
_grbert_metrics = ("MAE_lon", "MAE_lat", "MSE_lon", "MSE_lat", "R2_lat", "R2_lon")
_grbert_runs = [
    (1.61, 1.22, 4.32, 2.44, -1.03, -0.39),
    (1.67, 1.30, 4.58, 2.68, -1.79, -0.70),
    (1.80, 1.22, 5.58, 2.41, -1.19, -0.40),
]
# Transpose the runs into one list of values per metric.
data = {name: list(column) for name, column in zip(_grbert_metrics, zip(*_grbert_runs))}
# Average every metric over the runs.
pd.DataFrame(data).mean(axis=0)

Unnamed: 0,0
MAE_lon,1.693333
MAE_lat,1.246667
MSE_lon,4.826667
MSE_lat,2.51
R2_lat,-1.336667
R2_lon,-0.496667
