# Proverb geolocation baselines
* Classification: predict an area uniformly at random.
* Regression: always predict the mean lat/lon of the training data.

In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Load the data

In [46]:
# Read the corpus; the first CSV column is used as the row index.
balanced_corpus = pd.read_csv(
    "data/balanced_corpus.csv",
    index_col=0,
)

## Train the baselines

* Three train/dev/test splits, each drawn with its own fixed random seed (2023, 2024, 2025) for reproducibility.

In [53]:
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.metrics import f1_score

# Greek area name -> short English label used when reporting results.
loc_name = {
    'Ρούμελη': 'Roumeli', 'Κοζάνη': 'Kozani', 'Κως': 'Kos', 'Αδριανούπολη': 'Adrian.',
    'Νάουσα': 'Naousa', 'Σέρρες': 'Serres', 'Σίφνος': 'Sifnos', 'Ήπειρος': 'Epirus',
    'Αιτωλία': 'Etolia', 'Αμοργός': 'Amorgos', 'Ανατολική Θράκη': 'East Thrace',
    'Αρκαδία': 'Arcadia', 'Αχαΐα': 'Achaia', 'Επτάνησος': 'Eptanisos', 'Εύβοια': 'Eyvoia',
    'Θεσπρωτία': 'Thesprotia', 'Θράκη': 'Thrace', 'Ιωάννινα': 'Ioannina',
    'Κάρπαθος': 'Karpathos', 'Κεφαλληνία': 'Kefalinia', 'Κρήτη': 'Crete', 'Κύπρος': 'Cyprus',
    'Λέσβος': 'Lesvos', 'Λακωνία': 'Laconia', 'Μακεδονία': 'Maced.',
    'Μικρά Ασία': 'Asia Minor', 'Νάξος': 'Naxos', 'Πόντος': 'Pontos', 'Ρόδος': 'Rodos',
    'Σκύρος': 'Skyros',
}

# Areas that are scored; this list also fixes the order of each per-class F1 vector.
regions = [
    'Πόντος', 'Κύπρος', 'Κάρπαθος', 'Θεσπρωτία', 'Αμοργός', 'Σκύρος', 'Μικρά Ασία',
    'Λέσβος', 'Μακεδονία', 'Λακωνία', 'Εύβοια', 'Επτάνησος', 'Αρκαδία', 'Νάξος', 'Κρήτη',
    'Αχαΐα', 'Θράκη', 'Ιωάννινα', 'Αιτωλία', 'Κεφαλληνία', 'Ανατολική Θράκη', 'Ρόδος',
    'Ήπειρος',
]

N_SPLITS = 3      # number of independent train/dev/test splits
BASE_SEED = 2023  # split i uses seed BASE_SEED + i

# One entry per split: a per-area F1 vector, and scalar MAE/MSE per coordinate.
f_scores = []
mae_scores_lat, mae_scores_lon = [], []
mse_scores_lat, mse_scores_lon = [], []

for i in range(N_SPLITS):
    seed = BASE_SEED + i

    # Hold out 5% for test, then carve a dev set of the same size out of train.
    # `dev` is not used by these baselines, but the split is kept so the training
    # portion stays the same as before -- presumably it mirrors the splits used
    # for the trained models (TODO confirm).
    train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=seed)
    train, dev = train_test_split(train, test_size=test.shape[0], random_state=seed)

    # Classification baseline: predict an area uniformly at random.
    # The predictions are random draws, so the classifier must be seeded as well;
    # otherwise the F1 scores change on every run despite the fixed split seeds.
    baseline_clf = DummyClassifier(strategy='uniform', random_state=seed)
    baseline_clf.fit(train.text, train.area)
    clf_pred = baseline_clf.predict(test.text)
    # average=None keeps one F1 per area (ordered as `regions`); areas with no
    # true or predicted samples score 0 instead of raising a warning.
    f_scores.append(f1_score(test.area, clf_pred, average=None, zero_division=0, labels=regions))

    # Regression baseline: always predict the training-set mean of (lat, lon).
    baseline_reg = DummyRegressor(strategy='mean')
    baseline_reg.fit(train.text, train[['lat', 'lon']])
    reg_pred = baseline_reg.predict(test.text)  # column 0 = lat, column 1 = lon
    mae_scores_lat.append(mean_absolute_error(test.lat, reg_pred[:, 0]))
    mae_scores_lon.append(mean_absolute_error(test.lon, reg_pred[:, 1]))
    mse_scores_lat.append(mean_squared_error(test.lat, reg_pred[:, 0]))
    mse_scores_lon.append(mean_squared_error(test.lon, reg_pred[:, 1]))

## Classification results

In [54]:
# One column per split, one row per area (English labels), holding the per-area F1.
# The number of splits is taken from f_scores rather than hard-coded.
results = pd.DataFrame(
    {split: np.asarray(scores) for split, scores in enumerate(f_scores)},
    index=[loc_name[r] for r in regions],
)

# Per-area mean and standard error across splits.
per_area_mean = results.mean(axis=1)
per_area_sem = results.sem(axis=1)

# NOTE(review): the reported ± is the average of the per-area standard errors,
# not the standard error of the macro-F1 across splits (results.mean(axis=0).sem());
# confirm which one is intended before quoting it.
print(f'BASELINE CLF F1: {per_area_mean.mean(axis=0):.2f}±{per_area_sem.mean(axis=0):.2f}')
results.agg(['mean', 'sem'], axis=1)

BASELINE CLF F1: 0.04±0.02


Unnamed: 0,mean,sem
Pontos,0.027222,0.01362
Cyprus,0.013889,0.013889
Karpathos,0.043282,0.003797
Thesprotia,0.014815,0.014815
Amorgos,0.052852,0.026461
Skyros,0.038207,0.020497
Asia Minor,0.025641,0.025641
Lesvos,0.081391,0.02021
Maced.,0.075499,0.020984
Laconia,0.041291,0.003226


## Regression results

In [52]:
def _mean_sem(scores):
    """Format a list of per-split scores as 'mean±standard error' with two decimals."""
    per_split = pd.Series(scores)
    return f'{per_split.mean():.2f}±{per_split.sem():.2f}'


print(f'BASELINE REG MAE (lat): {_mean_sem(mae_scores_lat)}')
print(f'BASELINE REG MAE (lon): {_mean_sem(mae_scores_lon)}')
print()
print(f'BASELINE REG MSE (lat): {_mean_sem(mse_scores_lat)}')
# This line reports the longitude MSE; it was previously mislabelled as "(lat)".
print(f'BASELINE REG MSE (lon): {_mean_sem(mse_scores_lon)}')

BASELINE REG MAE (lat): 1.40±0.02
BASELINE REG MAE (lon): 2.05±0.11

BASELINE REG MSE (lat): 3.08±0.06
BASELINE REG MSE (lon): 7.84±0.93
