<a href="https://colab.research.google.com/github/greek-proverb-atlas/proverbs.gr/blob/main/tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os
import ast

from sklearn.base import TransformerMixin
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [2]:
corpus_path = "data/balanced_corpus.csv"
root = 'https://raw.githubusercontent.com/ipavlopoulos/paremia/main/'
if not os.path.exists(corpus_path):
  corpus_path = root + corpus_path
balanced_corpus = pd.read_csv(corpus_path, index_col=0)

In [3]:
geolocs = pd.read_csv(f'{root}data/geolocs.csv')
balanced_corpus.lat = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[0])
balanced_corpus.lon = balanced_corpus.area.apply(lambda x: geolocs[x].iloc[1])

In [4]:
train, test = train_test_split(balanced_corpus, test_size=0.05, random_state=2023)
train, dev = train_test_split(train, test_size=test.shape[0], random_state=2023)

# Text regression

#### Tuning ElasticNet

In [21]:
# tuning extratrees
elastic = Pipeline([('vect', TfidfVectorizer()), ('clf', ElasticNet())])
grid_param = [
                {
                    "vect": [TfidfVectorizer()],
                    "vect__analyzer": ["word", "char"],
                    "vect__lowercase": [True],
                    "vect__max_df": [0.5],
                    "vect__min_df": [2, 10],
                    "vect__ngram_range": [(1,1), (1,2), (2,5)],
                    "clf": [ElasticNet()],
                 },
              ]

gsc = GridSearchCV(estimator=elastic, param_grid=grid_param, scoring='r2', cv=3)
tuning_result = gsc.fit(dev.text.values, dev[["lat", "lon"]].values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: -0.000113 using {'clf': ElasticNet(), 'vect': TfidfVectorizer(), 'vect__analyzer': 'word', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (1, 1)}


3 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/usr/l

#### Tuning the ExtraTrees

In [None]:
# tuning extratrees
xtra = Pipeline([('vect', TfidfVectorizer()), ('clf', ExtraTreesRegressor())])
grid_param = [
                {
                    "vect": [TfidfVectorizer()],
                    "vect__analyzer": ["word", "char"],
                    "vect__lowercase": [True],
                    "vect__max_df": [0.5],
                    "vect__min_df": [2, 10],
                    "vect__ngram_range": [(1,1), (1,2), (2,5)],
                    "clf": [ExtraTreesRegressor()],
                    "clf__n_estimators": [10, 100], # this is redundant
                 },
              ]

gsc = GridSearchCV(estimator=xtra, param_grid=grid_param, scoring='r2', cv=3)
tuning_result = gsc.fit(dev.text.values, dev[["lat", "lon"]].values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: -0.058715 using {'clf': ExtraTreesRegressor(), 'clf__n_estimators': 10, 'vect': TfidfVectorizer(), 'vect__analyzer': 'word', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (2, 5)}


In [None]:
# tuning the best result on all the texts
xtra_tuned = Pipeline([('vect', tuning_result.best_params_["vect"]), ('clf', tuning_result.best_params_["clf"])])
xtra_tuned.fit(train.text.values, train[["lat", "lon"]].values)
predicted_coords = xtra_tuned.predict(test.text.values)
print(f"MAE of lat and lon: {mean_absolute_error(test.lat.values, predicted_coords[:, 0]):.2f} & {mean_absolute_error(test.lon.values, predicted_coords[:, 1]):.2f}")
print(f"MSE of lat and lon: {mean_squared_error(test.lat.values, predicted_coords[:, 0]):.2f} & {mean_squared_error(test.lon.values, predicted_coords[:, 1]):.2f}")

MAE of lat and lon: 1.31 & 1.82
MSE of lat and lon: 2.97 & 6.03


#### Tuning the Forest

In [None]:
# tuning extratrees
forest = Pipeline([('vect', TfidfVectorizer()), ('clf', RandomForestRegressor())])
grid_param = [
                {
                    "vect": [TfidfVectorizer()],
                    "vect__analyzer": ["word", "char"],
                    "vect__lowercase": [True],
                    "vect__max_df": [0.5],
                    "vect__min_df": [2, 10],
                    "vect__ngram_range": [(1,1), (1,2), (2,5)],
                    "clf": [RandomForestRegressor()],
                    "clf__n_estimators": [10, 100],
                 },
              ]

gsc = GridSearchCV(estimator=forest, param_grid=grid_param, scoring='r2', cv=3)
tuning_result = gsc.fit(dev.text.values, dev[["lat", "lon"]].values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.035717 using {'clf': RandomForestRegressor(), 'clf__n_estimators': 100, 'vect': TfidfVectorizer(), 'vect__analyzer': 'char', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (2, 5)}


In [None]:
# tuning the best result on all the texts
forest_tuned = Pipeline([('vect', tuning_result.best_params_["vect"]), ('clf', tuning_result.best_params_["clf"])])
forest_tuned.fit(train.text.values, train[["lat", "lon"]].values)
forest_coords = forest_tuned.predict(test.text.values)
print(f"MAE of lat and lon: {mean_absolute_error(test.lat.values, forest_coords[:, 0]):.2f} & {mean_absolute_error(test.lon.values, forest_coords[:, 1]):.2f}")
print(f"MSE of lat and lon: {mean_squared_error(test.lat.values, forest_coords[:, 0]):.2f} & {mean_squared_error(test.lon.values, forest_coords[:, 1]):.2f}")

MAE of lat and lon: 1.30 & 1.76
MSE of lat and lon: 2.73 & 5.30


#### Classification assessment

In [None]:
import geopy.distance
def degeocode(geocoded_areas, predictions, target_col="category"):
    classes = []
    # find the nearest geocoded area to the prediction in 2d
    for p in predictions:
        dists = geocoded_areas.apply(lambda r: geopy.distance.geodesic( (r.lat, r.lon), (p[0], p[1])).km, axis=1)
        classes.append(geocoded_areas.iloc[dists.argmin()][target_col])
    return classes

def km_distance(geocodes, gold_locs, pred_ll, target_col="category"):
    dists = []
    for g, p in zip(gold_locs, pred_ll):
        gold_lat, gold_lon = geocodes[geocodes[target_col]==g][["lat", "lon"]].values[0]
        dists.append(geopy.distance.geodesic( (gold_lat, gold_lon), (p[0], p[1])).km)
    return pd.Series(dists)

In [None]:
''' ## WARNING: this code is broken ##
xtra_preds = xtra_tuned.predict(test.text.values)
# interpreting the results in terms of predicted distance
dist = km_distance(geocodes=train.drop_duplicates("category"), gold_locs=train.category.values, pred_ll=xtra_preds)
print(f"W/XtraTrees, {25}% of the predicted places fall within {dist.quantile(0.25):.2f}kms")
print(f"(NOTE: {len(set(test.category.unique())-set(test.category.unique()).intersection((train.category.unique())))} areas out of the {len(set(test.category.unique()))} were not in the train)")
'''



# Classification

#### SVC

In [None]:
# tuning extratrees
svc = Pipeline([('vect', TfidfVectorizer()), ('clf', LinearSVC())])
grid_param = [
                {
                    "vect": [TfidfVectorizer()],
                    "vect__analyzer": ["word", "char"],
                    "vect__lowercase": [True],
                    "vect__max_df": [0.5],
                    "vect__min_df": [2, 10],
                    "vect__ngram_range": [(1,1), (1,2), (2,5)],
                    "clf": [LinearSVC()],
                 },
              ]

gsc = GridSearchCV(estimator=svc, param_grid=grid_param, scoring='accuracy', cv=3)
tuning_result = gsc.fit(train.text.values, train.area.values)
print("Best: %f using %s" % (tuning_result.best_score_, tuning_result.best_params_))

Best: 0.283382 using {'clf': LinearSVC(), 'vect': TfidfVectorizer(), 'vect__analyzer': 'char', 'vect__lowercase': True, 'vect__max_df': 0.5, 'vect__min_df': 2, 'vect__ngram_range': (2, 5)}


#### Benchmarking with the best TFIDF representation found for SVC

In [None]:
# benchmarking text classification algorithms
def benchmark_clf(train, dev):
    for algo_name, algorithm in (('SVM', LinearSVC(random_state=2023)),
                                 ('KNN', KNeighborsClassifier()),
                                 ('Forest', RandomForestClassifier(random_state=2023)),
                                 ('LR', LogisticRegression(random_state=2023))):
        vect = TfidfVectorizer(ngram_range=(2,5), analyzer="char", max_df=0.5, min_df=2, lowercase=True)
        locator = Pipeline([('vect', vect), ('clf', algorithm)])
        locator.fit(train.text.values, train.area.values)
        preds = locator.predict(test.text.values)
        print(f'{algo_name}\n{classification_report(test.area.values, preds)}')

benchmark_clf(train, test)

SVM
                 precision    recall  f1-score   support

        Ήπειρος       0.09      0.09      0.09        23
        Αιτωλία       0.42      0.46      0.44        24
        Αμοργός       0.26      0.32      0.29        22
Ανατολική Θράκη       0.19      0.25      0.22        24
        Αρκαδία       0.11      0.10      0.10        31
          Αχαΐα       0.31      0.25      0.28        32
      Επτάνησος       0.47      0.70      0.56        23
         Εύβοια       0.06      0.05      0.05        20
      Θεσπρωτία       0.11      0.09      0.10        22
          Θράκη       0.26      0.20      0.23        25
       Ιωάννινα       0.26      0.17      0.21        29
       Κάρπαθος       0.42      0.39      0.41        28
     Κεφαλληνία       0.25      0.22      0.24        27
          Κρήτη       0.36      0.33      0.34        30
         Κύπρος       0.70      0.96      0.81        24
         Λέσβος       0.45      0.54      0.49        24
        Λακωνία       0.10