<a href="https://colab.research.google.com/github/greek-proverb-atlas/proverbs.gr/blob/main/geolocation_reg_ssig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Geographical regression of Greek proverbs

* Using a collection of proverbs from the [Hellenic Folklore Research Centre](http://www.kentrolaografias.gr/) of the Academy of Athens
* Employing text regression to estimate the latitude/longitude of proverbs whose location is not known.
* Studying feature importance, i.e., which terms discriminate spatially, from South to North and from West to East.
* Experimenting with multiple train/test splits for statistical significance.
---


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

import json

In [None]:
# Registry of fitted pipelines, keyed by a short model id. The placeholders are
# overwritten for every corpus, so after the loop it holds the pipelines fitted
# on the last corpus. Its key order is also the iteration order used whenever
# the scores are traversed per model.
reg_models = {'el': [], 'nn': [], 'rf': [], 'lr': [], 'xt': []}
# Per-model error lists: one value is appended per processed corpus for each
# metric ('mse', 'mae') and each target ('lat', 'lon').
scores = {
    m: {'mse': {'lat': [], 'lon': []}, 'mae': {'lat': [], 'lon': []}}
    for m in reg_models
}

root = 'https://github.com/greek-proverb-atlas/proverbs.gr/raw/refs/heads/main'
SEED = 2023                # seed used for both splits and as default model seed
TARGETS = ['lat', 'lon']   # regression targets; also the column order of preds


def load_best_params(path):
    """Return the hyper-parameter dict stored in the JSON file at `path`."""
    with open(path) as f:
        return json.load(f)


# Loop-invariant inputs: read once instead of once per corpus.
# The lookups below read latitude from row 0 and longitude from row 1 of the
# geolocs column named after the area.
geolocs = pd.read_csv(f'{root}/data/geolocs.csv')
el_reg_best_params = load_best_params('best_params_el_reg.json')
knn_reg_best_params = load_best_params('best_params_knn_reg.json')
rf_reg_best_params = load_best_params('best_params_rf_reg.json')
xt_reg_best_params = load_best_params('best_params_xt_reg.json')

for i in range(2, 6):
    print(f"Processing balanced_corpus_{i}.csv")
    balanced_corpus = pd.read_csv(f"{root}/data/balanced_corpus_{i}.csv",
                                  index_col=0).reset_index()
    # quality assurance patch: overwrite the coordinates with the ones listed
    # for the proverb's area in geolocs. Bracket assignment is used because
    # attribute assignment would not create the column if it were missing.
    balanced_corpus['lat'] = balanced_corpus['area'].apply(
        lambda x: geolocs[x].iloc[0])
    balanced_corpus['lon'] = balanced_corpus['area'].apply(
        lambda x: geolocs[x].iloc[1])
    # end of quality assurance

    # 5% test split, then a dev split of the same size carved out of train.
    train, test = train_test_split(balanced_corpus,
                                   test_size=0.05,
                                   random_state=SEED)
    train, dev = train_test_split(train,
                                  test_size=test.shape[0],
                                  random_state=SEED)

    # One vectoriser instance is shared by all pipelines. Each Pipeline.fit
    # re-fits it in place on the same training texts, so no separate
    # vect.fit() call is needed and every model sees the same vocabulary.
    vect = TfidfVectorizer(ngram_range=(1, 1),
                           analyzer="word",
                           max_df=0.5,
                           min_df=10,
                           lowercase=True)

    # (registry key, progress label, unfitted regressor), in fitting order.
    # The tree ensembles get a default seed for reproducibility; a
    # 'random_state' entry in the JSON parameters takes precedence.
    model_specs = [
        ('el', 'ElasticNet',
         ElasticNet(**el_reg_best_params)),
        ('nn', 'KNeighborsRegressor',
         KNeighborsRegressor(**knn_reg_best_params)),
        ('rf', 'RandomForestRegressor',
         RandomForestRegressor(**{'random_state': SEED,
                                  **rf_reg_best_params})),
        ('xt', 'ExtraTreesRegressor',
         ExtraTreesRegressor(**{'random_state': SEED,
                                **xt_reg_best_params})),
        ('lr', 'LinearRegression',
         LinearRegression()),
    ]
    for key, label, estimator in model_specs:
        print(label)
        pipeline = Pipeline([('vect', vect), ('reg', estimator)])
        pipeline.fit(train.text.values, train[TARGETS].values)
        reg_models[key] = pipeline

    # Keep the per-model names bound for any later cells that refer to them.
    el_reg, nn_reg, rf_reg, xt_reg, lr_reg = (
        reg_models[key] for key in ('el', 'nn', 'rf', 'xt', 'lr'))

    # Score every model on the held-out test split of this corpus.
    for model_name, regressor in reg_models.items():
        preds = regressor.predict(test.text.values)
        for col, target in enumerate(TARGETS):
            truth = test[target].values
            scores[model_name]['mae'][target].append(
                mean_absolute_error(truth, preds[:, col]))
            # Squaring the RMSE yields the MSE.
            scores[model_name]['mse'][target].append(
                root_mean_squared_error(truth, preds[:, col])**2)


Processing balanced_corpus_2.csv
ElasticNet
KNeighborsRegressor


NameError: name 'nn_reg_best_params' is not defined

In [None]:
# Mean of every error metric over the processed corpora: one row per model,
# one column per '<metric>-<target>' pair.
estimates = pd.DataFrame({
    f'{metric}-{target}': pd.DataFrame(
        {name: scores[name][metric][target] for name in reg_models}).mean()
    for metric in ('mae', 'mse')
    for target in ('lat', 'lon')
})

estimates.round(2)

Unnamed: 0,mae-lat,mae-lon,mse-lat,mse-lon
el,1.37,2.87,2.93,15.15
nn,1.48,3.25,3.37,17.99
rf,1.45,2.88,3.24,15.65
lr,1.38,2.91,2.99,15.52
xt,1.44,2.89,3.23,15.63


In [None]:
# Standard error of the mean of every error metric over the processed corpora:
# one row per model, one column per '<metric>-<target>' pair.
sems = pd.DataFrame({
    f'{metric}-{target}': pd.DataFrame(
        {name: scores[name][metric][target] for name in reg_models}).sem()
    for metric in ('mae', 'mse')
    for target in ('lat', 'lon')
})

sems.round(2)

Unnamed: 0,mae-lat,mae-lon,mse-lat,mse-lon
el,0.01,0.03,0.04,0.34
nn,0.0,0.09,0.02,0.77
rf,0.01,0.02,0.02,0.28
lr,0.01,0.03,0.05,0.39
xt,0.01,0.02,0.03,0.23
