# Data setup

In [None]:
import re
import numpy as np
import pandas as pd
pd.options.mode.copy_on_write = True

import matplotlib.pyplot as plt

import scipy as sp
import seaborn as sns

import sys
sys.path.append('..')
import src.data.util as du
import src.text.tokenizer as tk

d90_file = '../results/complexity_utf-16_10_10_2002_bibles_90_lcm.csv'
dall_file = '../results/complexity_utf-16_10_10_2002_bibles_lcm.csv'
family_file = '../dataset/family.csv'


def _read_without_nambikuara(path):
    """Read a CSV and drop the Nambikuára rows.

    We removed Nambikuára because it has tone annotation.
    """
    frame = pd.read_csv(path, index_col=False)
    return frame[frame.language != 'NAMBIKUÁRA']


def _bible_path(results_file):
    """Build the dataset path from the 'bibles_*' suffix of a results file name."""
    return '../dataset/' + re.match('.*(bibles_.*)', results_file).group(1)


def _load_complexity(results_file):
    """Load a results CSV, negating 'del-chars' values and inverting 'rep-words' values."""
    frame = _read_without_nambikuara(results_file)
    is_del_chars = frame.metric == 'del-chars'
    frame.loc[is_del_chars, 'value'] = - frame.loc[is_del_chars, 'value']
    is_rep_words = frame.metric == 'rep-words'
    frame.loc[is_rep_words, 'value'] = 1/frame.loc[is_rep_words, 'value']
    return frame


def _attach_family(frame, family_frame):
    """Swap the 'language' column for the family table's columns, joined on 'wals'."""
    lookup = family_frame.drop(columns=['countries', 'branch'])
    merged = frame.drop(columns=['language']).merge(lookup, on='wals', how='inner')
    return merged.rename(columns={'wals': 'code'})


# Bible texts and complexity measurements, first for D90 and then for DALL.
bible_file_d90 = _bible_path(d90_file)
bibles_d90 = _read_without_nambikuara(bible_file_d90)
df90 = _load_complexity(d90_file)

bible_file_dall = _bible_path(dall_file)
bibles_dall = _read_without_nambikuara(bible_file_dall)
dfall = _load_complexity(dall_file)

family = pd.read_csv(family_file, index_col=None)

dfall = _attach_family(dfall, family)
df90 = _attach_family(df90, family)

# Human-readable names for the raw metric identifiers.
metric_rename = {
    'del-chars': r'morphological deletion',
    'del-verses': r'pragmatic deletion',
    'del-words': r'syntactic deletion',
    'rep-words': r'morphological substitution',
    'do-nothing': r'size',
}

dfall = dfall.replace(metric_rename)
df90 = df90.replace(metric_rename)

# Mean and variance of 'value' for every (language, metric, algorithm) combination.
_group_keys = ['language', 'family', 'code', 'metric', 'algorithm']
rall = dfall.groupby(by=_group_keys, as_index=False).agg({'value': ['mean', 'var']})
r90 = df90.groupby(by=_group_keys, as_index=False).agg({'value': ['mean', 'var']})

# Helper functions

In [None]:
def compute_numtypes_numtokens(df):
    """Count tokens and types for every language in ``df``.

    ``du.by_field`` splits the frame by 'language' and ``du.df_to_str`` turns
    each part into one string; ``tk.tokens`` and ``tk.types`` do the counting.

    Returns:
        A dict with three parallel lists under the keys 'language',
        'tokens' and 'types'.
    """
    # Build every per-language string first, then tokenize them in turn.
    texts = [
        (lang, du.df_to_str(part))
        for lang, part in du.by_field(df, 'language').items()
    ]

    languages, token_counts, type_counts = [], [], []
    for lang, text in texts:
        tokens = tk.tokens(text)
        languages.append(lang)
        token_counts.append(len(tokens))
        type_counts.append(len(tk.types(tokens)))
    return dict(language=languages, tokens=token_counts, types=type_counts)


def compute_num_chars(df):
    """Count the distinct characters used by each language.

    The 'text' values of a language are joined with newlines, so a language
    with more than one row also counts the newline character.

    Args:
        df: frame with 'language' and 'text' columns.

    Returns:
        DataFrame with columns 'language' and 'chars', one row per language,
        ordered by language. Rows whose language is missing are skipped
        (default ``groupby`` behaviour).
    """
    # One grouped pass gives a deterministic, sorted row order; iterating a
    # set of names would depend on the interpreter's hash seed.
    per_language = {
        lang: len(set('\n'.join(texts)))
        for lang, texts in df.groupby('language', sort=True)['text']
    }
    return pd.DataFrame({
        'language': list(per_language),
        'chars': list(per_language.values()),
    })


def num_chars_across_bible(df, bibles):
    """Print the Pearson correlation between character-set size and H1 residuals.

    For every algorithm reported by ``h1`` except 'none', the per-language
    number of distinct characters (from ``bibles``) is correlated with the
    per-language residual 'oc' (from ``df``).

    Args:
        df: complexity frame accepted by ``h1``.
        bibles: frame with 'language' and 'text' columns.

    NOTE(review): both vectors are aligned only by sorting on their own
    'language' column; this assumes the two frames cover the same languages
    and that their spellings sort in the same order — confirm.
    """
    nc = compute_num_chars(bibles).sort_values('language').chars.to_numpy()
    for algo, (dfs, _, _) in h1(df).items():
        # 'none' is the uncompressed baseline; h1 stores None instead of a
        # frame when it skipped the regression, so there is nothing to correlate.
        if algo == 'none' or dfs is None:
            continue
        y = dfs.drop_duplicates('language').sort_values('language').oc.to_numpy()
        r = sp.stats.pearsonr(nc, y)
        print("%6s statistic: %.5f pvalue: %.5f" % (algo, r.statistic, r.pvalue))

# H1 Overall complexity of a text

The overall complexity of a text in its
original language is lower than in other
languages, because the translation process
introduces cultural clarifications.
The language complexity of a translated text should therefore be greater than that of its counterpart in the source language.

In [None]:
def h1(df):
    """Regress compressed size on raw size for every algorithm in ``df``.

    Only rows whose metric is 'size' are used. The rows of algorithm 'none'
    provide x (raw size); the rows of each algorithm provide y. A straight
    line is fitted with ``scipy.stats.linregress`` and the residuals are
    stored as 'oc'.

    Args:
        df: frame with 'language', 'metric', 'algorithm' and 'value' columns.

    Returns:
        Dict mapping each algorithm to ``(frame, x, fitted)`` where ``frame``
        has the columns 'language', 'oc', 'rw' (x) and 'co' (y). When x has
        fewer than two distinct values the regression is skipped, a message
        is printed and the entry is ``(None, None, None)``.
    """
    is_size = df.metric == 'size'

    # The raw-size side does not depend on the algorithm, so build it once.
    # A stable sort keeps rows of the same language in their original order,
    # which makes the x/y pairing below reproducible.
    raw = df[is_size & (df.algorithm == 'none')].sort_values('language', kind='stable')
    x = raw.value.to_numpy()
    languages = list(raw.language)
    # A regression needs more than one distinct x value.
    has_spread = np.unique(x).size > 1

    out = {}
    for algorithm in df.algorithm.unique():
        if not has_spread:
            print(f"Skipping linear regression for {algorithm} because all x values are identical.")
            out[algorithm] = (None, None, None)  # placeholder for the skipped fit
            continue

        compressed = df[is_size & (df.algorithm == algorithm)].sort_values(
            'language', kind='stable')
        y = compressed.value.to_numpy()

        lr = sp.stats.linregress(x, y)
        fitted = lr.slope * x + lr.intercept
        out[algorithm] = (
            pd.DataFrame(dict(language=languages, oc=y - fitted, rw=x, co=y)),
            x,
            fitted,
        )

    return out

def plth1(df, title=None):
    """Plot, per algorithm, the fitted line and the (raw, compressed) size points.

    One subplot is drawn for every entry returned by ``h1(df)``. Entries for
    which ``h1`` skipped the regression get an empty, labelled subplot.

    Args:
        df: complexity frame accepted by ``h1``.
        title: optional figure title.
    """
    out = h1(df)
    # squeeze=False always returns a 2-D array of axes; without it a single
    # algorithm yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(1, len(out), figsize=(5 * len(out), 5), squeeze=False)

    for ax, (algo, (dfr, x, fitted)) in zip(axes[0], out.items()):
        if x is not None and fitted is not None:  # the regression was computed
            ax.plot(x, fitted)
            ax.scatter(dfr['rw'], dfr['co'])
            ax.set_title(fr'$\it{{{algo}}}$')
        else:
            ax.set_title(f"{algo} - No data")
        ax.set_xlabel('size in bytes')
        ax.set_ylabel('size in bytes compressed')

    if title:
        fig.suptitle(title)
    fig.tight_layout()


## D90 Subset

In [None]:
# Keep the D90 regression output for the residual histogram drawn later,
# then plot the per-algorithm fits.
out90 = h1(df90)
plth1(df90, title=r"$\mathcal{H}_1$ - D90 subset")

In [None]:
# Correlate the number of distinct characters per language with the H1 residuals (D90).
num_chars_across_bible(df90, bibles_d90)

In [None]:
# Same correlation with one language left out of both frames.
# The bibles frame is filtered on the upper-cased name — presumably it stores
# language names in upper case; verify against the dataset.
l = 'Sateré-Mawé'
num_chars_across_bible(df90[df90.language != l], bibles_d90[bibles_d90.language != l.upper()])

## DALL Subset

In [None]:
# Keep the DALL regression output for the residual histogram drawn later,
# then plot the per-algorithm fits.
outall = h1(dfall)
plth1(dfall, title=r"$\mathcal{H}_1$ - DALL subset")

In [None]:
# Correlate the number of distinct characters per language with the H1 residuals (DALL).
num_chars_across_bible(dfall, bibles_dall)

In [None]:
# Same correlation with one language left out of both frames.
# The bibles frame is filtered on the upper-cased name — presumably it stores
# language names in upper case; verify against the dataset.
l = 'Sateré-Mawé'
num_chars_across_bible(dfall[dfall.language != l], bibles_dall[bibles_dall.language != l.upper()])

# H2 Content invariance
For any complexity measure, Bible complexity should be the same (independently of translation).
In this case, we should observe a smaller variance in the size in bytes of the compressed texts than in
that of the uncompressed texts.

In [None]:
def h2(rdf):
    """Print the magnitude of the size variance for compressed and raw texts.

    The per-language means in ``rdf`` are aggregated again per
    (metric, algorithm); for 'gzip' and 'bz2' the variance of the 'size'
    metric is compared with the variance for algorithm 'none'.

    Args:
        rdf: aggregated frame with a ('value', 'mean') column plus 'metric'
            and 'algorithm' columns.

    NOTE(review): the printed number is ceil(log10(variance)), i.e. an order
    of magnitude, although the message calls it "decimal places" — confirm
    the intended wording.
    """
    H2 = rdf.groupby(by=['metric', 'algorithm'],
                     as_index=False).agg({('value', 'mean') : ['mean', 'var']})

    variance_column = ('value', 'mean', 'var')

    def log_size_variance(algorithm):
        # log10 of the single variance value for the 'size' metric of `algorithm`
        picked = H2[(H2.metric == 'size') & (H2.algorithm == algorithm)]
        return np.log10(picked[variance_column].item())

    message = '%-4s variance has %d decimal places\n\tOriginal size variance has %d decimal places'
    for algorithm in ['gzip', 'bz2']:
        compressed = log_size_variance(algorithm)
        original = log_size_variance('none')
        print(message % (algorithm, int(np.ceil(compressed)), int(np.ceil(original))))

def ploth2(outdf, text):
    """Draw the gzip and bz2 histograms of the 'oc' residuals on one axis.

    Args:
        outdf: dict as returned by ``h1``; the first element of the 'gzip'
            and 'bz2' entries supplies the 'oc' column.
        text: subset name inserted into the figure title.
    """
    fig, ax = plt.subplots()
    fig.suptitle(f"Overall Complexity Distribution through the Evaluated Languages for {text} Subset")
    for algorithm in ('gzip', 'bz2'):
        residual_frame = outdf[algorithm][0]
        ax.hist(x='oc', bins=10, data=residual_frame, label=algorithm)
    ax.set_xlabel('Residuals')
    ax.set_ylabel('Count')
    ax.legend()

## D90 Subset

In [None]:
# Compare the size-variance magnitude of compressed vs. raw texts (D90).
h2(r90)

In [None]:
# Histogram of the H1 residuals stored in out90.
ploth2(out90, "D90")

## DALL Subset

In [None]:
# Compare the size-variance magnitude of compressed vs. raw texts (DALL).
h2(rall)

In [None]:
# Histogram of the H1 residuals stored in outall.
ploth2(outall, "DALL")

# H3 Morphology and Syntactic trade-off
Languages that have a higher morphological complexity show a smaller syntactic complexity and vice versa.

In [None]:
def h3(df):
    """Print the Pearson correlation between morphological and syntactic deletion.

    For 'gzip' and 'bz2' in turn, the ('value', 'mean') entries of the
    'morphological deletion' rows are correlated with those of the
    'syntactic deletion' rows.

    Args:
        df: aggregated frame with 'metric', 'algorithm' and ('value', 'mean')
            columns. The two selections are paired by position, so both
            metrics must list the same languages in the same row order.
    """
    mean_column = ('value', 'mean')
    for algorithm in ('gzip', 'bz2'):
        print('Algorithm %6s' % algorithm)
        uses_algorithm = df.algorithm == algorithm
        morphological = df[(df.metric == 'morphological deletion') & uses_algorithm][mean_column]
        syntactic = df[(df.metric == 'syntactic deletion') & uses_algorithm][mean_column]
        cr = sp.stats.pearsonr(morphological, syntactic)
        print('\tPearson Correlation: statistic: %.4f p-value: %.4f' % (cr.statistic, cr.pvalue))

## D90 Subset

In [None]:
# H3 on every language of the aggregated D90 frame.
print('All languages')
h3(r90)

# NOTE(review): "Germany" is a country name; if the language column spells it
# "German", isin() silently drops that language — confirm against family.csv.
indo_european_languages = ("Ancient Greek", "English", "French", "Germany", "Portuguese", "Spanish")

# Keep only the rows whose language is in the tuple above.
r90_iel = r90[r90.language.isin(indo_european_languages)]

print('\nOnly Indo-European Languages')
h3(r90_iel)

## DALL Subset

In [None]:
# H3 on every language of the aggregated DALL frame.
print('All languages')
h3(rall)

# NOTE(review): "Germany" is a country name; if the language column spells it
# "German", isin() silently drops that language — confirm against family.csv.
indo_european_languages = ("Ancient Greek", "English", "French", "Germany", "Portuguese", "Spanish")

# Keep only the rows whose language is in the tuple above.
rall_iel = rall[rall.language.isin(indo_european_languages)]

print('\nOnly Indo-European Languages')
# Use the DALL selection built just above (the D90 frame was passed here before).
h3(rall_iel)

# O1 Morphological Complexity, tokens and types
a) There exists a **positive** correlation between morphological complexity and **the number of types** in a sample.

b) There exists a **negative** correlation between morphological complexity and **the number of tokens** in a sample.

In [None]:
def o1(df, bibles):
    """Print correlations of morphological substitution with type and token counts.

    For gzip and bz2, the per-language mean of the 'morphological
    substitution' metric is correlated with (a) the number of types and
    (b) the number of tokens computed from ``bibles``.

    Args:
        df: complexity frame with 'language', 'metric', 'algorithm', 'value'.
        bibles: frame passed to ``compute_numtypes_numtokens``.

    NOTE(review): counts are ordered by the bibles' language names and the
    means by ``df``'s language names; the pairing assumes both orders list
    the same languages in the same sequence — confirm.
    """
    counts = pd.DataFrame(compute_numtypes_numtokens(bibles)).sort_values('language')
    types = counts.types.to_numpy()
    tokens = counts.tokens.to_numpy()

    is_substitution = df.metric == 'morphological substitution'
    # Per-language mean complexity, computed for both algorithms before printing.
    mean_by_algorithm = {
        algorithm: df[is_substitution & (df.algorithm == algorithm)]
        .groupby(by='language').agg({'value' : 'mean'}).value.to_numpy()
        for algorithm in ('gzip', 'bz2')
    }

    line = "%s) corr=%0.4f, p-value=%g"
    for heading, algorithm in (("Gzip results:", 'gzip'), ("\nBz2 results:", 'bz2')):
        print(heading)
        complexity = mean_by_algorithm[algorithm]
        a = sp.stats.pearsonr(types, complexity)
        b = sp.stats.pearsonr(tokens, complexity)
        print(line % ("a", a.statistic, a.pvalue))
        print(line % ("b", b.statistic, b.pvalue))

## D90 Subset

In [None]:
# Type/token correlations for the D90 subset.
o1(df90, bibles_d90)

## DALL Subset

In [None]:
# Type/token correlations for the DALL subset.
o1(dfall, bibles_dall)

# O2 All languages are equal in a pragmatic sense
The variance of the pragmatic complexity should be the smallest.

In [None]:
def o2(rdf):
    """Return the variance of 'value' per (metric, algorithm), smallest first.

    Rows of the 'size' metric and rows of algorithm 'none' are left out.

    Args:
        rdf: frame with 'metric', 'algorithm' and 'value' columns.

    Returns:
        DataFrame indexed by (metric, algorithm) with one 'value' column
        holding the variance, sorted in ascending order.
    """
    keep = rdf.metric.ne('size') & rdf.algorithm.ne('none')
    variances = rdf.loc[keep].groupby(by=['metric', 'algorithm']).agg({'value' : 'var'})
    return variances.sort_values('value')

## D90 Subset

In [None]:
# Variance of every non-size metric per compression algorithm, smallest first (D90).
o2(df90)

## DALL Subset

In [None]:
# Variance of every non-size metric per compression algorithm, smallest first (DALL).
o2(dfall)

# O3 Morphological complexity metric agrees with Nichols's

In [None]:
# Reference complexity table that o3 compares our metrics against
# (its 'language' and 'value' columns are the ones read there).
nichols_complexity = pd.read_csv('../dataset/complexity_ldst.csv', index_col=None)

In [None]:
def o3(df, nc, languages, metric_str, algorithm_str):
    """Print the Pearson correlation between one of our metrics and a reference.

    Language names are compared in a normalised spelling (first letter upper
    case, the rest lower case) on both sides, so multi-word or hyphenated
    names such as "Ancient Greek" match their upper-case counterpart instead
    of being dropped from one vector only. Both vectors are ordered by that
    same normalised name, which keeps them aligned.

    Args:
        df: aggregated frame with 'language', 'metric', 'algorithm' and
            ('value', 'mean') columns.
        nc: reference frame with 'language' and 'value' columns.
        languages: iterable of language names to consider.
        metric_str: metric to select from ``df``.
        algorithm_str: algorithm to select from ``df``.

    Raises:
        ValueError: propagated from ``scipy.stats.pearsonr`` when the two
            selections do not have the same, sufficient length.
    """
    def normalize(name):
        return name.lower().capitalize()

    df_names = df.language.map(normalize)
    nc_names = nc.language.map(normalize)
    langs = set(map(normalize, languages)) & set(df_names)

    rows = df_names.isin(langs) & (df.metric == metric_str) & (df.algorithm == algorithm_str)
    # Order by the normalised name; a stable sort keeps ties in frame order.
    a_order = np.argsort(df_names[rows].to_numpy(), kind='stable')
    a = df[rows][('value', 'mean')].to_numpy()[a_order]

    nc_rows = nc_names.isin(langs)
    b_order = np.argsort(nc_names[nc_rows].to_numpy(), kind='stable')
    b = nc[nc_rows]['value'].to_numpy()[b_order]

    r = sp.stats.pearsonr(a, b)
    print('%r %s %s -> pearsonr=%0.4f p-value=%g' % (langs, metric_str, algorithm_str, r.statistic, r.pvalue))

## D90 Subset

In [None]:
# Compare both morphological metrics with the reference values, for each
# compression algorithm, on the D90 subset.
for metric_name in ('morphological deletion', 'morphological substitution'):
    for algorithm_name in ('gzip', 'bz2'):
        o3(r90, nichols_complexity, nichols_complexity.language, metric_name, algorithm_name)

## DALL Subset

In [None]:
# Compare both morphological metrics with the reference values on the DALL
# subset. This cell sits under the DALL heading, so it reads the aggregated
# DALL frame `rall` (it previously repeated the D90 frame).
o3(rall, nichols_complexity, nichols_complexity.language, 'morphological deletion', 'gzip')
o3(rall, nichols_complexity, nichols_complexity.language, 'morphological deletion', 'bz2')

o3(rall, nichols_complexity, nichols_complexity.language, 'morphological substitution', 'gzip')
o3(rall, nichols_complexity, nichols_complexity.language, 'morphological substitution', 'bz2')

# O4 Compression algorithm independence
The results are equivalent whether using **Gzip** or **Bz2**.

In [None]:
def o4(df, metric, algo):
    """Return the rows of one metric and algorithm, ordered by mean value.

    Args:
        df: aggregated frame with 'metric', 'algorithm' and ('value', 'mean')
            columns.
        metric: metric name to keep.
        algo: algorithm name to keep.

    Returns:
        The matching rows sorted by ('value', 'mean') in ascending order.
    """
    selected = df[(df.metric == metric) & (df.algorithm == algo)]
    return selected.sort_values(('value', 'mean'))

## D90 Subset

### Morphological Substitution

In [None]:
# D90 languages ordered by mean morphological substitution under gzip, ascending.
o4(r90, 'morphological substitution', 'gzip')

In [None]:
# D90 languages ordered by mean morphological substitution under bz2, ascending.
o4(r90, 'morphological substitution', 'bz2')

### Morphological Deletion

In [None]:
# D90 languages ordered by mean morphological deletion under gzip, ascending.
o4(r90, 'morphological deletion', 'gzip')

In [None]:
# D90 languages ordered by mean morphological deletion under bz2, ascending.
o4(r90, 'morphological deletion', 'bz2')

## DALL Subset

### Morphological Substitution

In [None]:
# DALL languages ordered by mean morphological substitution under gzip, ascending.
o4(rall, 'morphological substitution', 'gzip')

In [None]:
# DALL languages ordered by mean morphological substitution under bz2, ascending.
# Reads `rall`: this cell belongs to the DALL section (it previously used the D90 frame).
o4(rall, 'morphological substitution', 'bz2')

### Morphological Deletion

In [None]:
# DALL languages ordered by mean morphological deletion under gzip, ascending.
# Reads `rall`: this cell belongs to the DALL section (it previously used the D90 frame).
o4(rall, 'morphological deletion', 'gzip')

In [None]:
# DALL languages ordered by mean morphological deletion under bz2, ascending.
# Reads `rall`: this cell belongs to the DALL section (it previously used the D90 frame).
o4(rall, 'morphological deletion', 'bz2')