<a href="https://colab.research.google.com/github/greek-nlp/benchmark/blob/main/nlp_gr_access_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialisation


In [None]:
%%capture
!git clone https://github.com/greek-nlp/gen-a.git
!pip install zenodo-get
!pip install datasets
!pip install conll-df

In [None]:
import pandas as pd
import importlib

# The cloned directory is named "gen-a"; a hyphen cannot appear in a regular
# `import` statement, so the module is loaded from its dotted string path.
gena = importlib.import_module("gen-a.data_wrapper")

# Table shipped with the repository; it is handed to every wrapper below
# through the `datasets=` argument.
gr_data = pd.read_csv('gen-a/data.csv')

# GEC

In [None]:
# Build the Korre wrapper on the dataset table, then request its 'train' split.
korre = gena.KorreDt(datasets=gr_data)
korre_train = korre.get('train')

# Toxicity

In [None]:
# Build the Zampieri wrapper on the dataset table, then request its 'test' split.
zampieri = gena.ZampieriDt(datasets=gr_data)
zampieri_test = zampieri.get('test')

# MT

In [None]:
prokopidis_mt = gena.ProkopidisMtDt(datasets=gr_data)

# Remove pandas' column-width cap so the previewed rows are shown untruncated
# (this option applies to the whole session, not only to this cell).
pd.set_option('display.max_colwidth', None)

# For each target language: print its code and name, then show one randomly
# drawn row of its 'train' split.
for lang in prokopidis_mt.target_langs:
    print(f"Language: {lang} ({prokopidis_mt.langs_dict[lang]})")
    train_split = prokopidis_mt.get(lang, 'train')
    display(train_split.sample())

# Intent

In [None]:
# Fetch the 'test' split exposed by the Rizou wrapper.
rizou_test = (
    gena.RizouDt(datasets=gr_data)
    .get('test')
)
# Final expression of the cell: the notebook renders one randomly drawn row.
rizou_test.sample()

# Summarisation

In [None]:
# Build the Koniaris wrapper on the dataset table, then request its 'test' split.
koniaris = gena.KoniarisDt(datasets=gr_data)
koniaris_test = koniaris.get('test')

# Clustering

In [None]:
%%capture
papaloukas_test = gena.PapaloukasDt(datasets=gr_data).get('test')

In [None]:
# Preview one random row of the split loaded in the previous cell. That cell
# binds `papaloukas_test`; the bare name `papaloukas` was never defined and
# raised NameError.
papaloukas_test.sample()

# Structure prediction (POS, NER)

## NER

In [None]:
# Fetch the 'test' split exposed by the Barziokas wrapper.
barziokas_test = (
    gena.BarziokasDt(datasets=gr_data)
    .get('test')
)
# Final expression of the cell: the notebook renders one randomly drawn row.
barziokas_test.sample()

## POS

In [None]:
# Fetch the 'test' split exposed by the ProkopidisUd wrapper.
prokopidis_ud_test = (
    gena.ProkopidisUdDt(datasets=gr_data)
    .get('test')
)
# Final expression of the cell: the notebook renders the first rows.
prokopidis_ud_test.head()

# Authorship analysis

In [None]:
# Fetch the 'train' split exposed by the Barzokas wrapper.
barzokas_train = (
    gena.BarzokasDt(datasets=gr_data)
    .get('train')
)
# Final expression of the cell: the notebook renders one randomly drawn row.
barzokas_train.sample()

# Language modeling

* Analysing raw data

In [None]:
#@title download the data
# Corpora used for language modelling, keyed by a short dataset name; each
# value is the 'train' split returned by the corresponding wrapper.
raw_data = {
    'prokopidis': gena.ProkopidisCrawledDt(datasets=gr_data).get('train'),
    'dritsa': gena.DritsaDt(datasets=gr_data).get('train'),
    'papantoniou': gena.PapantoniouDt(datasets=gr_data).get('train'),
}

* Train a character-level language model per dataset.
* Compute the bits per character (BPC) per dataset.
* Draw a BPC heatmap, showing in red which dataset is linguistically surprised by which.

In [None]:
# The import below needs the `lm` package on the import path; in a fresh
# runtime, run the (commented-out) clone once before executing this cell.
#!git clone https://github.com/ipavlopoulos/lm.git
from lm.markov.models import LM

# Per-dataset text samples, keyed by the same names as `raw_data`.
train_sets = {}
test_sets = {}
for dataset_name in raw_data:
  print(dataset_name)
  dataset = raw_data[dataset_name]
  # Keep only rows that have a text value, then shuffle the rows and renumber
  # them from 0. The shuffle is unseeded, so the split differs between runs.
  dataset = dataset[dataset.text.notna()]
  dataset = dataset.sample(frac=1).reset_index(drop=True)
  # Only the first 1500 shuffled rows are used, so cut each text to its first
  # 100 characters on that slice once, instead of on the whole column twice.
  truncated = dataset.text.iloc[:1500].apply(lambda x: x[:100])
  # Rows 0-999 go to training, rows 1000-1499 to testing.
  train_sets[dataset_name] = truncated.iloc[:1000] # lower lim
  test_sets[dataset_name] = truncated.iloc[1000:1500]