<a href="https://colab.research.google.com/github/greek-nlp/benchmark/blob/main/nlp_gr_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialisation


### Load data and config

In [None]:
%%capture
!pip install boto3
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

In [None]:
from tqdm.notebook import tqdm
from google.colab import files
import os
import json
import random
import importlib
import pandas as pd

In [None]:
print ('Upload the `aws.json` file: ')
files.upload()
# Read the uploaded credentials through a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('aws.json') as credentials_file:
    credentials = json.load(credentials_file)

### Set up the model

In [None]:
# Initialize the boto3 client for Bedrock, using the access key, secret key
# and region read from the uploaded `aws.json`.
bedrock_client = boto3.client(
    'bedrock',
    aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'],
    region_name=credentials['aws_region']
)
# Last expression of the cell: shows the 'modelSummaries' entry of the response.
bedrock_client.list_foundation_models()['modelSummaries']

In [None]:
# Use the native inference API to send a text message to Meta Llama 3.
# Create a Bedrock Runtime client with the same credentials and region as above.
client = boto3.client("bedrock-runtime",
                      aws_access_key_id=credentials['aws_access_key_id'],
                      aws_secret_access_key=credentials['aws_secret_access_key'],
                      region_name=credentials['aws_region'])

# Model identifier; `client` and `model_id` become the default arguments of `llama_prompt`.
model_id = "meta.llama3-70b-instruct-v1:0"

In [None]:
def llama_prompt(text,
                 instruction="Correct any grammatical errors in the following text but do not change the text otherwise, and return just the corrected text.",
                 shots="",
                 max_len=512,
                 model_id=model_id,
                 client=client):
  """
  Send one prompt to the model and return the generated text.
  :param text: The input placed after "user: " in the prompt.
  :param instruction: The task description placed at the top of the prompt.
  :param shots: Optional few-shot examples inserted between instruction and input.
  :param max_len: Value sent as `max_gen_len` in the request.
  :param model_id: The model identifier passed to `invoke_model`.
  :param client: The client object whose `invoke_model` method is called.
  :return: The "generation" field of the decoded response.
  :raises RuntimeError: If the model cannot be invoked (original error is chained).
  """
  # Embed the prompt in Llama 3's instruction format.
  formatted_prompt = f"""
  <|begin_of_text|><|start_header_id|>user<|end_header_id|>
  {instruction}
  {shots}
  user: {text}
  <|eot_id|>
  <|start_header_id|>assistant<|end_header_id|>
  """

  # Format the request payload using the model's native structure.
  native_request = {
      "prompt": formatted_prompt,
      "max_gen_len": max_len,
      "temperature": 0.5,
  }

  # Convert the native request to JSON.
  request = json.dumps(native_request)

  try:
      # Invoke the model with the request.
      response = client.invoke_model(modelId=model_id, body=request)
  except Exception as e:
      # Raise instead of calling exit(): a helper must not try to terminate the
      # kernel, and callers can then decide whether to skip the item or stop.
      raise RuntimeError(f"ERROR: Can't invoke '{model_id}'. Reason: {e}") from e

  # Decode the response body.
  model_response = json.loads(response["body"].read())

  # Extract and return the response text.
  return model_response["generation"]

### Access the benchmark

In [None]:
%%capture
!git clone https://github.com/greek-nlp/gen-a.git
!pip install zenodo-get
!pip install datasets
!pip install conll-df

import pandas as pd
import importlib

# Dataset catalogue shipped with the cloned `gen-a` repository.
gr_data = pd.read_csv('gen-a/data.csv')
# The directory name contains a hyphen, so it cannot appear in an `import`
# statement; load the module by its dotted string name instead.
gena = importlib.import_module("gen-a.data_wrapper")

# GEC

In [None]:
# Build the Korre wrapper from the catalogue and fetch its 'train' split.
korre = gena.KorreDt(datasets=gr_data)
train = korre.get('train')

In [None]:
# Zero-shot run: one `llama_prompt` call per original text, with the
# function's default (grammar-correction) instruction.
texts = train.original_text.values
train['llama'] = [llama_prompt(t) for t in tqdm(texts)]

In [None]:
!pip install pywer
import pywer

# Word and character error rate between `original_text` and the stripped model outputs.
# NOTE(review): `original_text` (the model input) is passed as the first argument,
# not `corrected_text` (which this frame also has) — confirm this is the intended reference.
wer = pywer.wer(train.original_text.values, train.llama.str.strip().values)
cer = pywer.cer(train.original_text.values, train.llama.str.strip().values)
print(f"WER: {wer:.2f}, CER: {cer:.2f}")

In [None]:
# Save the zero-shot outputs first, then end the cell with the sampled row:
# only the last expression of a cell is displayed, so sampling before the
# `to_csv` call showed nothing.
train.to_csv('korre_llama3-70b-i_0s.csv')
train.sample().values[0]

In [None]:
shots = """ Here are a couple of examples:
1.
user: Δεν ήθελε να θεωρηθεί προκατειλημένος και για αυτό δε συνέχισε τη συνεργασία περεταίρω.
assistant: Δεν ήθελε να θεωρηθεί προκατειλημμένος και για αυτό δε συνέχισε τη συνεργασία περαιτέρω.
2.
user: Το περιθώριο των κερδών τους δεν αλλάζουν εύκολα.
assistant: Το περιθώριο των κερδών τους δεν αλλάζει εύκολα.
"""
# Pass the examples by keyword: the second positional parameter of
# `llama_prompt` is `instruction`, so `llama_prompt(t, shots)` replaced the
# correction instruction with the examples and left `shots` empty.
train['llama2s'] = [llama_prompt(t, shots=shots) for t in tqdm(texts)]

In [None]:
# Same WER/CER computation for the two-shot outputs (`llama2s`).
# NOTE(review): the first argument is again `original_text`, not `corrected_text` — confirm.
wer = pywer.wer(train.original_text.values, train.llama2s.str.strip().values)
cer = pywer.cer(train.original_text.values, train.llama2s.str.strip().values)
print(f"WER: {wer:.2f}, CER: {cer:.2f}")

In [None]:
# Write the whole frame, including any model-output columns added so far, to CSV.
train.to_csv('korre_llama3-70b-i.csv')

In [None]:
# Shape of the subset whose original text is already equal to the corrected text.
train[train.original_text == train.corrected_text].shape

# Toxicity

In [None]:
# Build the Zampieri wrapper from the catalogue and fetch its 'test' split.
zampieri = gena.ZampieriDt(datasets = gr_data)
zampieri_test = zampieri.get('test')

In [None]:
# Zero-shot offensive-language prompt; `max_len=10` keeps the generation short
# because only a single 0/1 label is requested.
instruct = "You are a Greek moderator and you are given a text, return 1 if the text is offensive and 0 if not. Only the values 1 or 0 should be returned."
zampieri_test['llama'] = [llama_prompt(text=t, instruction=instruct, max_len=10) for t in tqdm(zampieri_test.text.values)]

In [None]:
# The five most frequent raw model outputs (value_counts sorts by frequency).
zampieri_test.llama.value_counts()[:5]

In [None]:
from sklearn.metrics import classification_report
# An output counts as label 1 when it contains the character '1' anywhere, otherwise 0.
# NOTE(review): the two-shot evaluation further down requires the stripped output to
# equal '1' exactly, so the two runs are parsed with different rules — confirm.
print(classification_report(zampieri_test.subtask_a, zampieri_test.llama.apply(lambda x: 1 if '1' in x else 0).values))

In [None]:
# Save the test frame together with the model outputs.
zampieri_test.to_csv('toxicity.csv')

In [None]:
# Two labelled examples (one offensive, one not) inserted into the prompt via `shots`.
shots = """ Here are a couple of labelled examples:
user: Πρέπει να πεθάνεις.
assistant: 1
user: Αυτή είναι μία ωραία εκπομπή.
assistant: 0
"""
zampieri_test['llama2s'] = [llama_prompt(text=t, instruction=instruct, shots=shots, max_len=10) for t in tqdm(zampieri_test.text.values)]

In [None]:
from sklearn.metrics import classification_report
# An output counts as label 1 only when, after stripping whitespace, it is exactly '1'.
# NOTE(review): the zero-shot evaluation above uses the looser test `'1' in x`;
# the two runs are therefore not parsed with the same rule — confirm which is intended.
print(classification_report(zampieri_test.subtask_a, zampieri_test.llama2s.apply(lambda x: 1 if x.strip()=='1' else 0).values))

# MT

In [None]:
prokopidis_mt = gena.ProkopidisMtDt(datasets=gr_data)
# Show full cell contents instead of truncated text in the displayed frames.
pd.set_option('display.max_colwidth', None)
# For every target language, print its code and name and show one random training row.
for lang in prokopidis_mt.target_langs:
  print(f"Language: {lang} ({prokopidis_mt.langs_dict[lang]})")
  display(prokopidis_mt.get(lang, 'train').sample())

In [None]:
# Test splits for the three target languages; `.copy()` so the columns added
# below are written to separate frames.
eng = prokopidis_mt.get('eng', 'test').copy()
jpn = prokopidis_mt.get('jpn', 'test').copy()
fas = prokopidis_mt.get('fas', 'test').copy()

#eng = pd.read_csv('eng_llama3_70b_i.csv')
#jpn = pd.read_csv('jpn_llama3_70b_i.csv')
#fas = pd.read_csv('fas_llama3_70b_i.csv')
#from ast import literal_eval
#eng.target = eng.target.apply(literal_eval)
#jpn.target = jpn.target.apply(literal_eval)
#fas.target = fas.target.apply(literal_eval)
#eng.llama.fillna('', inplace=True)
#jpn.llama.fillna('', inplace=True)
#fas.llama.fillna('', inplace=True)

In [None]:
def instruct(source='ell', target='eng'):
  """Return the translation instruction for the given source/target language codes."""
  return f"Given a text in {source}, translate it to {target}. Only the translation should be returned."

# One model call per source sentence, for each of the three target languages.
eng['llama'] = [llama_prompt(text=src, instruction=instruct()) for src in tqdm(eng.source.values)]
jpn['llama'] = [llama_prompt(text=src, instruction=instruct(target='jpn')) for src in tqdm(jpn.source.values)]
fas['llama'] = [llama_prompt(text=src, instruction=instruct(target='fas')) for src in tqdm(fas.source.values)]


In [None]:
# Save the translations; the commented-out lines in the loading cell read these files back.
eng.to_csv('eng_llama3_70b_i.csv')
jpn.to_csv('jpn_llama3_70b_i.csv')
fas.to_csv('fas_llama3_70b_i.csv')

In [None]:
!pip install pywer
import pywer

def ter(hyp, refs, unit='word'):
  """
  Compute the translation error rate between a hypothesis and a reference.
  If a list of references is provided, the minimum score is returned.
  :param hyp: The hypothesis.
  :param refs: The reference or list of references.
  :param unit: 'word' scores with `pywer.wer`; any other value scores with `pywer.cer`.
  :return: The TER score (the lowest one over all references).
  """
  if isinstance(refs, str):
    refs = [refs]
  metric = pywer.wer if unit=='word' else pywer.cer
  # Score the hypothesis against each reference separately. The previous code
  # passed the whole `refs` list on every iteration, so the loop variable was
  # never used and the "minimum over references" was not computed.
  scores = [metric([ref], [hyp]) for ref in refs]
  return min(scores)

# Compute TER (word): per-row score, then mean and standard deviation per language.
for label, frame in (("Eng", eng), ("Jpn", jpn), ("Fas", fas)):
  score = frame.apply(lambda row: ter(row.llama, row.target), axis=1)
  print(f"{label}: {score.mean():.2f} ({score.std():.2f})")

In [None]:
# Compute TER (char): same report as the word-level one, with unit='char'.
for label, frame in (("Eng", eng), ("Jpn", jpn), ("Fas", fas)):
  score = frame.apply(lambda row: ter(row.llama, row.target, 'char'), axis=1)
  print(f"{label}: {score.mean():.2f} ({score.std():.2f})")

In [None]:
# BERTscore
#!pip install evaluate bert_score
from evaluate import load
bertscore = load("bertscore")
# One `compute` call per (prediction, target) pair, using `t[0]` as the single reference.
# NOTE(review): assumes each `target` is a list of references (cf. the commented-out
# `literal_eval` lines in the loading cell) — confirm.
results_en = [bertscore.compute(predictions=[p], references=[t[0]], lang="en") for p,t in tqdm(zip(eng.llama.values, eng.target.values))]

In [None]:
# Each result holds one-element lists (one prediction per call); unpack them into columns.
eng['bertscore_f1'] = [r['f1'][0] for r in results_en]
eng['bertscore_precision'] = [r['precision'][0] for r in results_en]
eng['bertscore_recall'] = [r['recall'][0] for r in results_en]
# Last expression of the cell: mean F1 over the English test set.
eng.bertscore_f1.mean()

In [None]:
# Same per-pair BERTScore computation for Japanese, again with `t[0]` as the reference.
results_ja = [bertscore.compute(predictions=[p], references=[t[0]], lang="ja") for p, t in tqdm(zip(jpn.llama.values, jpn.target.values))]

In [None]:
# Unpack the one-element score lists into columns, then show the mean F1.
jpn['bertscore_f1'] = [r['f1'][0] for r in results_ja]
jpn['bertscore_precision'] = [r['precision'][0] for r in results_ja]
jpn['bertscore_recall'] = [r['recall'][0] for r in results_ja]
jpn.bertscore_f1.mean()

In [None]:
# Same per-pair BERTScore computation for Persian, again with `t[0]` as the reference.
results_fa = [bertscore.compute(predictions=[p], references=[t[0]], lang="fa") for p, t in tqdm(zip(fas.llama.values, fas.target.values))]

In [None]:
# Unpack the one-element score lists into columns, then show the mean F1.
fas['bertscore_f1'] = [r['f1'][0] for r in results_fa]
fas['bertscore_precision'] = [r['precision'][0] for r in results_fa]
fas['bertscore_recall'] = [r['recall'][0] for r in results_fa]
fas.bertscore_f1.mean()

In [None]:
# Save the frames again, now with the BERTScore columns, under separate file names.
eng.to_csv('eng_llama3_70b_i_bertscore.csv')
jpn.to_csv('jpn_llama3_70b_i_bertscore.csv')
fas.to_csv('fas_llama3_70b_i_bertscore.csv')

# Intent

In [None]:
# 'test' split of the Rizou dataset; the cell ends by showing one random row.
rizou = gena.RizouDt(datasets=gr_data).get('test')
rizou.sample()

In [None]:
# Distinct intent labels found in the test split; they are listed in the prompt below.
classes = list(rizou.intent.unique())
# Horizontal bar chart of label frequencies (trailing `;` hides the plot object's repr).
rizou.intent.value_counts().plot.barh(figsize=(8,2));

In [None]:
# Summary statistics of text length in characters.
rizou.text.apply(len).describe()

In [None]:
# The instruction lists every candidate intent, joined with ', '.
instruct = "Given a text, provide the intent of the text. Only the intent should be returned. Here is the list of possible intents: " + ', '.join(classes)
rizou['llama'] = [llama_prompt(text=t, instruction=instruct) for t in tqdm(rizou.text.values)]

In [None]:
# Save the test frame with the raw model outputs.
rizou.to_csv('rizou.csv')

In [None]:
# fixing the values: keep the stripped output when it is exactly one of `classes`,
# otherwise substitute a randomly chosen class.
# NOTE(review): `random.choice` is not seeded anywhere in the visible notebook, so the
# substituted labels (and the report below) can differ between runs — confirm this is acceptable.
rizou['llama_fixed'] = rizou.llama.apply(lambda x: x.strip() if str(x).strip() in classes else random.choice(classes))
# Shape of the subset whose raw output was not a known class (first element = number of such rows).
rizou[rizou.llama.apply(lambda x: str(x).strip() not in classes)].shape

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 of the cleaned predictions against the gold intents.
print(classification_report(rizou.intent, rizou.llama_fixed))

# Summarisation

In [None]:
# Build the Koniaris wrapper from the catalogue.
koniaris = gena.KoniarisDt(datasets = gr_data)

In [None]:
# 'test' split used for summarisation; the cell ends by showing one random row.
summ = koniaris.get('test')
summ.sample()

In [None]:
# Summary statistics of text length in characters.
summ.text.apply(len).describe()

In [None]:
# Summary statistics of text length in whitespace-separated tokens.
summ.text.str.split().apply(len).describe()

In [None]:
# Histogram of the whitespace-token lengths.
summ.text.str.split().apply(len).hist(bins=100, figsize=(8,2));

In [None]:
# Keep only documents with fewer than 1000 whitespace tokens. `.copy()` makes
# `summ_short` an independent frame, so the columns assigned to it in later
# cells are not chained assignments on a slice of `summ`.
summ_short = summ[summ.text.str.split().apply(len)<1000].copy()
summ_short.text.str.split().apply(len).hist(bins=100, figsize=(8,2));

In [None]:
# Character-length statistics of the retained (shorter) documents.
summ_short.text.apply(len).describe()

In [None]:
# One summarisation request per retained document; `max_len` is left at the function default.
instruct = "Given a Greek legal text, provide its summary also in Greek. Only the summary should be returned."
summ_short['llama'] = [llama_prompt(text=t, instruction=instruct) for t in tqdm(summ_short.text.values)]

In [None]:
# Display one randomly sampled row, including the generated summary.
summ_short.sample()

In [None]:
#!pip install evaluate bert_score
from evaluate import load
bertscore = load("bertscore")
# Each `summary` value is a single reference string (it is passed as such to
# ROUGE in a later cell), so pass it whole: indexing it with `t[0]` scored every
# prediction against the first character of the reference only.
# "el" is the ISO 639-1 code for Greek ("gr" is not a language code).
bert_scores = [bertscore.compute(predictions=[p], references=[t], lang="el") for p,t in tqdm(zip(summ_short.llama.values, summ_short.summary.values))]

In [None]:
# Unpack the one-element score lists into columns, then show the mean F1.
summ_short['bert_f1'] = [r['f1'][0] for r in bert_scores]
summ_short['bert_precision'] = [r['precision'][0] for r in bert_scores]
summ_short['bert_recall'] = [r['recall'][0] for r in bert_scores]
summ_short.bert_f1.mean()

In [None]:
!pip install rouge
from rouge import Rouge
rouge = Rouge()
# One `get_scores` call per (generated summary, reference summary) pair.
rouge_scores = [rouge.get_scores(p, t) for p,t in tqdm(zip(summ_short.llama.values, summ_short.summary.values))]

In [None]:
# Inspect the score structure of the first pair (the cells below read its
# 'rouge-1' / 'rouge-2' / 'rouge-l' entries with keys 'f', 'p', 'r').
rouge_scores[0][0]

In [None]:
# ROUGE-1 F-score, precision and recall per document.
summ_short['rouge1_f1'] = [r[0]['rouge-1']['f'] for r in rouge_scores]
summ_short['rouge1_precision'] = [r[0]['rouge-1']['p'] for r in rouge_scores]
summ_short['rouge1_recall'] = [r[0]['rouge-1']['r'] for r in rouge_scores]

In [None]:
# ROUGE-2 F-score, precision and recall per document.
summ_short['rouge2_f1'] = [r[0]['rouge-2']['f'] for r in rouge_scores]
summ_short['rouge2_precision'] = [r[0]['rouge-2']['p'] for r in rouge_scores]
summ_short['rouge2_recall'] = [r[0]['rouge-2']['r'] for r in rouge_scores]

In [None]:
# ROUGE-L F-score, precision and recall per document.
summ_short['rougel_f1'] = [r[0]['rouge-l']['f'] for r in rouge_scores]
summ_short['rougel_precision'] = [r[0]['rouge-l']['p'] for r in rouge_scores]
summ_short['rougel_recall'] = [r[0]['rouge-l']['r'] for r in rouge_scores]

In [None]:
summ_short.to_csv('summ_koniaris.csv')
# Mean, standard deviation and standard error of every column from position 6 onwards.
# NOTE(review): the offset 6 presumably skips the non-metric columns — confirm
# against the frame's actual column order.
summ_short[summ_short.columns[6:]].agg(['mean', 'std', 'sem'])

# Language modeling

In [None]:
%%capture
!git clone https://github.com/ipavlopoulos/lm.git
from lm.markov.models import LM

In [None]:
raw_datasets = {'dritsa21':gena.DritsaDt(datasets=gr_data).get('train'),
                'barzokas20':gena.BarzokasDt(datasets=gr_data).get('train'),
                'prokopidis20':gena.ProkopidisCrawledDt(datasets=gr_data).get('train'),
                'papantoniou23': gena.PapantoniouDt(datasets=gr_data).get('train')}

train_sets = {}
test_sets = {}
for dataset_name in raw_datasets:
  print(dataset_name)
  dataset = raw_datasets[dataset_name]
  dataset = dataset[dataset.text.notna()]
  # Shuffle with a fixed seed (the same value the KMeans cells use) so the
  # train/test partition, and therefore the reported scores, are reproducible.
  dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
  # Each text is cut to its first 100 characters; rows 0-999 train, rows 1000-1499 test.
  train_sets[dataset_name] = dataset.text.apply(lambda x: x[:100]).iloc[:1000] # lower lim
  test_sets[dataset_name] = dataset.text.apply(lambda x: x[:100]).iloc[1000:1500]

In [None]:
# ppls[a][b] / ppls_std[a][b]: mean / std of `lm.cross_entropy` over the test
# texts of dataset b, for the character LM trained on dataset a.
# NOTE(review): the loop rebinds the notebook-level names `train` (the GEC frame
# earlier in the notebook) and `test`; `test` is assigned but never read here.
ppls, ppls_std = {}, {}
for dname in raw_datasets:
  train = train_sets[dname]
  test = test_sets[dname]
  lm = LM(gram="CHAR")
  # Training text: the space-joined training snippets, cut to 65000 characters.
  lm.train(' '.join(train.values)[:65000]) # length of min dataset
  ppls[dname], ppls_std[dname] = {}, {}
  for dname2 in raw_datasets:
    scores = test_sets[dname2].apply(lm.cross_entropy)
    ppls[dname][dname2] = scores.mean()
    ppls_std[dname][dname2] = scores.std()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# `ppls` is keyed as ppls[LM's training set][evaluated dataset]. pd.DataFrame()
# turns the OUTER keys into columns, which put the LMs on the x-axis although the
# labels below say x = Dataset, y = LM. Transpose so rows are LMs and columns are
# evaluated datasets, matching the labels.
# NOTE(review): the plotted values come from `lm.cross_entropy` but are labelled
# 'PPL' — confirm which quantity that method returns.
ppls_pd = pd.DataFrame(ppls).T
plt.figure(figsize=(8, 6))
sns.heatmap(ppls_pd, annot=True, cmap='coolwarm', linewidths=0.5,
            linecolor='black', cbar_kws={'label': 'PPL'});

# Add labels and a title
plt.title('PPL per LM per dataset'); plt.xlabel('Dataset'); plt.ylabel('LM');
plt.tight_layout();
plt.savefig('ppl_heatmap.pdf', dpi=300, format='PDF')

# Clustering

In [None]:
#@title Clustering Accuracy
import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

def hungarian_acc(y_true, y_pred):
    """Clustering accuracy under the best one-to-one mapping of predicted to true ids.

    Builds a (predicted id x true id) count matrix, finds the assignment that
    maximises the matched counts and returns matched / number of samples.
    Both inputs must contain non-negative integer ids of equal length.
    """
    y_pred = np.array(y_pred)
    y_true = np.array(y_true).astype(np.int64)
    assert y_pred.size == y_true.size

    n_ids = max(y_pred.max(), y_true.max()) + 1
    counts = np.zeros((n_ids, n_ids), dtype=np.int64)
    for pred_id, true_id in zip(y_pred, y_true):
        counts[pred_id, true_id] += 1

    # The solver minimises cost, so turn counts into costs by subtracting from the maximum.
    rows, cols = hungarian(counts.max() - counts)
    matched = sum([counts[r, c] for r, c in zip(rows, cols)])
    return matched * 1.0 / y_pred.size

In [None]:
%%capture
# 'test' split of the Papaloukas dataset (cell output is silenced by %%capture).
papaloukas = gena.PapaloukasDt(datasets=gr_data).get('test')

In [None]:
# Display one randomly sampled row.
papaloukas.sample()

In [None]:
import seaborn as sns
# Violin plot of text length in characters, with lengths capped at 6000.
sns.violinplot(papaloukas.text.apply(lambda x: min(len(x), 6000)));
sns.despine(left=True, bottom=True);

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Convert the text documents to a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(papaloukas.text.values)
# NOTE(review): the dense copy is not read by any cell visible in this notebook
# (KMeans below is fitted on `tfidf_matrix`) — confirm it is still needed.
tfidf_matrix_dense = tfidf_matrix.toarray()

# Print the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)

In [None]:
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari

# `test` is another name for the same DataFrame object as `papaloukas`, so the
# `kmeans{K}` columns written below are added to `papaloukas` as well.
test = papaloukas
# kmeans for k equal to number of labels (volumes, chapters, subjects)
# NOTE(review): all three values of K are scored against `test.volume` in this loop;
# the next cell re-scores each K against its matching label column.
for num_clusters in (47, 374, 1685):
  print(f"K={num_clusters}")
  km = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
  km.fit(tfidf_matrix)
  test[f'kmeans{num_clusters}'] = km.labels_
  print(f"NMI: {nmi(test.volume.values, km.labels_):.4f}")
  print(f"AMI: {ami(test.volume.values, km.labels_):.4f}")
  print(f"ACC: {hungarian_acc(test.volume.values, km.labels_):.4f}")
  print()

In [None]:
# Maps each K to the label column it should be compared with, then scores the
# stored `kmeans{K}` assignments against that column.
ground_truth = {47: 'volume', 374:'chapter', 1685:'subject'}
for num_clusters in ground_truth:
  print(f"K={num_clusters}")
  print(f"NMI: {nmi(test[ground_truth[num_clusters]].values, test[f'kmeans{num_clusters}']):.4f}")
  print(f"AMI: {ami(test[ground_truth[num_clusters]], test[f'kmeans{num_clusters}']):.4f}")
  print(f"ACC: {hungarian_acc(test[ground_truth[num_clusters]], test[f'kmeans{num_clusters}']):.4f}")
  print()

## Instructor
* Texts are up to 6k characters long
* Instructor's tokeniser has a limit of 512 tokens
* The texts are in Greek, whereas Instructor is not multilingual

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

model = SentenceTransformer('hkunlp/instructor-large') # Using a sentence-transformer model
instruction = "Instruction: Compute representations for text clustering"
# One `encode` call per document; the instruction is prepended to the text, separated by ": ".
papaloukas['instructor'] = [model.encode(f"{instruction}: {t}") for t in tqdm(papaloukas.text.values)]
# Persist the frame with its embedding column so the encoding step can be skipped later.
papaloukas.to_pickle('papaloukas-instructor.pkl')

In [None]:
# upload the saved dataframe including the Instructor embeddings
# Reload the frame saved by the previous cell; only unpickle files you produced yourself.
papaloukas = pd.read_pickle('papaloukas-instructor.pkl')

* Run KMeans on top of the Instructor embeddings
* Limitation: Instructor is not multilingual by default

In [None]:
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import silhouette_score as sil
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize


# `test` is another name for the same DataFrame object as `papaloukas`.
test = papaloukas
# Stack the per-row embedding vectors into a 2-D array and normalise each row.
embeddings = normalize(np.array(papaloukas.instructor.tolist()))

# kmeans for k equal to number of labels (volumes, chapters, subjects)
# NOTE(review): every K in this first loop is scored against `test.volume`;
# the second loop below re-scores each K against its matching label column.
for num_clusters in (47, 374, 1685):
  print(f"K={num_clusters}")
  km = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
  km.fit(embeddings)
  test[f'kmeans{num_clusters}'] = km.labels_
  print(f"NMI: {nmi(test.volume.values, km.labels_):.4f}")
  print(f"AMI: {ami(test.volume.values, km.labels_):.4f}")
  print(f"ARI: {ari(test.volume.values, km.labels_):.4f}")
  print()

# Score the stored assignments for each K against the label column mapped to it.
ground_truth = {47: 'volume', 374:'chapter', 1685:'subject'}
for num_clusters in ground_truth:
  print(f"K={num_clusters}")
  print(f"NMI: {nmi(test[ground_truth[num_clusters]].values, test[f'kmeans{num_clusters}']):.4f}")
  print(f"AMI: {ami(test[ground_truth[num_clusters]], test[f'kmeans{num_clusters}']):.4f}")
  print(f"ARI: {ari(test[ground_truth[num_clusters]], test[f'kmeans{num_clusters}']):.4f}")
  print()

## Summarise+Translate+Embed (STE)

In [None]:
# Try the summarise-and-translate prompt on the first document only
# (input cut to 6000 characters, generation limited via max_len=200).
instruction = '''Given a legal text in Greek, summarise and translate it to English.
Limit your response to 200 tokens; out directly the English translation; return no other text (e.g., do not start with "Here is the English translation").
'''
t = papaloukas.text.iloc[0]
print(t, '\n', llama_prompt(text=t[:6000], instruction=instruction, max_len=200))

In [None]:
def sum_translate(text, max_out_len=200, max_in_len=2000):
  """
  Ask the model for an English summary-translation of a Greek legal text.
  :param text: The input text; only its first `max_in_len` characters are sent.
  :param max_out_len: Token limit stated in the instruction and passed as `max_len`.
  :param max_in_len: Number of leading characters of `text` that are sent.
  :return: The model output, or None when the request fails.
  """
  instruction = f'Given a legal text in Greek, summarise and translate it to English. Limit your response to {max_out_len} tokens; out directly the English translation; return no other text (e.g., do not start with "Here is the English translation").'
  try:
    return llama_prompt(text=text[:max_in_len], instruction=instruction, max_len=max_out_len)
  except Exception as e:
    # Best effort: report the failing text and the reason, then continue with None.
    # `except Exception` (not a bare `except`) keeps Ctrl-C / kernel interrupt
    # working during the long loop that calls this function.
    print(f'\nERROR\nTEXT: {text}\nREASON: {e}\n')
    return None

# One request per document; entries are None where `sum_translate` reported an error.
papaloukas['llama_en_sum'] = [sum_translate(t) for t in tqdm(papaloukas.text.values)]

In [None]:
from google.colab import files
# Write the frame to CSV and download the file from the Colab runtime.
f = 'papaloukas_sum_translate.csv'
papaloukas.to_csv(f)
files.download(f)

In [None]:
# Reload the saved CSV (first column as index) and show one random row.
papaloukas = pd.read_csv('papaloukas_sum_translate.csv', index_col=0)
papaloukas.sample()

* TFIDF of English summaries

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import silhouette_score as sil
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari

# tackling empty values (failed requests are stored as missing summaries)
papaloukas['llama_en_sum'] = papaloukas['llama_en_sum'].fillna('')

# Convert the text documents to a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(papaloukas.llama_en_sum.values)
tfidf_matrix_dense = tfidf_matrix.toarray()

# Print the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# `test` is another name for the same DataFrame object as `papaloukas`.
test = papaloukas

# kmeans for k equal to number of labels (volumes, chapters, subjects)
for K, level in ((47,'volume'), (374,'chapter'), (1685,'subject')):
  print(f"K={K}")
  km = KMeans(n_clusters=K, random_state=42, n_init='auto')
  km.fit(tfidf_matrix)
  test[f'kmeans{K}'] = km.labels_
  print(f"NMI: {nmi(test[level].values, km.labels_):.4f}")
  print(f"AMI: {ami(test[level].values, km.labels_):.4f}")
  # Use the loop's own `level` rather than `ground_truth[K]`: that dict is only
  # defined in earlier cells, so this cell failed when run after reloading the CSV
  # in a fresh kernel. The column selected is the same.
  print(f"ACC: {hungarian_acc(test[level], test[f'kmeans{K}']):.4f}")
  print()

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

model = SentenceTransformer('hkunlp/instructor-large') # Using a sentence-transformer model
instruction = "Compute an embedding for this English legal text for clustering: "
# NOTE(review): `instruction` already ends with ": " and the f-string adds another
# ": ", so the encoded text contains ": : " before the summary — confirm this is intended.
papaloukas['ste'] = [model.encode(f"{instruction}: {t}") for t in tqdm(papaloukas.llama_en_sum.values)]

In [None]:
# Pickle the frame (the embedding column holds arrays) and download the file from Colab.
papaloukas.to_pickle('papaloukas_sum_translate_embed.pkl', protocol=4)
from google.colab import files
files.download('papaloukas_sum_translate_embed.pkl')

In [None]:
# Reload the pickled frame; only unpickle files you produced yourself.
papaloukas = pd.read_pickle('papaloukas_sum_translate_embed.pkl')

In [None]:
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import silhouette_score as sil
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import adjusted_rand_score as ari
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

# `test` is another name for the same DataFrame object as `papaloukas`.
test = papaloukas
# Stack the per-row embedding vectors into one 2-D array.
# NOTE(review): unlike the Instructor cell above, these embeddings are not passed
# through `normalize` before clustering — confirm the difference is intended.
embeddings = np.vstack(test.ste.values)

for K, level in ((47,'volume'), (374,'chapter'), (1685,'subject')):
  print(f"K={K}")
  km = KMeans(n_clusters=K, random_state=42, n_init='auto')
  km.fit(embeddings)
  test[f'kmeans{K}'] = km.labels_
  print(f"NMI: {nmi(test[level].values, km.labels_):.4f}")
  print(f"AMI: {ami(test[level].values, km.labels_):.4f}")
  # Use the loop's own `level` rather than `ground_truth[K]`: that dict is only
  # defined in earlier cells, so this cell failed when run right after loading
  # the pickle in a fresh kernel. The column selected is the same.
  print(f"ACC: {hungarian_acc(test[level], test[f'kmeans{K}']):.4f}")
  print()

# Structure prediction (POS, NER)

## NER

In [None]:
# 'test' split of the Barziokas dataset; the cell ends by showing one random row.
barziokas = gena.BarziokasDt(datasets=gr_data).get('test')
barziokas.sample()

In [None]:
# prompt: generate a prompt for llama for NER, using the following labels: ['S-LOC', 'O', 'B-ORG', 'E-ORG', 'B-PERSON', 'E-PERSON', 'I-ORG', 'B-LOC', 'E-LOC', 'S-PERSON', 'I-PERSON', 'S-ORG', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'I-LOC']. A sequence of labels should be returned and no other text. Add a few examples.

instruction = """You are a Greek NLP expert and you are given a text. Return a sequence of NER labels for each token in the text. The labels should be chosen from the following list: ['S-LOC', 'O', 'B-ORG', 'E-ORG', 'B-PERSON', 'E-PERSON', 'I-ORG', 'B-LOC', 'E-LOC', 'S-PERSON', 'I-PERSON', 'S-ORG', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'I-LOC']. Do not return any other text.
Here are a couple of labelled examples:

user: Η Αθήνα είναι η πρωτεύουσα της Ελλάδας.
assistant: B-LOC E-LOC O O O B-LOC E-LOC O

user: Ο Αλέξης Τσίπρας είναι πρωθυπουργός της Ελλάδας.
assistant: B-PERSON I-PERSON E-PERSON O O B-LOC E-LOC O
"""
# One request per sentence, using the instruction with the two in-prompt examples.
barziokas['llama_demo'] = [llama_prompt(text=t, instruction=instruction) for t in tqdm(barziokas.sentence.values)]

In [None]:
# Display the raw model outputs to see which format they follow.
barziokas.llama_demo

In [None]:
import ast
ner_labels = ['S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG', 'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O']
def extract_list(text, labels = ner_labels):
  """
  Turn a raw model output into a list of NER tags.
  Handles three output shapes: a quoted Python-style list (['O', 'S-LOC']),
  an unquoted bracketed list ([O, S-LOC]) and a whitespace-separated
  sequence (O S-LOC), which is the format the prompt examples use.
  :param text: The raw model output; non-string values are returned unchanged.
  :param labels: The known tags; an unquoted output with none of them yields [].
  :return: The list of tags, or [] when nothing usable can be extracted.
  """
  if not isinstance(text, str):
    return text
  text = text.strip().replace('\n', '')
  start = text.find('[')
  end = text.find(']', start + 1) if start != -1 else -1
  if start != -1 and end != -1:
    bracketed = text[start : end+1]
    if ('"' in bracketed) or ("'" in bracketed):
      # Quoted list: let the Python literal parser do the work.
      try:
        return ast.literal_eval(bracketed)
      except Exception:
        return []
    # Unquoted list: split the items directly. Quoting labels with str.replace
    # corrupted them, because the label 'O' also occurs inside 'S-LOC', 'B-ORG', ...
    tokens = bracketed[1:-1].replace(',', ' ').split()
  else:
    # Whitespace-separated tags. The previous code passed this list to
    # ast.literal_eval, which always failed and turned every such output into [].
    tokens = text.split()
  known = set(labels)
  return tokens if any(tok in known for tok in tokens) else []

# Parse every raw output into a tag list; missing outputs are treated as the empty list '[]'.
barziokas['llama_list'] = barziokas.llama_demo.fillna('[]').apply(extract_list)

In [None]:
# Align each prediction with the gold length: cut it when too long, pad with 'O' when too short.
predictions = barziokas.apply(lambda row: row.llama_list[:len(row.ne_tag4)] + ['O' for _ in range(len(row.ne_tag4)-len(row.llama_list))], axis=1)
gold = barziokas.ne_tag4
from sklearn.metrics import classification_report
# Token-level report: `explode` flattens the per-sentence lists into one tag per row.
print(classification_report(gold.explode(), predictions.explode(), labels=ner_labels, zero_division=0))

### Original attempt

In [None]:
instruction = '''
Identify and label named entities in a given sentence using the specified NER tag set: `['S-LOC', 'O', 'B-ORG', 'E-ORG', 'B-PERSON', 'E-PERSON', 'I-ORG', 'B-LOC', 'E-LOC', 'S-PERSON', 'I-PERSON', 'S-ORG', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'I-LOC']`.
You will be provided with a list of words, which form a sentence. Your task is to analyze this sentence and assign the appropriate named entity tag to each word.
- For single-token entities, use the `S-` prefix followed by the appropriate entity type (e.g., `S-LOC` for a single-token location).
- For multi-token entities, use the `B-`, `I-`, and `E-` prefixes to denote the beginning, inside, and end of the entity, respectively (e.g., `B-PERSON`, `I-PERSON`, `E-PERSON` for a person entity spanning multiple tokens).
- Use the `O` tag for words that are not part of any named entity.
Generate just a list with just the elements being the named entity tags corresponding to each word in the input list. Ensure that the tags correctly represent the boundaries and types of named entities as per the tag set provided.
No text should be provided other than the list.

Tag Set:
    - `S-LOC`: Single-token location entity.
    - `O`: Outside any named entity.
    - `B-ORG`: Beginning of an organization entity.
    - `E-ORG`: End of an organization entity.
    - `B-PERSON`: Beginning of a person entity.
    - `E-PERSON`: End of a person entity.
    - `I-ORG`: Inside an organization entity.
    - `B-LOC`: Beginning of a location entity.
    - `E-LOC`: End of a location entity.
    - `S-PERSON`: Single-token person entity.
    - `I-PERSON`: Inside a person entity.
    - `S-ORG`: Single-token organization entity.
    - `S-MISC`: Single-token miscellaneous entity.
    - `B-MISC`: Beginning of a miscellaneous entity.
    - `I-MISC`: Inside a miscellaneous entity.
    - `E-MISC`: End of a miscellaneous entity.
    - `I-LOC`: Inside a location entity.
'''
# One request per sentence with the longer instruction above (tag-set description, no examples).
barziokas['llama'] = [llama_prompt(text=t, instruction=instruction) for t in tqdm(barziokas.sentence.values)]

In [None]:
# Save the frame with the raw outputs; the commented line reloads it instead of re-querying.
barziokas.to_csv('barziokas.csv')
#barziokas = pd.read_csv('barziokas.csv')

In [None]:
import ast
ner_labels = ['S-LOC', 'B-LOC', 'I-LOC', 'E-LOC', 'S-ORG', 'B-ORG', 'I-ORG', 'E-ORG', 'S-PERSON', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'S-MISC', 'B-MISC', 'I-MISC', 'E-MISC', 'O']
def extract_list(text, labels = ner_labels):
  """
  Turn a raw model output into a list of NER tags.
  Only a bracketed list is accepted, quoted (['O', 'S-LOC']) or unquoted
  ([O, S-LOC]); any output without such a list yields [].
  :param text: The raw model output; non-string values are returned unchanged.
  :param labels: The known tags; an unquoted list with none of them yields [].
  :return: The list of tags, or [] when nothing usable can be extracted.
  """
  if not isinstance(text, str):
    return text
  text = text.strip().replace('\n', '')
  start = text.find('[')
  end = text.find(']', start + 1) if start != -1 else -1
  if start == -1 or end == -1:
    return []
  bracketed = text[start : end+1]
  if ('"' in bracketed) or ("'" in bracketed):
    # Quoted list: let the Python literal parser do the work.
    try:
      return ast.literal_eval(bracketed)
    except Exception:
      return []
  # Unquoted list: split the items directly. Quoting labels with str.replace
  # corrupted them, because the label 'O' also occurs inside 'S-LOC', 'B-ORG', ...,
  # so every unquoted list used to be parsed as [].
  tokens = bracketed[1:-1].replace(',', ' ').split()
  known = set(labels)
  return tokens if any(tok in known for tok in tokens) else []

# Parse every raw output into a tag list; this overwrites the `llama_list` column of the demo run.
barziokas['llama_list'] = barziokas.llama.fillna('[]').apply(extract_list)

In [None]:
# Align each prediction with the gold length: cut it when too long, pad with 'O' when too short.
predictions = barziokas.apply(lambda row: row.llama_list[:len(row.ne_tag4)] + ['O' for _ in range(len(row.ne_tag4)-len(row.llama_list))], axis=1)
gold = barziokas.ne_tag4

In [None]:
from sklearn.metrics import classification_report
# Token-level report: `explode` flattens the per-sentence lists into one tag per row.
print(classification_report(gold.explode(), predictions.explode(), labels=ner_labels, zero_division=0))

## POS

In [None]:
# Build the Prokopidis UD wrapper and show the first rows of its 'test' split.
prokopidis_ud = gena.ProkopidisUdDt( datasets=gr_data)
prokopidis_ud.get('test').head()

In [None]:
# Distinct values of column `x` in the test split; used below as the POS label set.
labels=prokopidis_ud.get('test').x.unique()
# Frequency of each label, most frequent first.
prokopidis_ud.get('test').x.value_counts()

In [None]:
# (rows, columns) of the test split.
prokopidis_ud.get('test').shape

In [None]:
# Ratio of the least frequent label's count to the most frequent label's count.
prokopidis_ud.get('test').x.value_counts().iloc[-1]/prokopidis_ud.get('test').x.value_counts().iloc[0]

In [None]:
# Regroup the rows by `s`: one row per group, holding the list of `w` values and
# the list of `x` values (the tokens and their tags, as used by the prompt below).
text_level_pos = pd.DataFrame()
text_level_pos['w'] = prokopidis_ud.get('test').groupby('s').w.apply(list)
text_level_pos['x'] = prokopidis_ud.get('test').groupby('s').x.apply(list)
text_level_pos.head()

In [None]:
# Summary statistics of the number of tokens per group.
text_level_pos.w.apply(len).describe()

In [None]:
# prompt: generate a prompt for llama for POS tagging, defining the following categories: 'NOUN', 'DET', 'PUNCT', 'VERB', 'ADJ', 'ADP', 'AUX', 'ADV', 'PRON', 'CCONJ', 'PROPN', '_', 'SCONJ', 'NUM', 'X', 'PART'. The input will already be tokenised to assist keeping the same length. Provide an explanation per named entity and provide a few examples.

instruction = """You are a Greek NLP expert and you are given a text that has already been tokenised. Return a sequence of Part-of-speech (POS) tags for each token in the text.
The labels should be chosen from the following list: 'NOUN', 'DET', 'PUNCT', 'VERB', 'ADJ', 'ADP', 'AUX', 'ADV', 'PRON', 'CCONJ', 'PROPN', '_', 'SCONJ', 'NUM', 'X', 'PART'.
Do not return any other text.

**Explanation of POS Tags:**

* **NOUN:** Noun (e.g., "άνθρωπος", "πόλη")
* **DET:** Determiner (e.g., "ο", "η", "το")
* **PUNCT:** Punctuation (e.g., ".", ",", "!")
* **VERB:** Verb (e.g., "γράφω", "τρώω")
* **ADJ:** Adjective (e.g., "καλός", "μεγάλος")
* **ADP:** Adposition (prepositions and postpositions) (e.g., "σε", "με", "από")
* **AUX:** Auxiliary verb (e.g., "είμαι", "έχω")
* **ADV:** Adverb (e.g., "γρήγορα", "πολύ")
* **PRON:** Pronoun (e.g., "εγώ", "εσύ", "αυτός")
* **CCONJ:** Coordinating conjunction (e.g., "και", "ή")
* **PROPN:** Proper noun (e.g., "Αθήνα", "Γιώργος")
* **_:** Represents an unknown or missing tag.
* **SCONJ:** Subordinating conjunction (e.g., "ότι", "αν")
* **NUM:** Numeral (e.g., "ένα", "δύο")
* **X:** Other (e.g., foreign words, abbreviations)
* **PART:** Particle (e.g., "να", "μη")

**Examples:**

* **Input:** ["Ο", "Γιώργος", "γράφει", "ένα", "βιβλίο", "."]
* **Output:** ['DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'PUNCT']

* **Input:** ["Η", "γρήγορη", "αλώπηκα", "τρέχει", "."]
* **Output:** ['DET', 'ADJ', 'NOUN', 'VERB', 'PUNCT']

"""

# One request per token list; the list itself is formatted into the prompt as `text`.
text_level_pos['llama'] = [llama_prompt(text=t, instruction=instruction) for t in tqdm(text_level_pos.w.values)]

In [None]:
def extract_list(text, labels = labels):
  """
  Parse a raw model output into a list of whitespace-separated tokens.
  Quotes, newlines, full stops, commas and square brackets are removed first.
  :param text: The raw model output (a string).
  :param labels: The known labels.
  :return: The tokens, or [] when none of them is a known label.
  """
  # Delete all punctuation the model may wrap the tags in, in a single pass.
  cleaned = text.strip().translate(str.maketrans('', '', '"\n\'.,[]'))
  tokens = cleaned.split()
  if set(labels).isdisjoint(tokens):
    return []
  return tokens

# Parse the raw outputs of the run with in-prompt examples into token lists.
text_level_pos['llama_post'] = text_level_pos['llama'].apply(lambda x: extract_list(x, labels=labels))

In [None]:
import numpy as np
# Align each prediction with the gold length: cut it when too long, pad with
# randomly drawn labels when too short.
# NOTE(review): `np.random.choice` is not seeded in the visible notebook, so the
# padding (and the report) can change between runs; the NER evaluation pads with
# the fixed tag 'O' instead — confirm the random padding is intended.
predictions = text_level_pos.apply(lambda row: row.llama_post[:len(row.x)] + [np.random.choice(labels) for _ in range(len(row.x)-len(row.llama_post))], axis=1)
gold = text_level_pos.x
from sklearn.metrics import classification_report
# Token-level report: `explode` flattens the per-sentence lists into one tag per row.
print(classification_report(gold.explode(), predictions.explode(), labels=labels, zero_division=0))

In [None]:
# prompt: generate a prompt for llama for POS tagging, defining the following categories: 'NOUN', 'DET', 'PUNCT', 'VERB', 'ADJ', 'ADP', 'AUX', 'ADV', 'PRON', 'CCONJ', 'PROPN', '_', 'SCONJ', 'NUM', 'X', 'PART'. The input will already be tokenised to assist keeping the same length. Provide an explanation per named entity and provide a few examples.

# Instruction without worked input/output examples; the responses are stored in 'llama0s'.
instruction = """You are a Greek NLP expert and you are given a text that has already been tokenised. Return a sequence of Part-of-speech (POS) tags for each token in the text.
The labels should be chosen from the following list: 'NOUN', 'DET', 'PUNCT', 'VERB', 'ADJ', 'ADP', 'AUX', 'ADV', 'PRON', 'CCONJ', 'PROPN', '_', 'SCONJ', 'NUM', 'X', 'PART'.
Do not return any other text.

**Explanation of POS Tags:**

* **NOUN:** Noun (e.g., "άνθρωπος", "πόλη")
* **DET:** Determiner (e.g., "ο", "η", "το")
* **PUNCT:** Punctuation (e.g., ".", ",", "!")
* **VERB:** Verb (e.g., "γράφω", "τρώω")
* **ADJ:** Adjective (e.g., "καλός", "μεγάλος")
* **ADP:** Adposition (prepositions and postpositions) (e.g., "σε", "με", "από")
* **AUX:** Auxiliary verb (e.g., "είμαι", "έχω")
* **ADV:** Adverb (e.g., "γρήγορα", "πολύ")
* **PRON:** Pronoun (e.g., "εγώ", "εσύ", "αυτός")
* **CCONJ:** Coordinating conjunction (e.g., "και", "ή")
* **PROPN:** Proper noun (e.g., "Αθήνα", "Γιώργος")
* **_:** Represents an unknown or missing tag.
* **SCONJ:** Subordinating conjunction (e.g., "ότι", "αν")
* **NUM:** Numeral (e.g., "ένα", "δύο")
* **X:** Other (e.g., foreign words, abbreviations)
* **PART:** Particle (e.g., "να", "μη")

"""

# One model call per tokenised sentence, in row order.
text_level_pos['llama0s'] = list(map(
    lambda tokens: llama_prompt(text=tokens, instruction=instruction),
    tqdm(text_level_pos.w.values)))

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# NOTE: this reuses the 'llama_post' column, replacing whatever was stored
# there with the post-processed 'llama0s' responses.
text_level_pos['llama_post'] = text_level_pos['llama0s'].apply(lambda x: extract_list(x, labels=labels))

# Fixed seed: responses shorter than the gold sequence are padded with random
# labels, and without a seed the report below changes on every run.
pad_rng = np.random.default_rng(42)

# Align every prediction with its gold sequence: truncate the surplus, pad the
# deficit (range() of a negative number is empty, so long responses get no padding).
predictions = text_level_pos.apply(
    lambda row: row.llama_post[:len(row.x)]
    + [pad_rng.choice(labels) for _ in range(len(row.x) - len(row.llama_post))],
    axis=1)
gold = text_level_pos.x
print(classification_report(gold.explode(), predictions.explode(), labels=labels, zero_division=0))

In [None]:
# Same report, restricted to the first 175 rows of gold/predictions.
print(classification_report(y_true=gold.iloc[:175].explode(),
                            y_pred=predictions.iloc[:175].explode(),
                            labels=labels,
                            zero_division=0))

In [None]:
# Persist the frame, including the raw and post-processed model responses.
text_level_pos.to_csv(path_or_buf='pos-llama70b.csv')

# Authorship analysis
* Studying whether LLMs have encoded data from open books

In [None]:
# Fetch the 'train' split exposed by gena.BarzokasDt and display one random row.
barzokas = gena.BarzokasDt(datasets=gr_data).get('train')
barzokas.sample(n=1)

In [None]:
# Drop rows whose `text` repeats an earlier row; the row count is printed
# before and after so the effect is visible.
print(len(barzokas))
print('Removing duplicates...')
barzokas.drop_duplicates(subset=['text'], inplace=True)
print(len(barzokas))

In [None]:
# Keep only the rows whose status is 'parsable'.
barzokas_df = barzokas.loc[barzokas['status'] == 'parsable']
barzokas_df.shape

In [None]:
# Distribution of text lengths (log-scaled counts): texts must be long enough
# to sample an excerpt from within them.
barzokas_df['tokensCount'].hist(log=True, bins=500, figsize=(8, 2));

In [None]:
# Keep only texts with more than 1000 tokens. The explicit copy makes
# barzokas_df an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
barzokas_df = barzokas_df[barzokas_df.tokensCount>1000].copy()
barzokas_df.shape

In [None]:
# sampling 1000 characters from the middle of the text
def get_excerpt(text, size=1000):
  """
  Return up to `size` characters taken from the middle of `text`.

  :param text: The string to sample from.
  :param size: Maximum excerpt length in characters (default 1000).
  :return: The central `size` characters, or the whole text if it is shorter.
  """
  # Clamp at 0: a negative start would be read as an offset from the end of the
  # string and silently return only the tail of a short text.
  start = max(0, len(text) // 2 - size // 2)
  return text[start:start + size]

# One mid-text excerpt per row.
barzokas_df['excerpt'] = barzokas_df['text'].map(get_excerpt)

In [None]:
# Export the excerpt together with its metadata columns (no index column).
excerpt_columns = ['id', 'title', 'excerpt', 'author', 'type', 'publishedYear', 'tokensCount']
barzokas_df[excerpt_columns].to_csv('barzokas_excerpt.csv', index=False)

In [None]:
# The 17 authors with the most rows.
pop_authors = barzokas_df.author.value_counts().iloc[:17].index.tolist()
# Explicit copy: barzokas_pop receives new columns later on, and assigning to a
# filtered slice of barzokas_df triggers pandas' SettingWithCopyWarning.
barzokas_pop = barzokas_df[barzokas_df.author.isin(pop_authors)].copy()
print(barzokas_pop.shape)
barzokas_pop.author.value_counts().plot.barh();

In [None]:
# Summary statistics of the excerpt lengths, in characters.
barzokas_pop['excerpt'].map(len).describe()

In [None]:
# Ratio of the last to the first entry of the author counts
# (value_counts sorts in descending order by default).
barzokas_pop['author'].value_counts().pipe(lambda counts: counts.iloc[-1] / counts.iloc[0])

In [None]:
# Prompt listing the candidate authors inline, followed by one model call per excerpt.
instruct = ''.join([
    "Given an excerpt from a Greek book, return the author it is from by picking from the following authors: ",
    ', '.join(pop_authors),
    ' Return only the name of the author, nothing else.',
])
barzokas_pop['llama'] = list(map(
    lambda excerpt: llama_prompt(text=excerpt, instruction=instruct),
    tqdm(barzokas_pop.excerpt.values)))

In [None]:
def _canonical_author(answer):
  """Map a model answer onto a fixed author name when it contains one of the known markers."""
  # Checked in order; the first marker found decides the result.
  for marker, canonical in (("ανάση", "Θανάσης Τριαρίδης"),
                            ("Plato", "Plato"),
                            ("Κολιόπουλος", "Κολιόπουλος")):
    if marker in answer:
      return canonical
  return answer

barzokas_pop['llama'] = barzokas_pop.llama.apply(_canonical_author)

In [None]:
from sklearn.metrics import classification_report

# Score only the first 175 rows; answers are stripped of surrounding whitespace
# before being compared with the gold author.
barzokas_pop_mini = barzokas_pop.iloc[:175]
print(classification_report(y_true=barzokas_pop_mini['author'],
                            y_pred=barzokas_pop_mini['llama'].str.strip(),
                            labels=pop_authors,
                            zero_division=0))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix over the candidate authors, in pop_authors order
predicted_authors = barzokas_pop_mini.llama.str.strip()
cm = confusion_matrix(barzokas_pop_mini.author, predicted_authors, labels=pop_authors)

# Heatmap appearance: integer annotations, no colour bar, square outlined cells
heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False,
                     xticklabels=pop_authors, yticklabels=pop_authors,
                     linewidths=0.5, linecolor='black', square=True)

plt.figure(figsize=(10, 8))  # large enough for the author names to stay readable
sns.heatmap(cm, **heatmap_style)

# Title and axis labels
plt.title("Confusion Matrix", fontsize=15)
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)

# Slanted x tick labels so the names do not overlap
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)

# Write the figure to disk before showing it
plt.savefig("confusion_matrix.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Persist the excerpts together with the model's author answers.
barzokas_pop.to_csv(path_or_buf='barzokas_pop_llama70b.csv')

# Raw data analysis

In [None]:
#@title download the data
# 'train' split of each raw-text dataset, keyed by a short dataset name.
raw_data = {
    'prokopidis': gena.ProkopidisCrawledDt(datasets=gr_data).get('train'),
    'dritsa': gena.DritsaDt(datasets=gr_data).get('train'),
    'papantoniou': gena.PapantoniouDt(datasets=gr_data).get('train'),
}

* Train a character-level language model per dataset.
* Compute the BPC per dataset.
* Draw a BPC heatmap; red cells (high BPC) mark the datasets whose text most surprises a given language model.

In [None]:
#!git clone https://github.com/ipavlopoulos/lm.git
from lm.markov.models import LM

# Per dataset: the first 1000 shuffled texts form the training sample and the
# next 500 the test sample; every text is cut to its first 100 characters.
train_sets = {}
test_sets = {}
for dataset_name in raw_data:
  print(dataset_name)
  dataset = raw_data[dataset_name]
  dataset = dataset[dataset.text.notna()]
  # Seeded shuffle, so the split (and every score computed from it) is reproducible.
  dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
  # Truncate only the 1500 texts that are actually used, not the whole dataset.
  snippets = dataset.text.iloc[:1500].apply(lambda x: x[:100])
  train_sets[dataset_name] = snippets.iloc[:1000] # lower lim
  test_sets[dataset_name] = snippets.iloc[1000:1500]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean BPC and its standard error, keyed as [training dataset][test dataset]
ppls = {}
ppls_sem = {}

# Train one character-level LM per dataset and score every test set with it
for dname in raw_data:
    train = train_sets[dname]
    lm = LM(gram="CHAR")
    lm.train(' '.join(train.values)[:65000])  # length of min dataset
    ppls[dname] = {}
    ppls_sem[dname] = {}
    for dname2 in raw_data:
        scores = test_sets[dname2].apply(lm.bpc)
        ppls[dname][dname2] = scores.mean()
        ppls_sem[dname][dname2] = scores.sem()

# pd.DataFrame turns the outer keys into columns; transpose so that rows are
# the training dataset and columns the test dataset, matching the axis labels.
ppls_pd = pd.DataFrame(ppls).T
ppls_sem_pd = pd.DataFrame(ppls_sem).T

# Create annotations with both mean and standard error of the mean
annotations = ppls_pd.round(2).astype(str) + " ± " + ppls_sem_pd.round(2).astype(str)

# Plot heatmap with annotations
plt.figure(figsize=(8, 6))
sns.heatmap(ppls_pd, annot=annotations, fmt='', cmap=plt.cm.coolwarm, linewidths=0.5,
            linecolor='black', cbar_kws={'label': 'BPC'})

# Add labels and a title
#plt.title('BPC per LM per dataset')
plt.xlabel('Unseen text from'); plt.ylabel('LM trained on');
plt.tight_layout();
plt.savefig('ppl_heatmap.pdf', dpi=300, format='PDF')

* Counting tokens (whitespace-separated words)

In [None]:
# Approximate token count per dataset: number of single spaces in each
# stripped text, plus one.
tokens_num_all = 0
for dataset_name in raw_data:
  texts = raw_data[dataset_name][raw_data[dataset_name].text.notna()].text.values
  tokens_num = sum(t.strip().count(' ')+1 for t in texts)
  print(f'{dataset_name} ==> tokens: {tokens_num}')
  tokens_num_all += tokens_num

# tokens_num_all is an int, so print it directly (len() of an int raises TypeError).
print(tokens_num_all)

In [None]:
# Total token count over all datasets (an int, so len() would raise TypeError).
print(tokens_num_all)