In [None]:
import itertools
import os

import numpy as np
import pandas as pd
import stanza.resources.common
from stanza.utils.conll import CoNLL
from textcomplexity.cli import surface_based, sentence_based, pos_based, dependency_based, read_language_definition
from textcomplexity.utils import conllu

# Data split to process. The other splits used with this notebook are
# 'training' and 'valid'; change the value here to switch.
dataset = 'test'
# Input CSV for the selected split.
CSV_URL = f'../data/{dataset}_set.csv'

# Language code passed to stanza and used to pick the language definition file.
LANGUAGE = 'en'
# Preset name forwarded to every textcomplexity measure group.
PRESET = 'all'

In [None]:
# Read the split, drop exact duplicate rows, then shuffle with a fixed seed and
# renumber the rows 0..n-1, all as one chain so `df` is never mutated in place.
df = (
    pd.read_csv(CSV_URL)
    .drop_duplicates()
    .sample(frac=1, random_state=9)
    .reset_index(drop=True)
)
# Working frame holding only the identifier and the English sentence.
df_encoded = pd.DataFrame(df.loc[:, ['ID', 'Sentence EN']])

In [None]:
# Make sure stanza's resources.json is present in the default model directory
# before asking for the language model.
stanza_resources_path = os.path.join(
    stanza.resources.common.DEFAULT_MODEL_DIR,
    "resources.json",
)
if not os.path.isfile(stanza_resources_path):
    stanza.resources.common.download_resources_json()

# Download the model for LANGUAGE, then build the pipeline reused by encode_sentence.
stanza.download(LANGUAGE)
stanza_pipeline = stanza.Pipeline(
    LANGUAGE,
    processors="tokenize,mwt,pos,lemma,depparse",
)


def encode_sentence(sentence):
    """Parse a text with the stanza pipeline and return it as CoNLL lines.

    Args:
        sentence: Raw text; surrounding whitespace is stripped before parsing.

    Returns:
        list of str: one tab-joined line per token, with an empty string
        appended after the tokens of each parsed sentence.
    """
    parsed = stanza_pipeline(sentence.strip())
    conll_sentences = CoNLL.convert_dict(parsed.to_dict())
    lines = []
    for token_rows in conll_sentences:
        lines.extend('\t'.join(fields) for fields in token_rows)
        # Empty string marks the end of this sentence.
        lines.append('')
    return lines


# Run encode_sentence on every 'Sentence EN' value and store the resulting list
# of lines in a new 'sentence_encoded' column.
df_encoded['sentence_encoded'] = df_encoded['Sentence EN'].apply(encode_sentence)

In [None]:
# Loaded language definitions keyed by file path, so the definition file is
# loaded once instead of once per sentence.
_LANGUAGE_DEFINITIONS = {}


def _language_definition(path):
    """Return ``read_language_definition(path)``, loading each path only once."""
    # NOTE(review): assumes the measure functions treat the returned objects as
    # read-only, so sharing them between calls is safe -- confirm.
    if path not in _LANGUAGE_DEFINITIONS:
        _LANGUAGE_DEFINITIONS[path] = read_language_definition(path)
    return _LANGUAGE_DEFINITIONS[path]


def extract_features(sentence_encoded):
    """Compute the textcomplexity measures for one encoded text.

    Args:
        sentence_encoded: List of lines as returned by ``encode_sentence``:
            tab-separated token lines with an empty string after each sentence.

    Returns:
        dict mapping each measure name to a dict with the key 'value' and,
        when the measure reports them, 'stdev', 'length' and 'length stdev'.
        The dict is empty when the input has fewer than two lines, because
        there is no token line to analyse.
    """
    # Without at least one token line plus its separator there is nothing to
    # read the window size from; return no features (NaN downstream) instead
    # of aborting the whole run with an IndexError.
    if len(sentence_encoded) < 2:
        return {}

    _language, punct_tags, name_tags, open_tags, reference_frequency_list = _language_definition(
        f'resources/{LANGUAGE}.json')

    # The lines are passed in twice, so the measures see two copies of the text.
    sentences, graphs = zip(*conllu.read_conllu_sentences(sentence_encoded * 2))
    tokens = list(itertools.chain.from_iterable(sentences))

    # First column of the last token line (the final element is the '' separator).
    # NOTE(review): if the text was split into several sentences this is taken
    # from the last one only -- confirm that is the intended window size.
    window_size = int(sentence_encoded[-2].split('\t')[0])

    results = []
    try:
        results.extend(surface_based(tokens, window_size, PRESET))
    except Exception as err:
        # Best effort: keep going without the surface measures, but report why.
        # KeyboardInterrupt/SystemExit are deliberately not caught here.
        print(f'Warning: surface_based failed ({err!r}). Features are set to NaN.')
    results.extend(pos_based(tokens, punct_tags, name_tags, open_tags, reference_frequency_list, PRESET))
    results.extend(sentence_based(sentences, punct_tags, PRESET))
    results.extend(dependency_based(graphs, PRESET))

    all_results = {}
    for r in results:
        entry = {'value': r.value}
        if r.stdev is not None:
            entry['stdev'] = r.stdev
        if r.length is not None:
            entry['length'] = r.length
            entry['length stdev'] = r.length_stdev
        all_results[r.name] = entry
    return all_results


# Run extract_features on every encoded sentence; each element of `features`
# is the nested measure dict returned for that row.
features = df_encoded['sentence_encoded'].apply(extract_features)

In [None]:
# Column names are '<measure>_<statistic>' in first-seen order. dict.fromkeys
# de-duplicates with constant-time lookups instead of scanning a growing list.
feature_names = list(dict.fromkeys(
    f'{measure}_{statistic}'
    for row in features
    for measure, statistics in row.items()
    for statistic in statistics
))

# One NaN-filled float column per feature; anything a row did not report stays NaN.
features_dict = {name: np.full(len(features), np.nan) for name in feature_names}

# Fill by position rather than by index label, so the arrays line up with
# `features` regardless of what its index looks like.
for position, row in enumerate(features):
    for measure, statistics in row.items():
        for statistic, value in statistics.items():
            features_dict[f'{measure}_{statistic}'][position] = value

# Reuse the index of `features` so the join aligns the columns row for row.
df_features = pd.DataFrame(features_dict, index=features.index)
df_features = df_encoded.join(df_features)

In [None]:
# Write the combined frame to the features folder. The folder is created first
# so a missing directory cannot fail the export after every row was processed.
output_path = f'../data/features/features_{dataset}_complexity_en.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_features.to_csv(output_path, index=False)