Uses the annotations in `6_backform_base_cutoff/` to select the true derivations from the files in `2_random_subsamples/`.
Saves the resulting samples for each suffix in `7_analysis_samples_unclean/`, to be used (you guessed it) for the analysis.

In [2]:
import pandas as pd
import os
import backformer_one as b

# Directories for the annotation files, the random subsamples and the output samples.
# The annotation directory is named exactly once so that listing the files and reading them
# cannot drift apart (the listing previously used 'backform_base_cutoff' while the reads used
# '6_backform_base_cutoff/').
ANNOT_DIR = '6_backform_base_cutoff'
SAMPLE_DIR = '2_random_subsamples'
OUT_DIR = '7_analysis_samples_unclean'

# ID the files to look at and define mapping between suffix names used in annotation files and in the samples
# (-e was not yet differentiated into -eA and -eV).
# os.listdir returns entries in arbitrary order and can include hidden entries
# (e.g. .DS_Store, .ipynb_checkpoints), so skip dot-entries and sort for a reproducible run.
ANNOT = sorted(fn for fn in os.listdir(ANNOT_DIR) if not fn.startswith('.'))
# The suffix is everything before the first underscore of the annotation file name.
ANNOT_SFXS = [fn.split('_')[0] for fn in ANNOT]
SAMPLE_SFXS = ['-e' if s in ('-eA', '-eV') else s for s in ANNOT_SFXS]

# List to collect dictionaries of metadata.
# NOTE(review): nothing in this cell appends to it; kept in case later code relies on the name.
meta_list = []

# Make sure the output directory exists before the first to_csv call.
os.makedirs(OUT_DIR, exist_ok=True)

for annot_name, curr_sfx, sample_sfx in zip(ANNOT, ANNOT_SFXS, SAMPLE_SFXS):

    # Read in the annotation and sample files and postprocess.
    annot_fn = os.path.join(ANNOT_DIR, annot_name)
    sample_fn = os.path.join(SAMPLE_DIR, sample_sfx + '_subsample.csv')

    curr_samp = pd.read_csv(sample_fn)
    curr_samp = b.prep_sfx_df(curr_samp)

    # Keep only the annotation rows marked as true bases (true_base == 1).
    curr_annot = pd.read_csv(annot_fn)
    true_bases = curr_annot[curr_annot.true_base == 1][['lemma', 'unique_candidates', 'pos']]

    # Left merge with true_bases on the left: every true base is kept and joined to its matching
    # rows of curr_samp on the columns both frames share (the pandas default when `on` is not given).
    # Sample rows without a matching true base are dropped; a true base without any match in the
    # sample would remain as a row with missing sample values.
    subset_samp = true_bases.merge(curr_samp, how='left')
    subset_samp = subset_samp.rename(
        columns={'unique_candidates': 'base', 'pos': 'base_pos', 'morph': 'sfx'}
    )
    subset_samp = subset_samp[['sfx', 'word', 'lemma', 'base', 'base_pos', 'doc.url', 'doc.id', 's.idx']]
    subset_samp.to_csv(os.path.join(OUT_DIR, curr_sfx + '_sample.csv'), index=False)

    print('Done', curr_sfx)

Done -age
Done -ament
Done -and
Done -ant
Done -anz
Done -ateur
Done -ation
Done -ator
Done -atur
Done -eA
Done -el
Done -ement
Done -end
Done -ent
Done -enz
Done -er
Done -eur
Done -eV
Done -heit
Done -ie
Done -iker
Done -ikum
Done -ik
Done -iment
Done -ismus
Done -ist
Done -itaet
Done -iteur
Done -ition
Done -itur
Done -ium
Done -ling
Done -nis
Done -schaft
Done -ung
