Runs a bunch of formatting checks over the files contained in `backform_samples_nano_annot/` to make sure they're consistent.

In [1]:
import pandas as pd
import os

# Collect the annotation files (names ending in 'base_annot.csv') in a stable,
# sorted order so the report below is printed in the same order on every run.
# NOTE(review): the directory listed here has to be the same one the files are
# later read from ('backform_samples_nano_annot/'); confirm that this is the
# directory's current name on disk.
ANNOT_FILES = sorted(
    fn for fn in os.listdir('backform_samples_nano_annot')
    if fn.endswith('base_annot.csv')
)
# The suffix is the part of each file name that precedes the first underscore.
SFXS = [fn.split('_')[0] for fn in ANNOT_FILES]

In [2]:
def true_lemma_contents(df):
    """
    Verifies that every value in the true_lemma column is either 0 or 1.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the lemmas whose true_lemma value is invalid,
        and prints nothing at all when the check passes.
    """
    is_valid = df['true_lemma'].isin([0, 1])
    offenders = df.loc[~is_valid]

    # Stay silent on success; only problems are reported.
    if offenders.empty:
        return

    print('+++ Not all lemmas are annotated with 0 or 1. Fix following lemmas. +++\n')
    print(offenders[['lemma']], '\n')


def true_base_contents(df):
    """
    Verifies that every value in the true_base column is either 1 or blank (NaN).

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows whose true_base value is invalid,
        and prints nothing at all when the check passes.
    """
    base_col = df['true_base']
    # A cell is wrong when it is filled in with anything other than 1.
    offenders = df.loc[base_col.notna() & base_col.ne(1)]

    if offenders.empty:
        return

    print('+++ Some bases are annotated with something other than 1 or a blank in true_bases. Fix following lemmas. +++\n')
    shown_cols = ['lemma', 'unique_candidates', 'true_base']
    print(offenders[shown_cols], '\n')


def merge_subset_true_lemmas(df):
    """
    Verifies that the merge column is only filled in on rows where true_lemma == 1.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows that have a merge value but are not true lemmas,
        and prints nothing at all when the check passes.
    """
    has_merge = df['merge'].notna()
    not_true_lemma = df['true_lemma'].ne(1)
    offenders = df.loc[has_merge & not_true_lemma]

    if offenders.empty:
        return

    print('+++ Some invalid lemmas have values to merge. Re-annotate lemma or remove merge. +++\n')
    shown_cols = ['lemma', 'true_lemma', 'merge']
    print(offenders[shown_cols], '\n')
    
    
def true_base_subset_true_lemma(df):
    """
    Verifies that true_base == 1 occurs only on rows where true_lemma == 1.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows marked as true base whose lemma is not a true lemma,
        and prints nothing at all when the check passes.
    """
    marked_as_base = df['true_base'].eq(1)
    not_true_lemma = df['true_lemma'].ne(1)
    offenders = df.loc[marked_as_base & not_true_lemma]

    if offenders.empty:
        return

    print('+++ 1+ true bases correspond to an invalid lemma. Re-annotate lemma or base. +++\n')
    shown_cols = ['lemma', 'unique_candidates']
    print(offenders[shown_cols], '\n')


def true_base_intersect_merge(df):
    """
    Verifies that no row has a value in both the merge column and the true_base column.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows that are filled in for both columns,
        and prints nothing at all when the check passes.
    """
    has_merge = df['merge'].notna()
    has_true_base = df['true_base'].notna()
    offenders = df.loc[has_merge & has_true_base]

    if offenders.empty:
        return

    print('+++ Overlap between true base annotation and lemmas to merge. Remove true_base. +++\n')
    shown_cols = ['lemma', 'unique_candidates', 'merge']
    print(offenders[shown_cols], '\n')


def true_base_intersect_query(df):
    """
    Verifies that no row has a value in both the query_by_hand column and the true_base column.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows that are filled in for both columns,
        and prints nothing at all when the check passes.
    """
    has_query = df['query_by_hand'].notna()
    has_true_base = df['true_base'].notna()
    offenders = df.loc[has_query & has_true_base]

    if offenders.empty:
        return

    print('+++ Overlap between true_base annotation and query_by_hand annotation. Remove true_base. +++\n')
    shown_cols = ['lemma', 'query_by_hand']
    print(offenders[shown_cols], '\n')


def merge_wd_in_lemma(df):
    """
    Verifies that every value in the merge column also occurs in the lemma column
    (or in the manual_lemma column, when the file has one).

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints a sorted list of the merge values that are not known lemmas,
        and prints nothing at all when the check passes.
    """
    # manual_lemma is optional, so only consult it when the column exists.
    lemma_cols = ['lemma']
    if 'manual_lemma' in df.columns:
        lemma_cols.append('manual_lemma')

    known_lemmas = set()
    for col in lemma_cols:
        known_lemmas.update(df[col].dropna().unique())

    merge_targets = set(df['merge'].dropna().unique())
    unknown_targets = merge_targets - known_lemmas

    if not unknown_targets:
        return

    print('+++ Some forms to merge are not in lemma col. Rm from merge, and add 1 in true_base or add stem to query_by_hand. +++\n')
    print(sorted(unknown_targets), '\n')


def merge_unique_for_lemma(df):
    """
    Checks that there is not more than one merge value for each lemma.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the lemmas that have more than one value in the merge column,
        and prints nothing at all when the check passes.
    """
    # Count the non-blank merge cells per lemma. Only the merge column is
    # counted, since no other column is needed for this check.
    merge_counts = df.groupby('lemma')['merge'].count().reset_index()
    badcontent = merge_counts[merge_counts['merge'] > 1]

    if len(badcontent) > 0:
        # The check is "> 1", so the report says "more than one"
        # (one merge value per lemma is valid).
        print('+++ The following lemmas have more than one value in the merge column. +++\n')
        print(badcontent['lemma'], '\n')


def true_base_unique_for_lemma(df):
    """
    Verifies that each lemma has at most one filled-in true_base cell.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the lemmas with more than one true_base annotation
        together with the count, and prints nothing at all when the check passes.
    """
    # count() tallies the non-blank cells of every column within each lemma group.
    per_lemma = df.groupby('lemma').count()
    per_lemma = per_lemma.reset_index()
    offenders = per_lemma.loc[per_lemma['true_base'] > 1]

    if offenders.empty:
        return

    print('+++ The following lemmas have too many words annotated as true bases. +++\n')
    shown_cols = ['lemma', 'true_base']
    print(offenders[shown_cols], '\n')


def exists_annot_for_lemma(df):
    """
    Checks that each lemma that represents a derivation (true_lemma == 1) has exactly
    one annotated row (i.e., one row with a value in true_base, query_by_hand, or merge),
    and that the lemmas that don't represent derivations (true_lemma == 0) have none.

    Arg:
        df: pandas df, read in from file X_base_annot.csv; it is left unmodified.
    Returns:
        Nothing; prints, per failing check, the offending lemmas with their number of
        annotated rows, and prints nothing at all when both checks pass.
    """

    # Flag every row that carries an annotation in any of the three columns.
    # The flag is built from plain boolean Series rather than through the
    # pd.np alias, which current pandas releases no longer provide, and it is
    # attached with assign() so that the caller's frame does not gain a column.
    has_annot = df['true_base'].notna() | df['query_by_hand'].notna() | df['merge'].notna()
    annotated = df.assign(annot=has_annot.astype(int))

    # Subset the data based on whether the lemma is a true lemma or not.
    dfpos = annotated[annotated['true_lemma'] == 1]
    dfneg = annotated[annotated['true_lemma'] == 0]

    # Only the annot flag is summed per lemma. This avoids summing text
    # columns and does not depend on which other columns the file contains.
    # For the positive subset, anything other than exactly one annotated row is wrong.
    dfpossum = dfpos.groupby('lemma')[['annot']].sum()
    dfposwrong = dfpossum[dfpossum['annot'] != 1]

    if len(dfposwrong) > 0:
        print('+++ Some true lemmas have too many/too few annotations in cols true_base, query_by_hand, or merge. Annotate at most/at least one col. +++\n')
        print(dfposwrong)
        print()

    # For the negative subset, any annotated row at all is wrong.
    dfnegsum = dfneg.groupby('lemma')[['annot']].sum()
    dfnegwrong = dfnegsum[dfnegsum['annot'] != 0]

    if len(dfnegwrong) > 0:
        print('+++ Some invalid lemmas have annotation(s) in columns true_base, query_by_hand, or merge. Remove. +++\n')
        print(dfnegwrong)
        print()
    

def query_lemmas_have_pos(df):
    """
    Verifies that every row with a value in query_by_hand also has a value in query_pos.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; prints the rows that have a base to query but no POS tag,
        and prints nothing at all when the check passes.
    """
    has_query = df['query_by_hand'].notna()
    lacks_pos = df['query_pos'].isna()
    offenders = df.loc[has_query & lacks_pos]

    if offenders.empty:
        return

    print('+++ No POS tag associated with bases to query manually. +++\n')
    shown_cols = ['lemma', 'query_by_hand', 'query_pos']
    print(offenders[shown_cols], '\n')

    
def test_battery(df):
    """
    Applies every consistency check defined in the helper functions to one annotation file.

    Arg:
        df: pandas df, read in from file X_base_annot.csv
    Returns:
        Nothing; whatever the individual checks print is the output.
    """
    # The checks run in this fixed order, each on the same frame.
    checks = (
        true_lemma_contents,
        true_base_contents,
        merge_subset_true_lemmas,
        true_base_subset_true_lemma,
        true_base_intersect_merge,
        true_base_intersect_query,
        query_lemmas_have_pos,
        merge_wd_in_lemma,
        merge_unique_for_lemma,
        true_base_unique_for_lemma,
        exists_annot_for_lemma,
    )

    for check in checks:
        check(df)

Go through all of the annotated files in `backform_samples_nano_annot/` and run them through the test battery, printing results below.

If only the heading for a suffix is printed, with nothing below it, that dataset passes all of the tests.

In [3]:
# Walk through the annotation files together with their suffixes, print a
# heading per suffix, and run the full set of checks on each file.
for curr_file, curr_sfx in zip(ANNOT_FILES, SFXS):
    heading = '=========================== ' + curr_sfx + ' ===========================\n'

    # The file is read before the heading is printed, so a file that cannot
    # be read produces no heading.
    curr_df = pd.read_csv('backform_samples_nano_annot/' + curr_file)

    print(heading)
    test_battery(curr_df)




































