Creates a new CSV file, `bases_manual_query2.csv`, with the suffix, the lemma, the base and POS to query, and the CQL query for each.
This file will be used on the SeaCOW server to count all the remaining bases, and a later script will merge these counts with the original dataframes to produce the final samples and derivation-base pairs.

In [5]:
import pandas as pd
import os

# Directory holding the hand-annotated sample files, one per suffix.
ANNOT_DIR = '2_backform_samples_nano_annot'

# Annotation files are recognised by their 'base_annot.csv' ending; the suffix
# is the part of the file name before the first underscore.
# Sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the row order of the output file non-reproducible.
ANNOT_FILES = sorted(fn for fn in os.listdir(ANNOT_DIR) if fn.endswith('base_annot.csv'))
SFXS = [fn.split('_')[0] for fn in ANNOT_FILES]

# Define list that will iteratively collect all the to-query subsets from the individual files.
all_to_query = []

for curr_file, curr_sfx in zip(ANNOT_FILES, SFXS):
    # Grab only those rows from data for current suffix with some value for query_by_hand.
    curr_df = pd.read_csv(os.path.join(ANNOT_DIR, curr_file))
    curr_to_query = curr_df[curr_df.query_by_hand.notna()].reset_index(drop=True)

    # Every base to be queried needs a POS tag. Fail loudly here rather than
    # emitting a query with a missing tag or pairing a base with the POS of a
    # different row.
    missing_pos = curr_to_query.query_pos.isna()
    if missing_pos.any():
        raise ValueError(
            '%s: query_pos is missing for query_by_hand value(s): %s'
            % (curr_file, ', '.join(map(str, curr_to_query.query_by_hand[missing_pos])))
        )

    # Add some new columns: the CQL query and the suffix.
    # The query is built from the *filtered* frame so that base and POS always
    # come from the same row. (Dropping NaNs from each column of the unfiltered
    # frame separately can misalign them whenever the two columns are not
    # filled on exactly the same rows.)
    curr_to_query['sfx'] = curr_sfx
    curr_to_query['cql'] = [
        '[lemma="%s" & tag="%s"] within <s/>' % (base, pos)
        for base, pos in zip(curr_to_query.query_by_hand, curr_to_query.query_pos)
    ]
    # Column subsetting is currently disabled, so all original columns are kept.
#     curr_to_query = curr_to_query[['sfx', 'lemma', 'query_by_hand', 'query_pos', 'cql']]

    all_to_query.append(curr_to_query)

# Concat list of dfs into one single df, save as csv.
pd.concat(all_to_query).to_csv('bases_manual_query2.csv', index=False)