This notebook creates a new data file that contains only the data of plays with certain properties. It takes a CSV file with the full corpus and a metadata file as input. The user specifies which metadata properties the selected plays should have, and a new CSV file containing only the corpus rows of the plays that match those properties is produced.

In [1]:
import pandas as pd

In [2]:
def create_selection_from_meta(metafile, selection_criteria, id_column='ti_id',
                               encoding='latin1', delimiter=';'):
    """Return the ids of all metadata rows that satisfy every criterion.

    Args:
        metafile: Path to the metadata CSV file.
        selection_criteria: Mapping of metadata column name to the exact
            value a row must have in that column. All criteria must hold
            for a row to be selected; an empty mapping selects every row.
        id_column: Name of the metadata column holding the ids to return.
        encoding: Text encoding of the metadata file.
        delimiter: Field separator used in the metadata file.

    Returns:
        A pandas Series with the ``id_column`` values of the matching rows
        (the original row index of the metadata file is kept).

    Raises:
        KeyError: If a criterion column or ``id_column`` is not present in
            the metadata file.
    """
    # read in file with metadata
    meta_df = pd.read_csv(metafile, encoding=encoding, delimiter=delimiter)

    # Report every missing column at once instead of failing on the first
    # one halfway through the filtering.
    missing = [
        column
        for column in [*selection_criteria, id_column]
        if column not in meta_df.columns
    ]
    if missing:
        raise KeyError(
            f"column(s) {missing} not found in metadata file; "
            f"available columns: {list(meta_df.columns)}"
        )

    # Apply the filters one after another, so a row must match all of them.
    filtered_meta = meta_df
    for column, value in selection_criteria.items():
        filtered_meta = filtered_meta[filtered_meta[column] == value]

    # The ids of the rows that survived every filter.
    return filtered_meta[id_column]

In [3]:
def _id_as_text(value):
    """Return *value* in the text form used for ids inside corpus filenames."""
    # A numeric id column that contains gaps is read by pandas as float
    # (e.g. 101.0); render whole floats without the trailing ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value)


def obtain_selected_corpus_data(corpus_file, selected_names):
    """Return the corpus rows whose file belongs to one of the selected ids.

    The first column of the corpus file holds the filenames. The id of a
    file is the part of its name before the first underscore.

    Args:
        corpus_file: Path to the CSV file with the full corpus data.
        selected_names: Iterable of ids to keep (for example the Series
            returned from the metadata selection). Ids are compared as
            text, so numeric ids match their textual form in the
            filenames. Missing values are ignored. A single string is
            treated as one id.

    Returns:
        A pandas DataFrame with only the rows of the selected files, with
        the same columns as the corpus file.
    """
    data_df = pd.read_csv(corpus_file)

    # A bare string would otherwise be iterated character by character.
    if isinstance(selected_names, str):
        selected_names = [selected_names]

    # Ids extracted from filenames are always strings, while the metadata
    # ids may have been parsed as numbers; normalise them to text so that
    # the membership test below compares like with like.
    selected_ids = {
        _id_as_text(name) for name in selected_names if not pd.isna(name)
    }

    # filename ids are the first column
    name_col_index = 0

    # revert filenames to their id in the metadata
    file_names_as_ids = data_df.iloc[:, name_col_index].map(
        lambda name: str(name).split('_')[0]
    )

    # keep only the rows whose id is part of the selected set
    return data_df[file_names_as_ids.isin(selected_ids)]

In [4]:
# Driver cell: select plays by metadata, then write the matching corpus rows
# to a new CSV file.

# Path to the metadata file.
metafile = '../older_metadata_lapa.csv'

# Selection criteria: metadata column name -> value a play must have in it.
selection_criteria = {
    'genre': 'drama'
}

# Ids of the plays whose metadata matches all selection criteria.
selected_plays = create_selection_from_meta(metafile, selection_criteria)


# Path to the full corpus data.
corpus_file = '../original_outdir/original_study_out_corpus_diffstddev.csv'

# Corpus rows that belong to the selected plays.
selected_data = obtain_selected_corpus_data(corpus_file, selected_plays)


# Name of the output file
# (e.g. ../groupselection/rodenbrug_plays.csv or ../groupselection/drama_plays.csv).
outputfile = '../test_selection_drama.csv'

# Write the selection without the DataFrame index column.
selected_data.to_csv(outputfile, index=False)


# Additional examples of selection criteria. The general format is
# selection_criteria = {
# 'criterion1' : 'value',
# 'criterion2' : 'value',
# 'criterion3' : 'value'
# }
#
# NOTE: the assignments below are executed, not just shown. They run after the
# output file has been written, so they do not affect it, but they do
# overwrite `selection_criteria`; after this cell it holds the last example.
# To use an example, move it above the create_selection_from_meta call.

# Example: select on a single property.
selection_criteria = {
    'genre': 'drama'
}


# Example: select on two properties that must both match.
selection_criteria = {
    'achternaam' : 'Rodenburg',
    'voornaam' : 'Theodoor'
}

