# Preprocessing and editing the conll format

This notebook provides code for preprocessing and feature extraction. Applying this code writes a copy of each CoNLL file (named `*.preprocessed.conll`) that includes a column for each additional feature. This only needs to be run once, at the beginning of the experiment. This has already been run, but you can run it again.

In [1]:
import pandas as pd
import re
import csv

In [9]:
# Feature extraction Functions

def all_caps(dataframe):
    '''
    adds the Allcaps column, holding the string 'TRUE' or 'FALSE' per row
    Allcaps is 'TRUE' when the token matches ^[A-Z][A-Z]+$, i.e. it is made
    up of two or more ASCII capital letters; a single capital letter such
    as "A" is therefore 'FALSE'

    :param dataframe: a dataframe containing the conll2003 data; its
                      'Token' column must hold strings
    :type dataframe: pandas.core.frame.DataFrame

    returns the same dataframe object with the new column added
    '''
    # Compile once so the pattern is not looked up again for every token.
    uppercase_only = re.compile(r"^[A-Z][A-Z]+$")

    dataframe['Allcaps'] = [
        'TRUE' if uppercase_only.match(tok) else 'FALSE'
        for tok in dataframe['Token']
    ]

    return dataframe

def capitalization_after_lowercase(dataframe):
    '''
    adds the Cap_after_lower column, holding the string 'TRUE' or 'FALSE'
    per row
    Cap_after_lower is 'TRUE' when the token starts with a capital letter
    followed by at least one lowercase letter AND the token in the previous
    row starts with a lowercase letter; the first row has no previous token
    and is always 'FALSE'

    NOTE: both patterns are anchored at the start only, so they test how a
    token begins, not the whole token (e.g. a previous token "iPhone"
    counts as lowercase).

    :param dataframe: a dataframe containing the conll2003 data; its
                      'Token' column must hold strings
    :type dataframe: pandas.core.frame.DataFrame

    returns the same dataframe object with the new column added
    '''
    capitalized = re.compile(r"^[A-Z][a-z]+")
    lowercase_start = re.compile(r"^[a-z]+")

    tokens = list(dataframe['Token'])
    # Shift the tokens one row down so each token is paired with the one
    # before it; the empty string stands in for "no previous token".
    predecessors = [""] + tokens[:-1]

    dataframe['Cap_after_lower'] = [
        'TRUE' if capitalized.match(tok) and lowercase_start.match(prev) else 'FALSE'
        for tok, prev in zip(tokens, predecessors)
    ]

    return dataframe

def origin_adjective_suffix(dataframe):
    '''
    adds a column containing TRUE or FALSE values for the Demonym feature
    Demonym checks whether the token ends with one of the origin-adjective
    suffixes 'ian', 'ese', 'ish', 'ean' or 'i'

    The check is a plain, case-sensitive suffix test, so "Italian" and
    "Iraqi" are 'TRUE', but so is "fish", while "ITALIAN" is 'FALSE'.

    :param dataframe: a dataframe containing the conll2003 data; its
                      'Token' column must hold strings
    :type dataframe: pandas.core.frame.DataFrame

    returns the same dataframe object with the 'Demonym' column added
    '''
    # str.endswith accepts a tuple and returns True if any suffix matches.
    origin_adjective_suffixes = ('ian', 'ese', 'ish', 'ean', 'i')

    dataframe['Demonym'] = [
        'TRUE' if token.endswith(origin_adjective_suffixes) else 'FALSE'
        for token in dataframe['Token']
    ]

    return dataframe

def followed_by_company_suffix_check(dataframe):
    '''
    adds the Comp_suf column, holding the string 'TRUE' or 'FALSE' per row
    Comp_suf is 'TRUE' when the token in the NEXT row is exactly one of the
    company markers 'Ltd', 'Ltd.', 'Inc.', 'Corp.', 'LLC' or 'Co'; the last
    row has no following token and is always 'FALSE'

    :param dataframe: a dataframe containing the conll2003 data
    :type dataframe: pandas.core.frame.DataFrame

    returns the same dataframe object with the new column added
    '''
    markers = ('Ltd', 'Ltd.', 'Inc.', 'Corp.', 'LLC', 'Co')

    tokens = dataframe['Token'].to_list()
    # Shift the tokens one row up so each token is paired with its
    # successor; the empty string stands in for "no following token".
    successors = tokens[1:] + [""]

    dataframe['Comp_suf'] = [
        'TRUE' if following in markers else 'FALSE'
        for _, following in zip(tokens, successors)
    ]

    return dataframe

def followed_by_possessive_marker(dataframe):
    '''
    adds the Poss_mark column, holding the string 'TRUE' or 'FALSE' per row
    Poss_mark is 'TRUE' when the token in the NEXT row is exactly 's,
    indicating possession; the last row has no following token and is
    always 'FALSE'

    :param dataframe: a dataframe containing the conll2003 data
    :type dataframe: pandas.core.frame.DataFrame

    returns the same dataframe object with the new column added
    '''
    tokens = dataframe['Token'].to_list()
    # Pair every token with its successor; the empty string stands in for
    # "no following token" on the final row.
    successors = tokens[1:] + [""]

    flags = []
    for _, following in zip(tokens, successors):
        flags.append('TRUE' if following == "'s" else 'FALSE')

    dataframe['Poss_mark'] = flags

    return dataframe

In [11]:
# Functions that update the dataframe with the features and write it back to the file

def update_dataframe_with_features(dataframe):
    '''
    runs every feature extractor over the dataframe, each one adding its
    own column (Allcaps, Cap_after_lower, Demonym, Comp_suf, Poss_mark)

    :param dataframe: a dataframe containing the conll2003 data
    :type dataframe: pandas.core.frame.DataFrame

    returns an updated dataframe
    '''
    # The order here determines the order of the feature columns.
    extractors = (
        all_caps,
        capitalization_after_lowercase,
        origin_adjective_suffix,
        followed_by_company_suffix_check,
        followed_by_possessive_marker,
    )

    for extract in extractors:
        dataframe = extract(dataframe)

    return dataframe

def update_conll_format(inputfile):
    '''
    reads a conll file, adds the extracted feature columns and writes the
    result to a new file whose name is inputfile with '.conll' replaced by
    '.preprocessed.conll'

    The input is read as tab-separated text with the four columns Token,
    POS, Chunk and Gold. Rows with an empty or missing field are dropped.
    The output is tab-separated and written with pandas' default header
    row and index column.

    :param inputfile: path to a file containing the conll2003 data
    :type inputfile: string

    returns nothing; the result is written to disk
    '''
    # converting csv file to data frame
    df = pd.read_csv(
        inputfile,
        sep='\t',
        names=['Token', 'POS', 'Chunk', 'Gold'],
        # Keep every field as text so the regex-based features never
        # receive a number.
        dtype=str,
        # CoNLL data contains literal " tokens; with the default quoting
        # they would be treated as CSV quote characters and mangle rows.
        quoting=csv.QUOTE_NONE,
        # Only a truly empty field counts as missing. With the defaults,
        # real tokens such as "NA", "null" or "None" are parsed as NaN and
        # then silently removed by dropna() below.
        keep_default_na=False,
        na_values=[''],
    )
    df = df.dropna()  # drop rows that contain an empty/missing field

    # adding features to the dataframe
    updated_df = update_dataframe_with_features(df)

    # writing the data, including the new feature columns, to a separate
    # '.preprocessed.conll' file
    updated_df.to_csv(inputfile.replace('.conll', '.preprocessed.conll'), sep='\t')

## Preprocessing the CONLL files

Below I apply the preprocessing functions to the three CoNLL files (train, dev and test). This has already been done, and the rest of the notebooks use the resulting preprocessed files, but you can run this again; it will simply overwrite the existing `.preprocessed.conll` files.

In [14]:
# Preprocess the train, dev and test splits, in that order.
for conll_path in (
    '../data/conll2003.train.conll',
    '../data/conll2003.dev.conll',
    '../data/conll2003.test.conll',
):
    update_conll_format(conll_path)