# NRGI Extractives Contracts
### Munge Annotation Data for Training
### Takes directories of downloaded .xls annotation files and returns:
1. A combined, pickled dataframe ready to featurize
2. A pickled list of labels for machine learning

In [None]:
import os
import pickle

import numpy as np
import pandas as pd

In [None]:
# Target categories to learn; every other annotation category is
# collapsed to 'other' further down.
labels = [
    'stabilization',
    'royalties',
]

# Where the combined annotation dataframe is pickled
output_file = 'raw_data/annotation_data.pkl'
# Where the list of training labels is pickled
labels_output_file = 'clean_data/training_labels.pkl'

In [None]:
# Folders of downloaded xls annotation files, one folder per source site
# (resourcecontracts and openlandcontracts)
resource_folder = 'contract_data/Contracts_Annotations/resource_contracts/'
land_folder = 'contract_data/Contracts_Annotations/openland_contracts/'
folders = [resource_folder, land_folder]

# Most recently downloaded contract metadata exports:
#   rc_metadata from resourcecontracts.org/contracts
#   ol_metadata from openlandcontracts.org/contracts
rc_metadata = 'contract_data/resource_contract_2017-08-16.csv'
ol_metadata = 'contract_data/openland_contract_2017-08-16.csv'
metadata_files = [rc_metadata, ol_metadata]

In [None]:
# Read every downloaded .xls annotation file and stack the non-empty ones into
# a single dataframe. Each row is tagged with the contract it came from
# (OCID = file name without extension) and the site it was downloaded from
# (Source = name of the folder holding the file).
frames = []
for folder in folders:
    # normpath strips any trailing slash so basename is the folder's own name
    source = os.path.basename(os.path.normpath(folder))
    # os.listdir order is arbitrary; sorting keeps the row order reproducible
    xls_files = sorted(f for f in os.listdir(folder) if f.lower().endswith('.xls'))
    for xls in xls_files:
        temp = pd.read_excel(os.path.join(folder, xls))
        if len(temp) > 0:
            temp['OCID'] = os.path.splitext(xls)[0]
            temp['Source'] = source
            frames.append(temp)

# One concat at the end instead of DataFrame.append per file: append re-copies
# the accumulated frame on every call and no longer exists in pandas 2.x.
# concat rejects an empty list, so fall back to an empty frame.
annotations = pd.concat(frames) if frames else pd.DataFrame()
print("Number of annotations: " + str(len(annotations)))

In [None]:
# Drop blank annotations, annotations that are not text, and annotations of
# fewer than 4 words.
# (str, unicode) on Python 2, just str on Python 3
text_types = (str, type(u''))
annotations = annotations.dropna(subset=['Annotation Text'])
# Checking for text (rather than only excluding int) keeps floats, booleans
# and other non-string cells from reaching .split() and raising.
long_enough = annotations['Annotation Text'].apply(
    lambda text: isinstance(text, text_types) and len(text.split()) > 3
).astype(bool)  # astype keeps the mask boolean even when no rows are left
annotations = annotations[long_enough].copy()
print(len(annotations))
# If duplicate text appears within the same contract, drop it
annotations = annotations.drop_duplicates(['Annotation Text', 'OCID', 'Category'])
print(len(annotations))

In [None]:
# Lowercase the target labels, then label each annotation with its lowercased
# Category when that is one of the targets and with 'other' otherwise.
labels = [label.lower() for label in labels]
# A Category that is not text (e.g. a missing cell, which pandas represents as
# a NaN float) has no .lower(); it falls through to 'other' instead of raising.
annotations['label'] = [
    x.lower() if hasattr(x, 'lower') and x.lower() in labels else 'other'
    for x in annotations['Category']
]

In [None]:
# The same text can appear within a contract under different Category labels.
# Rank the target labels ahead of 'other', order the rows by that rank, and
# keep the first row of each (text, contract) pair so that a row carrying a
# target label is the one that survives.
sort_num = range(len(labels) + 1)
sort_key = {label: rank for label, rank in zip(labels, sort_num)}
sort_key['other'] = len(labels)  # the last rank in sort_num
annotations['sort_key'] = annotations['label'].apply(lambda name: sort_key[name]).tolist()
annotations = annotations.sort_values(by='sort_key', ascending=True)
annotations = annotations.drop_duplicates(['Annotation Text', 'OCID'], keep='first')
print(len(annotations))

In [None]:
# Join metadata from contracts repository
metadata_columns = ['OCID','Language','Country Name','Resource','Contract Type','Document Type']
# Read all metadata exports and stack them in one concat call
# (DataFrame.append in a loop re-copies the frame and is gone in pandas 2.x)
metadata = pd.concat([pd.read_csv(filename) for filename in metadata_files])

# Keep a single metadata row per contract: a repeated OCID would make the
# left join below emit the same annotation more than once.
contract_info = metadata[metadata_columns].drop_duplicates(subset='OCID')
annotations = annotations.merge(contract_info, how='left', on='OCID')

In [None]:
# Training labels as a plain list, plus a printout of the share of
# annotations that falls under each distinct label.
y = annotations['label'].tolist()
yunique = list(np.unique(y))
total = float(len(y))
for item in yunique:
    share = 100 * y.count(item) / total
    print("{0:.2f}%".format(share) + " " + item)

In [None]:
# Show the first two rows of the merged annotation frame
annotations.head(n=2)

In [None]:
# Pickle the combined annotation dataframe
annotations.to_pickle(output_file)
# Pickle the label list. pickle.dump takes the object first and an open
# binary file second; the path string has to be opened, not passed directly.
with open(labels_output_file, 'wb') as labels_fh:
    pickle.dump(y, labels_fh)