# Create the Samples for Silver Standard Training

In [None]:
# Directory that holds the generated silver-standard CSV files.
silver_standard_location = '../data/generated_silver_data/'
# Raw silver files only: names starting with 'silver_' that do not contain
# "cleaned" (the cleaned copies are written into the same directory).
silver_standard_files = [
    file_name
    for file_name in listdir(silver_standard_location)
    if file_name.startswith('silver_') and "cleaned" not in file_name
]

In [None]:
def cleanup_file(file_name):
    """Write a copy of a CSV file with line breaks inside records removed.

    The result is written to ``<prefix>_cleaned.csv`` where ``<prefix>`` is
    the part of ``file_name`` before the first ``.csv``. A newline is kept
    only when it is followed by a digit or is the last character of the
    file; every other newline is replaced by a single space (presumably
    because each real record starts with a numeric id -- confirm against
    the data).

    Args:
        file_name: Path of the raw CSV file to clean.
    """
    with open(file_name, 'r') as in_file, open(file_name.split(".csv")[0]+'_cleaned.csv', 'w') as out_file:
        text = in_file.read()
        # A line break followed by a tab collapses into one space.
        out_text = re.sub(r'\n\t', ' ', text)
        # Replace only the newline itself. The lookahead keeps the following
        # character, which a consuming pattern would silently delete
        # ("foo\nbar" must become "foo bar", not "foo ar").
        out_text = re.sub(r'\n(?=\D)', ' ', out_text)
        out_file.write(out_text)
# Produce a "_cleaned" copy next to every raw silver-standard file.
for raw_file in silver_standard_files:
    raw_path = join(silver_standard_location, raw_file)
    cleanup_file(raw_path)

In [None]:
def _parse_silver_line(line):
    """Split one record of a cleaned silver CSV into its nine named fields."""
    span_id, line = re.split(r',', line, maxsplit=1)
    span, line = re.split(r',', line, maxsplit=1)
    beg_off, line = re.split(r',', line, maxsplit=1)
    end_off, line = re.split(r',', line, maxsplit=1)
    sent_id, line = re.split(r',', line, maxsplit=1)
    # The sentence and the document name are quoted and may contain commas,
    # so they are delimited by the closing quote followed by a comma.
    sent, line = re.split(r'",', line, maxsplit=1)
    doc_id, line = re.split(r',', line, maxsplit=1)
    doc, line = re.split(r'",', line, maxsplit=1)
    marg = re.split(r',', line, maxsplit=1)[0].rstrip('\n')
    return {
        'span_id': span_id,
        'span': span.strip('"'),
        'beg_off': beg_off,
        'end_off': end_off,
        'sent_id': sent_id,
        'sent': sent.strip('"'),
        'doc_id': doc_id,
        'doc': doc.strip('"'),
        'marg': marg,
    }


def create_dataframe(file_list, silver_standard_location):
    """Load the cleaned silver-standard CSV files into a single DataFrame.

    For every name in ``file_list`` the matching ``<prefix>_cleaned.csv``
    (prefix = text before the first ``.csv``) is read from
    ``silver_standard_location``. The first line of each file is skipped as
    a header and blank lines are ignored.

    Args:
        file_list: Names of the raw silver files.
        silver_standard_location: Directory containing the cleaned files.

    Returns:
        A DataFrame with the columns span_id, span, beg_off, end_off,
        sent_id, sent, doc_id, doc and marg. All values are kept as the
        strings read from the file.

    Raises:
        ValueError: If a non-blank line does not contain all expected fields.
    """
    columns = ['span_id', 'span', 'beg_off', 'end_off', 'sent_id', 'sent', 'doc_id', 'doc', 'marg']
    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append copied the whole frame per row and no longer exists
    # in pandas >= 2.0.
    rows = []
    for f in file_list:
        print("Working on file {}".format(f))
        cleaned_path = join(silver_standard_location, f.split('.csv')[0] + "_cleaned.csv")
        with open(cleaned_path, 'r') as in_file:
            next(in_file, None)  # skip the header line
            for line in in_file:
                if not line.strip():
                    continue
                rows.append(_parse_silver_line(line))
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Parse all cleaned silver files into one DataFrame of extracted spans.
df = create_dataframe(
    file_list=silver_standard_files,
    silver_standard_location=silver_standard_location,
)

There is still the possibility that overlapping samples have been extracted. If that is the case, we systematically remove them and keep only the longest samples that were extracted.

In [None]:
def mark_overlapping(df):
    """Flag spans that are fully contained in another span of the same sentence.

    Rows are grouped by ``doc`` and ``sent_id``. A row is flagged when another
    row of its group starts at or before it and ends at or after it. When
    several rows cover exactly the same extent, the first one stays unflagged
    and the later ones are flagged, so that one copy survives the filtering.

    Args:
        df: DataFrame with the columns doc, sent_id, beg_off and end_off.
            Offsets may be numbers or numeric strings.

    Returns:
        The same DataFrame object, with a boolean ``overlapped`` column added
        (the frame is modified in place).

    Raises:
        ValueError: If an offset cannot be interpreted as a number.
    """
    # Offsets parsed from the CSV are strings; compare them numerically so
    # that e.g. '9' <= '10' holds.
    begins = pd.to_numeric(df['beg_off']).tolist()
    ends = pd.to_numeric(df['end_off']).tolist()
    overlapped = [False] * len(df)

    # .indices maps each (doc, sent_id) key to the row positions of its group.
    groups = df.groupby(['doc', 'sent_id'], sort=False).indices
    for positions in groups.values():
        if len(positions) < 2:
            continue
        for test in positions:
            for other in positions:
                if test == other:
                    continue
                if not (begins[other] <= begins[test] and ends[other] >= ends[test]):
                    continue
                same_extent = begins[other] == begins[test] and ends[other] == ends[test]
                # Identical extents would otherwise flag each other and both
                # be removed; only the later duplicate is flagged.
                if not same_extent or other < test:
                    overlapped[test] = True
                    break

    df['overlapped'] = overlapped
    return df

In [None]:
# Flag contained spans, then keep only the rows that were not flagged.
df = mark_overlapping(df)
not_overlapped = df['overlapped'].eq(False)
df = df.loc[not_overlapped]

First, we need to remove the files that were used in the Snorkel training from the silver standard, so that the later evaluation is not biased. Because the sentences are randomly split, we remove all files from the Snorkel train and devel sets as well as from the test set.

In [None]:
# Document names of the original data set, derived from the XML file names:
# the text before '.xml' with every '/' replaced by '_'.
original_training_files = [re.sub(r'/', '_', x.split('.xml')[0]) for x in listdir('../data/XML')]
# Drop all of them with a single membership mask instead of re-filtering the
# whole frame once per file.
df = df.loc[~df.doc.isin(original_training_files)]
df.to_csv("silver_standard.csv")
# To load: df = pd.read_csv("pandas_silver_standard_production.csv", low_memory=False)
# NOTE(review): the file written above is "silver_standard.csv", while the
# load hint names a different file -- confirm which name is current.

## Query negative samples

At this point it is easiest to perform the operation in the database because we need additional information.
Basically we need to:
1. traverse each database in the silver data split
2. traverse each document in the database
3. traverse each sentence in the document
4. write the sentence to the negative samples if the doc-sentence combination is not yet in our dataframe

In [None]:
# Reload the stored silver standard.
df = pd.read_csv('pandas_silver_standard.csv', low_memory=False)
doc_names = set(df['doc'])
# Document names that do not start with '10.1371' are treated as failures.
doc_fails = [x for x in doc_names if not x.startswith('10.1371')]
# Remove all of their rows with one membership mask instead of one
# full-frame drop per failed document.
df = df.loc[~df.doc.isin(doc_fails)]

In [None]:
# One name per 'silver*' entry in generated_silver_data/: the part before
# '_training' with 'data' replaced by 'standard'. These names are passed on
# as database names when the negative samples are generated.
silver_standard_files = [
    re.sub(r'data', 'standard', file_name.split('_training')[0])
    for file_name in listdir('generated_silver_data/')
    if file_name.startswith('silver')
]

In [None]:
def get_docs(database_name):
    """Fetch the documents of a database, skipping entries named 'sent*'.

    Args:
        database_name: Name of the PostgreSQL database to query.

    Returns:
        A list of ``[row[0], row[1]]`` pairs, one per row of the ``document``
        table whose second column does not start with ``'sent'`` (presumably
        the document id and name -- confirm against the schema), or ``None``
        when connecting or querying failed; the error is printed in that case.
    """
    # NOTE(review): the connection credentials are hard-coded; consider
    # reading them from configuration.
    # Both handles are pre-set so the cleanup in ``finally`` cannot hit an
    # unbound name when connect() or cursor() itself raises.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(user="ds626",
                                      password="snorkel",
                                      host="127.0.0.1",
                                      port="5432",
                                      database=database_name)

        cursor = connection.cursor()
        query = "select * from document"

        cursor.execute(query)
        documents = cursor.fetchall()
        return [[row[0], row[1]] for row in documents if not row[1].startswith('sent')]

    except (Exception, psycopg2.Error) as error:
        print("Error fetching data from PostgreSQL table", error)
        return None

    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            
def get_sentences(database_name, document):
    """Fetch all sentences that belong to one document.

    Args:
        database_name: Name of the PostgreSQL database to query.
        document: Value matched against ``sentence.document_id``.

    Returns:
        A list of ``[id, document_id, words, text]`` lists, one per matching
        row of the ``sentence`` table, or ``None`` when connecting or
        querying failed; the error is printed in that case.
    """
    # NOTE(review): the connection credentials are hard-coded; consider
    # reading them from configuration.
    # Both handles are pre-set so the cleanup in ``finally`` cannot hit an
    # unbound name when connect() or cursor() itself raises.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(user="ds626",
                                      password="snorkel",
                                      host="127.0.0.1",
                                      port="5432",
                                      database=database_name)

        cursor = connection.cursor()
        query = "select id,document_id,words,text from sentence where document_id = %s"

        cursor.execute(query, (document,))
        sents = cursor.fetchall()
        return [[row[0], row[1], row[2], row[3]] for row in sents]

    except (Exception, psycopg2.Error) as error:
        print("Error fetching data from PostgreSQL table", error)
        return None

    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            
def generate_negative_samples(silver_standard_files, output_name):
    """Write every sentence without an extracted span as an all-'O' sample.

    For each database in ``silver_standard_files`` all documents and their
    sentences are fetched. A sentence is written when its (document name,
    sentence id) pair does not occur in the module-level DataFrame ``df``
    and its text does not start with one of the figure/table placeholder
    prefixes. Tokens are written to ``data/<output_name>_data.txt`` and one
    ' O' label per token to ``data/<output_name>_labels.txt``. A line is
    emitted whenever a '\\n' token is reached; tokens after the last '\\n'
    token of a sentence are not written.

    Args:
        silver_standard_files: Names of the databases to traverse.
        output_name: Prefix of the two output files inside ``data/``.
    """
    skipped_prefixes = (
        'Figure data removed from full text',
        'Figure identifier and caption:',
        'Table data removed from full text',
        'Table identifier and caption:',
    )
    # Built once up front: a set lookup per sentence replaces a scan of the
    # whole DataFrame per sentence.
    known_sentences = set(zip(df['doc'], df['sent_id']))

    data_path = join('data', '{}_data.txt'.format(output_name))
    labels_path = join('data', '{}_labels.txt'.format(output_name))
    with open(data_path, "w") as out_data, open(labels_path, "w") as out_labels:
        for silver_data_split in silver_standard_files:
            silver_docs = get_docs(silver_data_split)
            for document in silver_docs:
                doc_key = str(document[1])
                sents = get_sentences(silver_data_split, document[0])
                for sent in sents:
                    if (doc_key, sent[0]) in known_sentences:
                        continue
                    if sent[3].startswith(skipped_prefixes):
                        continue
                    # found an unknown sample
                    tokens = []
                    for word in sent[2]:
                        if word != '\n':
                            tokens.append(word)
                            continue
                        # Every token is preceded by a single space,
                        # including the first one of the line.
                        out_data.write(''.join(' ' + token for token in tokens) + '\n')
                        out_labels.write(' O' * len(tokens) + '\n')
                        tokens = []

In [None]:
# Write the negative samples to data/neg_silver_samples_{data,labels}.txt.
generate_negative_samples(
    silver_standard_files=silver_standard_files,
    output_name='neg_silver_samples',
)

## Transform positive samples to BIO 

Here we have to work sentence-wise again. So we iterate over all documents and all of their sentences, and then transform each sentence into BIO format, including all spans that need to be annotated.

In [None]:
def create_document_database_mapping():
    """Build a lookup from document name to the database that contains it.

    Every entry of ``data/`` whose name starts with ``silver_data`` is
    treated as a directory. Each file in it that does not start with
    ``sent`` contributes one key (the file name up to the first ``.txt``)
    mapped to the directory name with ``data`` replaced by ``standard``.

    Returns:
        A dict mapping document names to database names.
    """
    mapping = {}
    for silver_set in listdir('data/'):
        if not silver_set.startswith('silver_data'):
            continue
        database_name = re.sub('data', 'standard', silver_set)
        for file_name in listdir(join('data', silver_set)):
            if file_name.startswith('sent'):
                continue
            document_name = file_name.split('.txt')[0]
            mapping[document_name] = database_name
    return mapping
    
def get_words(database_name, document, sentence):
    """Fetch the text and the token list of one sentence of one document.

    Args:
        database_name: Name of the PostgreSQL database to query.
        document: Document name, matched against ``document.name``.
        sentence: Sentence id, matched against ``sentence.id``.

    Returns:
        A ``(text, words)`` tuple taken from the first matching row, or
        ``None`` when connecting or querying failed or no row matched; the
        error is printed in that case.
    """
    # NOTE(review): the connection credentials are hard-coded; consider
    # reading them from configuration.
    # Both handles are pre-set so the cleanup in ``finally`` cannot hit an
    # unbound name when connect() or cursor() itself raises.
    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(user="ds626",
                                      password="snorkel",
                                      host="127.0.0.1",
                                      port="5432",
                                      database=database_name)

        cursor = connection.cursor()
        query = "SELECT text, words FROM sentence, document WHERE sentence.document_id=document.id AND document.name=%s AND sentence.id=%s;"

        cursor.execute(query, (document,sentence))
        sents = cursor.fetchall()
        text, words = sents[0]
        return text, words

    except (Exception, psycopg2.Error) as error:
        print("Error fetching data from PostgreSQL table", error)
        return None

    finally:
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()

def _bio_label(token_start, spans):
    """Return the BIO label of a token that starts at ``token_start``."""
    for beg_off, end_off in spans:
        if token_start == beg_off:
            return 'B-software'
        if beg_off <= token_start <= end_off:
            return 'I-software'
    return 'O'


def create_pos_bio(df, name, document_database_mapping):
    """Write all sentences that contain extracted spans in BIO format.

    For every (doc, sent_id) combination in ``df`` the sentence text and its
    tokens are fetched with ``get_words``. Each token gets exactly one label:
    'B-software' if it starts at the begin offset of a span, 'I-software' if
    it starts inside a span (end offset inclusive), and 'O' otherwise.
    Tokens go to ``data/<name>_data.txt`` and labels to
    ``data/<name>_labels.txt``. A line is emitted whenever a '\\n' token is
    reached; tokens after the last '\\n' token of a sentence are not written.

    Args:
        df: DataFrame with the columns doc, sent_id, beg_off and end_off.
        name: Prefix of the two output files inside ``data/``.
        document_database_mapping: Dict mapping each document name to the
            database that holds it.

    Raises:
        KeyError: If a document name is missing from the mapping.
        IndexError: If a token cannot be found in the remaining sentence text.
    """
    data_path = join('data', '{}_data.txt'.format(name))
    labels_path = join('data', '{}_labels.txt'.format(name))
    with open(data_path, "w") as out_data, open(labels_path, "w") as out_labels:
        doc_names = set(df['doc'])
        for idx, doc_id in enumerate(doc_names):
            if idx % 500 == 0:
                print("Working on doc {}: {}".format(idx, doc_id))
            doc_sentences = df.loc[df.doc == doc_id]
            for sent_id in set(doc_sentences['sent_id']):
                text, words = get_words(document_database_mapping[doc_id], doc_id, sent_id)
                sentence_rows = doc_sentences.loc[doc_sentences.sent_id == sent_id]
                spans = [
                    (int(row['beg_off']), int(row['end_off']))
                    for _, row in sentence_rows.iterrows()
                ]

                words_string = ''
                labels_string = ''
                current_off = 0  # offset of ``remaining_text`` within ``text``
                remaining_text = text
                for word in words:
                    if word == ' ':
                        continue
                    if word == '\n':
                        out_data.write(words_string + '\n')
                        out_labels.write(labels_string + '\n')
                        # Start a fresh line; without the reset the tokens
                        # already written would be emitted again.
                        words_string = ''
                        labels_string = ''
                        continue

                    # Locate the token in the untokenised text. Characters
                    # skipped before it count towards its offset.
                    parts = remaining_text.split(word, maxsplit=1)
                    word_start = current_off + len(parts[0])
                    remaining_text = parts[1]

                    words_string += ' ' + word
                    # Exactly one label per token keeps both files aligned
                    # even when several spans match the same token.
                    labels_string += ' ' + _bio_label(word_start, spans)

                    unpadded = remaining_text.lstrip(' ')
                    current_off = word_start + len(word) + len(remaining_text) - len(unpadded)
                    remaining_text = unpadded

In [None]:
# Build the document-name -> database-name lookup that create_pos_bio needs
# to find the database holding each document.
document_database_mapping = create_document_database_mapping()

In [None]:
# Write the positive samples to data/pos_silver_samples_{data,labels}.txt.
create_pos_bio(
    df=df,
    name='pos_silver_samples',
    document_database_mapping=document_database_mapping,
)