In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 
import csv
import spacy_stanza

import os
# os.getcwd()

In [2]:
# load nlp stanza
nlp = spacy_stanza.load_pipeline("nl")

2025-02-13 22:58:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 19.6MB/s]                    
2025-02-13 22:58:46 INFO: Loading these models for language: nl (Dutch):
| Processor | Package         |
-------------------------------
| tokenize  | alpino          |
| mwt       | alpino          |
| pos       | alpino_charlm   |
| lemma     | alpino_nocharlm |
| depparse  | alpino_charlm   |
| ner       | conll02         |

2025-02-13 22:58:46 INFO: Using device: cuda
2025-02-13 22:58:46 INFO: Loading: tokenize
2025-02-13 22:58:46 INFO: Loading: mwt
2025-02-13 22:58:46 INFO: Loading: pos
2025-02-13 22:58:46 INFO: Loading: lemma
2025-02-13 22:58:47 INFO: Loading: depparse
2025-02-13 22:58:47 INFO: Loading: ner
2025-02-13 22:58:47

# Read and Merge DFs

In [None]:
# Read the all annotated NOS articles

df = pd.read_csv('coded_df_actors_full.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Read articles 
df_text = pd.read_csv('final_nosarticles.csv',
                      sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
df_text['page_id'] = df_text['page_id'].astype(int)

df_text = df_text[['page_id', 'Text']].drop_duplicates()

# rename page_id to article_id
df_text.rename(columns = {'page_id': 'article_id'}, inplace = True)

# remove line break
df_text['Text'] = df_text['Text'].str.replace('[LINE_BREAK]', '\n ')

In [None]:
# add text to the annotation df
df = pd.merge(df, df_text, on = 'article_id', how = 'left')

In [None]:
# drop if actor_type is Geopolitieke entiteit
df = df[df.actor_type != 'Geopolitieke entiteit']

In [9]:
unique_articles_df = df[['article_id', 'Text']].drop_duplicates()

# Get Named Entity Names

In [10]:
nlp = spacy_stanza.load_pipeline("nl")


2025-02-13 22:59:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 26.2MB/s]                    
2025-02-13 22:59:34 INFO: Loading these models for language: nl (Dutch):
| Processor | Package         |
-------------------------------
| tokenize  | alpino          |
| mwt       | alpino          |
| pos       | alpino_charlm   |
| lemma     | alpino_nocharlm |
| depparse  | alpino_charlm   |
| ner       | conll02         |

2025-02-13 22:59:34 INFO: Using device: cuda
2025-02-13 22:59:34 INFO: Loading: tokenize
2025-02-13 22:59:34 INFO: Loading: mwt
2025-02-13 22:59:34 INFO: Loading: pos
2025-02-13 22:59:34 INFO: Loading: lemma
2025-02-13 22:59:34 INFO: Loading: depparse
2025-02-13 22:59:34 INFO: Loading: ner
2025-02-13 22:59:35

In [13]:
def tag_text_stanza(text):
    # Create a Sentence object from the text
    doc = nlp(text)
    
    # Get the tagged spans
    spans = doc.ents

    # drop if entity label is not in ['ORG','PER']
    spans = [span for span in spans if span.label_ in ['ORG', 'PER']]
    
    # Create a list of tuples containing the entity text and label
    entities = [(ent.text, ent.label_) for ent in spans]

    # Create an empty dictionary to store unique combinations
    unique_entities_tags = {}

    # Iterate through the list of named entity tag combinations
    for entity, tag in entities:
        # Check if the combination exists in the dictionary
        if (entity, tag) not in unique_entities_tags:
            # If it doesn't exist, add it to the dictionary
            unique_entities_tags[(entity, tag)] = True

    # Convert the keys of the dictionary back into a list
    unique_entities = list(unique_entities_tags.keys())

    return unique_entities

In [14]:
# get the first text as test
text = unique_articles_df['Text'].iloc[2]

# tag the text
unique_entities = tag_text_stanza(text)
unique_entities

[('Marion Koopmans', 'PER'),
 ('Wereldgezondheidsorganisatie', 'ORG'),
 ('Koopmans', 'PER'),
 ('Nieuwsuur', 'ORG'),
 ('RIVM', 'ORG'),
 ('Victor Lamme', 'PER'),
 ('Bas van den Putte', 'PER')]

In [15]:
unique_articles_df['entities_stanza'] = unique_articles_df['Text'].apply(tag_text_stanza)

In [None]:
df_exploded = unique_articles_df.explode('entities_stanza')

In [17]:
# 2. Split the 'entities' tuples into two separate columns: 'entity_name' and 'entity_type'
df_exploded[['entity_name', 'entity_type']] = pd.DataFrame(df_exploded['entities_stanza'].tolist(), index=df_exploded.index)
print(df_exploded.shape)

(4031, 5)


# Clean Named Entities

In [None]:
# create a function to check if Text contains kabinet in its lower case form because kabinet cannot be found in the NER model
def check_kabinet(text):
    if 'kabinet' in text.lower():
        return True
    else:
        return False
    
# if the text contains kabinet, then add entity_name as 'Het kabinet' and entity_type as 'ORG'
df_exploded['kabinet'] = df_exploded['Text'].apply(check_kabinet)

df_exploded['kabinet'].value_counts()

kabinet
False    2572
True     1459
Name: count, dtype: int64

In [None]:
# get kabinet into a new df
df_kabinet = df_exploded[df_exploded['kabinet'] == True]
# drop entities_stanza and entity_name and entity_type
df_kabinet = df_kabinet.drop(columns = ['entities_stanza', 'entity_name', 'entity_type'])

df_kabinet['entity_name'] = 'Het kabinet'
df_kabinet['entity_type'] = 'ORG'

In [21]:
# concat df_exploded and df_kabinet
df_exploded = pd.concat([df_exploded, df_kabinet], axis = 0)
print(df_exploded.shape)

(5490, 6)


In [None]:
# drop kabinet column
df_exploded = df_exploded.drop(columns = 'kabinet')

In [23]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('dutch')
print(stopwords)

# extend the stopwords with their title case form
stopwords_title = [word.title() for word in stopwords]
stopwords.extend(stopwords_title)
print(stopwords)

['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere']
['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elifk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
df_exploded.isnull().sum()

# see where the null values are
df_exploded[df_exploded['entity_name'].isnull()]

# drop if entity_name is null
df_exploded = df_exploded.dropna(subset = ['entity_name'])

In [25]:
import re

# Function to generate name variations (without stopword filtering)
def generate_name_variations(name):
    if not isinstance(name, str):
        return []
    
    # Convert to lowercase and split into parts
    # Keep only alphanumeric words (no symbols or numbers)
    parts = re.findall(r'\b\w+\b', name)
    
    # Remove stopwords from the parts
    parts_filtered = [part for part in parts]
    
    variations = []
    
    if parts_filtered:
        # Use the entire filtered name and the individual parts as variations
        variations.append(' '.join(parts_filtered))  # Add filtered parts as a single variation
        variations.extend(parts_filtered)  # Add individual parts as variations

    return variations

# Create a new column for name variations
df_exploded['name_variations'] = df_exploded['entity_name'].apply(lambda x: generate_name_variations(x))

# Drop from name_variations if instance of name_variations matches stopwords
df_exploded['name_variations'] = df_exploded['name_variations'].apply(lambda x: [variation for variation in x if variation not in stopwords])
df_exploded['name_variations'] = df_exploded['name_variations'].apply(lambda x: [variation for variation in x if re.match(r'\b\w+\b', variation)])
df_exploded['name_variations'] = df_exploded['name_variations'].apply(lambda x: [variation for variation in x if len(variation) > 1])
# drop if variation is number
df_exploded['name_variations'] = df_exploded['name_variations'].apply(lambda x: [variation for variation in x if not variation.isnumeric()])
# keep only unique variations
df_exploded['name_variations'] = df_exploded['name_variations'].apply(lambda x: list(set(x)))


In [None]:
# see where entity name has punctuation
df_exploded[df_exploded['entity_name'].str.contains(r'[^\w\s]')]

In [27]:
# Helper function to match whole words
def is_full_word_match(variation, name):
   
    # Use word boundaries to ensure full word matching
    return bool(re.search(r'\b' + re.escape(variation) + r'\b', name))

# Function to map names within each article (case-insensitive, full-word match, and entity type check)
def map_names_within_article(article_df):
    name_mapping = {}
    
    # Step 1: Populate the name mapping with the longest canonical names
    for index, row in article_df.iterrows():
        variations = row['name_variations']
        canonical_name = row['entity_name']
        entity_type = row['entity_type']  # Get the entity type

        for variation in variations:            
            # Check both variation and entity_type match
            for mapped_name, (current_canonical, current_type) in name_mapping.items():
                if is_full_word_match(variation, mapped_name) or is_full_word_match(mapped_name, variation):
                    if entity_type == current_type:  # Ensure entity types match
                        if len(canonical_name) > len(current_canonical):
                            name_mapping[mapped_name] = (canonical_name, entity_type)
                    break
            else:
                name_mapping[variation] = (canonical_name, entity_type)

    # Step 2: Replace entity names based on the canonical mapping (with full-word and entity-type check)
    article_df['entity_name_new'] = article_df.apply(
        lambda row: next(
            (canonical for variation, (canonical, type_) in name_mapping.items()
             if (is_full_word_match(variation, row['entity_name']) or is_full_word_match(row['entity_name'], variation)) 
             and type_ == row['entity_type']), 
            row['entity_name']
        ), axis=1
    )
    
    return article_df


In [None]:
df_exploded_test = df_exploded.groupby(['article_id', 'entity_type'], group_keys=False).apply(map_names_within_article)

In [31]:
df_exploded_test.shape

(5483, 7)

In [137]:
# see where entity_name and entity_name_new are different
differences = df_exploded_test[df_exploded_test['entity_name'] != df_exploded_test['entity_name_new']]
differences.shape

(552, 7)

In [None]:
# see in differences the entity_type ORG for check
# differences[differences['entity_type'] == 'ORG'][['entity_name', 'entity_name_new']].values

In [None]:
# see in differences the entity_type PER for check
# differences[differences['entity_type'] == 'PER'][['entity_name', 'entity_name_new']].values

### Note: mapping names of organizations do not work here, only Kamer and WHO makes sense

In [140]:
# Separate the data into persons (PER) and others (ORG, etc.)
df_persons = df_exploded[df_exploded['entity_type'] == 'PER']
df_others = df_exploded[df_exploded['entity_type'] != 'PER']

# Apply the mapping only to persons
df_persons_updated = df_persons.groupby('article_id', group_keys=False).apply(map_names_within_article)

In [None]:
# in df_others if the entity_name is Kamer then change it to Tweede Kamer or if the entity_name is WHO then change it to Wereldgezondheidsorganisatie WHO
df_others['entity_name_new'] = df_others['entity_name'].apply(lambda x: 'Tweede Kamer' if x == 'Kamer' else 'Wereldgezondheidsorganisatie WHO' if x == 'WHO' else x)

In [None]:
# check where entity_name and entity_name_new are different
differences_others = df_others[df_others['entity_name'] != df_others['entity_name_new']]

In [143]:
# Combine the processed persons back with the rest of the data
df_final = pd.concat([df_persons_updated, df_others]).sort_index()

In [145]:
print(df_final.shape)

(5483, 7)


In [None]:
duplicated = df_final[df_final.duplicated(subset = ['article_id', 'entity_name_new', 'entity_type'], keep = False)]
print(duplicated.shape)

# save duplicated to excel for further inspection
duplicated.to_excel('coref_resolution/duplicated_entities.xlsx', index = False)

In [149]:
# drop duplicates. keep first
df_final = df_final.drop_duplicates(subset = ['article_id', 'entity_name', 'entity_name_new'], keep = 'first')
print(df_final.shape)

(4155, 7)


In [151]:
df_final.groupby(['article_id', 'entity_name_new'])['entity_type'].nunique().value_counts()


entity_type
1    3704
Name: count, dtype: int64

In [152]:
# drop entity_name and change entity_name_new to entity_name
df_final = df_final.drop(columns = ['entity_name'])
df_final.rename(columns = {'entity_name_new': 'entity_name'}, inplace = True)

In [154]:
# Define a function to get the length of the name_variations list
df_final['name_variations_length'] = df_final['name_variations'].apply(len)

# Sort the DataFrame by article_id, entity_name, and the length of name_variations in descending order
df_sorted = df_final.sort_values(by=['article_id', 'entity_name', 'name_variations_length'], ascending=[True, True, False])

# Drop duplicates by keeping the first occurrence, which will be the one with the longest name_variations
df_unique = df_sorted.drop_duplicates(subset=['article_id', 'entity_name'], keep='first')

# Drop the helper column
df_unique = df_unique.drop(columns=['name_variations_length'])

print(df_unique.shape)


(3704, 6)


In [156]:
counts_entitytypes = df_unique.groupby(['article_id','entity_name'])['entity_type'].nunique().reset_index()
counts_entitytypes.entity_type.value_counts()

entity_type
1    3704
Name: count, dtype: int64

In [None]:
# sort the df based on article_id, entity_name, and entity_type
df_unique = df_unique.sort_values(['article_id', 'entity_name', 'entity_type'])

# Extract Sentences Mentioning the Named Entity

In [159]:
def extract_sentences(text, name_variations):
    sentences = sent_tokenize(text, language='dutch')
    relevant_sentences = [sentence for sentence in sentences if any(name in sentence.lower() for name in name_variations)]
    return relevant_sentences

df_unique['relevant_sentences'] = df_unique.apply(lambda x: extract_sentences(x['Text'], x['name_variations']), axis=1)


In [161]:
df_unique.head()

# make exploded_sentences a string by joining the list of sentences
df_unique['relevant_sentences_string'] = df_unique['relevant_sentences'].apply(lambda x: ' \n'.join(x))
df_unique.relevant_sentences.values[0]

['In heel China mogen reisbureaus bovendien per direct geen binnen- of buitenlandse vakanties meer verkopen.',
 'Het ministerie van Buitenlandse Zaken zegt dat de ambassade in Peking de situatie op de voet volgt.']

In [162]:
print(df_unique.relevant_sentences_string.values[0])

In heel China mogen reisbureaus bovendien per direct geen binnen- of buitenlandse vakanties meer verkopen. 
Het ministerie van Buitenlandse Zaken zegt dat de ambassade in Peking de situatie op de voet volgt.


In [None]:
# save the df to csv 
# df_unique.to_csv('path_to_save.csv', index = False, sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding = 'utf-8')

# Create the Quote Classification DF

In [165]:
# limit to only articles about covid
df_covid = df[df['about_covid'] == 1]
len(df_covid)

2666

In [177]:
# create variable measures by taking the max of all measure_ variables
df_covid['measures'] = df_covid[['measure_1', 'measure_2', 'measure_3', 'measure_4', 'measure_5',
                        'measure_6', 'measure_7', 'measure_8', 'measure_9', 'measure_10',
                        'measure_11', 'measure_12', 'measure_13', 'measure_14', 'measure_15',
                        'measure_16', 'measure_17']].max(axis = 1)

df_covid['measures'].value_counts()

measures
0.0    1590
1.0    1076
Name: count, dtype: int64

In [178]:
# create variable positive measures by taking the max of all measure_ variables
df_covid['measures_positive'] = df_covid[['measure_1_positive', 'measure_2_positive', 'measure_3_positive', 'measure_4_positive', 'measure_5_positive',
                              'measure_6_positive', 'measure_7_positive', 'measure_8_positive', 'measure_9_positive', 'measure_10_positive',
                                'measure_11_positive', 'measure_12_positive', 'measure_13_positive', 'measure_14_positive', 'measure_15_positive',
                                'measure_16_positive', 'measure_17_positive']].max(axis = 1)

df_covid['measures_positive'].value_counts()


measures_positive
0.0    2273
1.0     393
Name: count, dtype: int64

In [179]:
# create variable negative measures by taking the max of all measure_ variables
df_covid['measures_negative'] = df_covid[['measure_1_negative', 'measure_2_negative', 'measure_3_negative', 'measure_4_negative', 'measure_5_negative',
                              'measure_6_negative', 'measure_7_negative', 'measure_8_negative', 'measure_9_negative', 'measure_10_negative',
                                'measure_11_negative', 'measure_12_negative', 'measure_13_negative', 'measure_14_negative', 'measure_15_negative',
                                'measure_16_negative', 'measure_17_negative']].max(axis = 1)

df_covid['measures_negative'].value_counts()


measures_negative
0.0    2399
1.0     267
Name: count, dtype: int64

In [180]:
# create variable neutral measures by taking the max of all measure_ variables
df_covid['measures_neutral'] = df_covid[['measure_1_neutral', 'measure_2_neutral', 'measure_3_neutral', 'measure_4_neutral', 'measure_5_neutral',
                              'measure_6_neutral', 'measure_7_neutral', 'measure_8_neutral', 'measure_9_neutral', 'measure_10_neutral',
                                'measure_11_neutral', 'measure_12_neutral', 'measure_13_neutral', 'measure_14_neutral', 'measure_15_neutral',
                                'measure_16_neutral', 'measure_17_neutral']].max(axis = 1)

df_covid['measures_neutral'].value_counts()

measures_neutral
0.0    2107
1.0     559
Name: count, dtype: int64

In [None]:
df_selected = df_covid[df_covid['coder'] == 'MainCoder'][['article_id', 'coder', 'actor_name', 'actor_type', 'directly_quoted', 'indirectly_quoted', 'actor_function', 'actor_pp', 'talks_covid_measures', 'measures','measures_positive', 'measures_negative', 'measures_neutral']].drop_duplicates()
df_selected['actor_name_normalized'] = df_selected['actor_name'].str.lower()
# drop if actor name is nan
df_selected = df_selected.dropna(subset = ['actor_name_normalized'])

In [None]:
# get name_variations for each actor_name
# Create a new column for name variations
df_selected['name_variations'] = df_selected['actor_name'].apply(lambda x: generate_name_variations(x))

# Drop from name_variations if instance of name_variations matches stopwords
df_selected['name_variations'] = df_selected['name_variations'].apply(lambda x: [variation for variation in x if variation not in stopwords])

In [None]:
# any duplicates?
duplicated = df_selected[df_selected.duplicated(subset = ['article_id', 'actor_name', 'actor_type'], keep = False)]

In [None]:
df_unique_selected = df_unique[['article_id', 'entity_name', 'entity_type', 'relevant_sentences_string', 'name_variations']]
df_unique_selected['entity_name_normalized'] = df_unique_selected['entity_name'].str.lower()
# drop name variations
df_unique_selected.drop(columns = ['name_variations'], inplace = True)

In [186]:
# see per relevant_sentences_string nr unique entity_names
df_unique_selected.groupby('relevant_sentences_string')['entity_name'].nunique().value_counts()

# get a check df where relevant_sentences_string has more than one entity_name
check_df = df_unique_selected[df_unique_selected['relevant_sentences_string'].duplicated(keep = False)]
print(check_df.shape)

(924, 5)


In [None]:
# any duplicates?
duplicated = df_unique_selected[df_unique_selected.duplicated(subset = ['article_id', 'entity_name', 'entity_type'], keep = False)]

In [188]:
print(df_unique_selected.entity_type.value_counts())

# if entity_type is PER then it is Persoon, if ORG then it is Organisatie
df_unique_selected['entity_type'] = np.where(df_unique_selected['entity_type'] == 'PER', 'Persoon', 'Organisatie')
print(df_unique_selected.entity_type.value_counts())


entity_type
ORG    2425
PER    1279
Name: count, dtype: int64
entity_type
Organisatie    2425
Persoon        1279
Name: count, dtype: int64


In [189]:
from fuzzywuzzy import fuzz

# Step 1: Merge on article_id
merged_df = pd.merge(df_selected, df_unique_selected, on='article_id', how='inner')

# Step 2: Calculate string similarity and keep matches
def match_names(row):
    # Get actor name and entity name
    actor_name = row['actor_name_normalized']
    entity_name = row['entity_name_normalized']
    # Calculate similarity
    similarity = fuzz.token_set_ratio(actor_name, entity_name)
    return similarity

# Apply the matching function
merged_df['similarity'] = merged_df.apply(match_names, axis=1)

# Step 3: Filter based on a threshold, e.g., 80, and ensure actor_type matches entity_type
threshold = 80
final_matches = merged_df[(merged_df['similarity'] >= threshold)]


In [None]:
# check the matches below 90 similarity
# final_matches[final_matches['similarity'] < 90]

In [192]:
print(final_matches.shape)
print(df_selected.shape)
print(df_unique_selected.shape)    

(1653, 20)
(1838, 15)
(3704, 5)


In [None]:
# save final_matches to explore
final_matches.to_excel('path_to_final_matches.xlsx', index = False)

In [194]:
# get the actors who are not in the final_matches based on article_id and actor_name
len(df_selected[~df_selected['actor_name'].isin(final_matches['actor_name'])])

358

In [None]:
notmatched = df_selected[~df_selected['actor_name'].isin(final_matches['actor_name'])]

In [199]:
# make a column quoted == 1 for final_matches
final_matches['quoted'] = 1

In [None]:
# select the df that will be used for the model
model_df = final_matches[['article_id', 'entity_name', 'entity_type', 'relevant_sentences_string', 'quoted', 'directly_quoted', 'indirectly_quoted', 'actor_function','actor_pp',
                          'talks_covid_measures', 'measures', 'measures_positive', 'measures_negative', 'measures_neutral']]

In [207]:
# get the entities that are not in final_matches
df_unique_selected[~df_unique_selected['entity_name'].isin(final_matches['entity_name'])].head()

df_not_quoted = df_unique_selected[~df_unique_selected['entity_name'].isin(final_matches['entity_name'])]

In [None]:
df_not_quoted['quoted'] = 0
df_not_quoted['directly_quoted'] = 0
df_not_quoted['indirectly_quoted'] = 0

# select the relevant columns to match with model_df
df_not_quoted = df_not_quoted[['article_id', 'entity_name', 'entity_type', 'relevant_sentences_string', 'quoted', 'directly_quoted', 'indirectly_quoted']]

In [209]:
df_not_quoted.shape

(1124, 7)

In [None]:
# combine model_df and df_not_quoted
model_df = pd.concat([model_df, df_not_quoted])

# create input text by combining entity_name : and relevant_sentences_string
model_df['input_text'] = model_df['entity_name'] + ': ' + model_df['relevant_sentences_string']

In [None]:
# save the df
model_df.to_csv('path_to_model_df.csv', index = False, sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding = 'utf-8')

In [213]:
model_df.quoted.value_counts()

quoted
1    1653
0    1124
Name: count, dtype: int64

# Read the Model DF After Manual Checks

In [None]:
model_df = pd.read_csv('path_to_model_df_checked.csv', sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
# make article_id an integer
model_df['article_id'] = model_df['article_id'].astype(int)

In [218]:
model_df.quoted.value_counts()

quoted
1.0    1653
0.0    1124
Name: count, dtype: int64

In [None]:
model_df.quoted_check.value_counts() # correct column

quoted_check
1.0    1681
0.0    1096
Name: count, dtype: int64