In [2]:
import pandas as pd
import re
texts = pd.read_csv('FullText_ALL.csv')
texts.columns
texts = texts[1:]

In [3]:
# Define the list of metric keywords to search for
metrics = ["f1 score", "accuracy", "precision", "recall", "auc", "mean squared error", "r2 score", "mae"]

# Compile a regex pattern to search for any of the metric keywords, case insensitive
pattern = re.compile(r'\b(' + '|'.join(metrics) + r')\b', re.IGNORECASE)

# Function to extract surrounding sentences and detect presence of metrics
def extract_context_and_detect_metrics(text):
    # Return empty list and dictionary with zeros if text is NaN
    if pd.isna(text):
        return [], {metric: 0 for metric in metrics}
    
    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?]) +', text)
    matches = []
    metric_presence = {metric: 0 for metric in metrics}  # Dictionary to store 0 or 1 for each metric
    
    for i, sentence in enumerate(sentences):
        # If the sentence contains any of the metrics
        if pattern.search(sentence):
            # Extract two sentences before and after
            context = sentences[max(0, i - 2):min(len(sentences), i + 3)]
            matches.append(" ".join(context))
            
            # Check for each metric and mark as present (1)
            for metric in metrics:
                if re.search(r'\b' + re.escape(metric) + r'\b', sentence, re.IGNORECASE):
                    metric_presence[metric] = 1
    
    return matches, metric_presence

# Apply the function to the "Full_Text" column
texts[['Metric_Context', 'Metric_Presence']] = texts['Full_Text'].apply(
    lambda x: pd.Series(extract_context_and_detect_metrics(x))
)

# Split the Metric_Presence dictionary into individual columns
metric_columns = pd.DataFrame(texts['Metric_Presence'].tolist(), index=texts.index)
texts = pd.concat([texts, metric_columns], axis=1).drop(columns=['Metric_Presence'])

# Filter out rows with no matches and display the resulting dataframe
texts_with_context = texts[texts['Metric_Context'].apply(bool)]
print(texts_with_context[['Full_Text', 'Metric_Context'] + metrics])

                                             Full_Text  \
1    3D Facial Expression Recognition Based on Auto...   
2    3D Facial Expression Recognition Based on Prim...   
6    BREUER,KIMMEL:ADEEPLEARNINGPERSPECTIVEONFACIAL...   
7    1\nA Deeper Look at Facial Expression Dataset ...   
8    ## Title: BP4D-Spontaneous: a high-resolution ...   
..                                                 ...   
218  The Elements of End-to-end Deep Face Recogniti...   
222  ## Title: Three convolutional neural network m...   
224  TorontoCity: Seeing the World with a Million E...   
225  AcceptedasaworkshopcontributionatICLR2015\nTRA...   
228  1\nUtilizing Deep Learning Towards Multi-modal...   

                                        Metric_Context  f1 score  accuracy  \
1    [An attractive scheme is that we have\na large...         0         1   
2    [v1,v2are the principal directions at\npointp....         0         1   
6    [We\nuseOpenCV[4]forallimageoperations.\n3 Res...         0     

In [4]:
print(texts_with_context.columns)

Index(['Unnamed: 0', 'ID', 'Title', 'Authors', 'Year', 'Cited By',
       'Detected_Dataset', 'Detected_Topic', 'Abstract', 'Journal', 'URL',
       'Full_Text', 'Metric_Context', 'f1 score', 'accuracy', 'precision',
       'recall', 'auc', 'mean squared error', 'r2 score', 'mae'],
      dtype='object')


In [5]:
texts_with_context.to_csv('full_text_with_metrics.csv', index=False)