In [10]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# CSV file to load; the relative path is resolved against the notebook's working directory
file_path = 'FullText_ALL.csv'
# Raw table as read from disk; later cells use its 'Full_Text' column
FullText_df = pd.read_csv(file_path)
# Preview the first rows
FullText_df.head()

In [None]:
# Work on a copy so that overwriting 'Full_Text' with its cleaned version below
# leaves the text in FullText_df as it was loaded
df = FullText_df.copy()

def clean_text(text):
    """Normalise one full-text entry into a single line of whitespace-collapsed text.

    Parameters
    ----------
    text : str or missing value
        Raw text of one document. Any missing value (``None``, ``np.nan``,
        ``float('nan')``, ``pd.NA``) or the placeholder ``"Nothing found"``
        is treated as "no text".

    Returns
    -------
    str or float
        The cleaned text, or ``np.nan`` when there is no text to clean.
    """
    # pd.isna covers every missing-value flavour; an identity check against
    # np.nan misses NaN objects that are not the np.nan singleton as well as None.
    if pd.isna(text) or text == "Nothing found":
        return np.nan

    # Strip section-heading keywords and "Figure N" / "Table N" labels (only the
    # matched words are removed, not the content that follows), plus "DOI: ..."
    # up to the end of its line.
    text = re.sub(r'\b(References|Cited by|Acknowledgments|Table of Contents|Outline|Figure \d+|Table \d+|DOI: .*)', '', text, flags=re.IGNORECASE)
    # Drop '# Title:' / '## Title:' markers together with the rest of their line
    text = re.sub(r'##?\s?Title:.*?\n', '', text)
    # Drop URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Collapse every run of whitespace (including newlines) into one space
    text = re.sub(r'\s+', ' ', text)

    # Remove figure/table references that only became contiguous after the
    # whitespace collapse (e.g. "Figure\n3"). The match swallows the whitespace
    # on both sides, so substitute a single space to keep neighbouring words apart.
    text = re.sub(r'\s*Figure\s*\d+\s*', ' ', text)
    text = re.sub(r'\s*Table\s*\d+\s*', ' ', text)

    # The substitutions above can leave doubled or edge spaces behind
    return re.sub(r'\s+', ' ', text).strip()

# Overwrite 'Full_Text' in the working copy with the output of clean_text;
# entries for which clean_text returns NaN become missing values in the column
df['Full_Text'] = df['Full_Text'].apply(clean_text)

# Preview the cleaned texts
df.head()


In [None]:
# Keep only the rows that still have text after cleaning. The explicit .copy()
# makes df_no_nan an independent frame, so adding a column to it below is a
# plain assignment and not a write into a filtered selection of df
# (which is what triggers pandas' SettingWithCopyWarning).
df_no_nan = df.dropna(subset=['Full_Text']).copy()
# Number of characters in each cleaned text
df_no_nan['Text_Length'] = df_no_nan['Full_Text'].apply(len)

# Plot the text length against the original row index
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_no_nan.index, df_no_nan['Text_Length'], marker='o', linestyle='-')
ax.set_title('Length of Full Text Entries (Non-NaN Only)')
ax.set_xlabel('Row Index')
ax.set_ylabel('Text Length (in characters)')
ax.grid(True)
plt.show()

In [None]:
# Rows whose cleaned text has at most this many characters are selected below.
# NOTE(review): the previous comment described this filter as "length is zero",
# but the comparison uses a 10000-character cutoff — confirm which was intended.
SHORT_TEXT_MAX_CHARS = 10000

# The variable name is kept as-is; it holds every row at or under the cutoff,
# not only rows with empty text.
empty_text_rows = df_no_nan[df_no_nan['Text_Length'] <= SHORT_TEXT_MAX_CHARS]

# Display the filtered rows
empty_text_rows

In [None]:
# (rows, columns) of the frame that remains after dropping rows without text
df_no_nan.shape

In [None]:
def _first_text_for_source(frame, keyword):
    """Return a list holding the first non-missing 'Full_Text' whose 'URL' contains `keyword`.

    The match is a case-insensitive literal substring test. The list is empty
    when no row matches, so callers always get 0 or 1 items.
    """
    # na=False: a row with a missing URL counts as "no match"; without it the
    # mask contains NaN and boolean indexing raises.
    from_source = frame['URL'].str.contains(keyword, case=False, regex=False, na=False)
    return frame.loc[from_source, 'Full_Text'].dropna().iloc[:1].tolist()


# Pick one example text per source by looking at URL patterns
ieee_examples = _first_text_for_source(df_no_nan, "ieee")
arxiv_examples = _first_text_for_source(df_no_nan, "arxiv")
sd_examples = _first_text_for_source(df_no_nan, "sciencedirect")

# Show the examples side by side. Each column is wrapped in a Series so that a
# source with no example yields NaN in its column instead of making the
# DataFrame constructor fail on lists of unequal length.
examples_df = pd.DataFrame({
    'IEEE Texts': pd.Series(ieee_examples, dtype=object),
    'Arxiv Texts': pd.Series(arxiv_examples, dtype=object),
    'ScienceDirect Texts': pd.Series(sd_examples, dtype=object)
})

# Do not truncate long cell contents when frames are displayed
# (this is a global pandas option and stays in effect for later cells)
pd.set_option('display.max_colwidth', None)

examples_df

In [None]:
# Metric names to look for in the texts (matched case-insensitively as whole words)
metrics = [
    "f1 score",
    "accuracy",
    "precision",
    "recall",
    "auc",
    "mean squared error",
    "r2 score",
    "mae",
]

# Single regex that matches any one of the metric names between word boundaries
pattern = re.compile(rf"\b({'|'.join(metrics)})\b", flags=re.IGNORECASE)

# Function to extract surrounding sentences and detect presence of metrics
def extract_context_and_detect_metrics(text):
    """Collect the context around metric mentions and flag which metrics occur.

    Relies on the module-level ``metrics`` list and the compiled ``pattern``.

    Parameters
    ----------
    text : str or missing value
        Full text of one document.

    Returns
    -------
    tuple[list[str], dict[str, int]]
        ``(contexts, presence)``. ``contexts`` has one entry per sentence that
        mentions a metric: that sentence joined (with single spaces) to up to
        two sentences before and two after it. ``presence`` maps every name in
        ``metrics`` to 1 if it occurs in at least one such sentence, else 0.
        A missing ``text`` yields ``([], {metric: 0, ...})``.
    """
    metric_presence = {metric: 0 for metric in metrics}
    if pd.isna(text):
        return [], metric_presence

    # Split after sentence-ending punctuation followed by ANY whitespace.
    # Splitting on spaces alone never separates sentences that are divided by a
    # line break, which leaves whole paragraphs as one "sentence".
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Build the per-metric whole-word patterns once per call rather than once
    # per (sentence, metric) pair.
    metric_patterns = {
        metric: re.compile(r'\b' + re.escape(metric) + r'\b', re.IGNORECASE)
        for metric in metrics
    }

    matches = []
    for i, sentence in enumerate(sentences):
        # Cheap gate: skip sentences that mention none of the metrics
        if not pattern.search(sentence):
            continue

        # Two sentences before and after; slicing clamps at the end of the list
        matches.append(" ".join(sentences[max(0, i - 2):i + 3]))

        # Record which metrics this sentence mentions (already-found ones are skipped)
        for metric, metric_pattern in metric_patterns.items():
            if not metric_presence[metric] and metric_pattern.search(sentence):
                metric_presence[metric] = 1

    return matches, metric_presence

# Run the extractor once per document.
# NOTE(review): this reads the raw text in FullText_df, not the cleaned text in
# df — confirm that is intended.
extraction_results = [
    extract_context_and_detect_metrics(text) for text in FullText_df['Full_Text']
]

# Store both outputs as object columns. Building each column explicitly avoids
# creating a temporary Series per row and also works when the frame has no rows.
FullText_df['Metric_Context'] = pd.Series(
    [contexts for contexts, _ in extraction_results],
    index=FullText_df.index,
    dtype=object,
)
FullText_df['Metric_Presence'] = pd.Series(
    [presence for _, presence in extraction_results],
    index=FullText_df.index,
    dtype=object,
)

# Split the Metric_Presence dictionaries into one 0/1 column per metric.
# columns=metrics guarantees every metric column exists, even for an empty frame.
metric_columns = pd.DataFrame(
    FullText_df['Metric_Presence'].tolist(),
    index=FullText_df.index,
    columns=metrics,
)
texts = pd.concat([FullText_df, metric_columns], axis=1).drop(columns=['Metric_Presence'])

# Keep only documents with at least one metric context. The explicit bool cast
# keeps the mask a boolean indexer even when there are no rows.
has_context = texts['Metric_Context'].apply(bool).astype(bool)
texts_with_context = texts[has_context]
print(texts_with_context[['Full_Text', 'Metric_Context'] + metrics])