In [None]:
import pandas as pd
import xlsxwriter
import re

In [None]:
# Open the screening workbook with the openpyxl engine; individual sheets
# are parsed from this handle in the following cells.
screening_path = "tables/screening_table.xlsx"
excel = pd.ExcelFile(screening_path, engine="openpyxl")

In [None]:
# Load the "Charlotte" sheet of the screening workbook into a DataFrame.
sheet_to_load = "Charlotte"
df = excel.parse(sheet_name=sheet_to_load)

## Adding the two columns for analysis

The highlighting function from xlsxwriter doesn't work if there are missing values, so placeholder values (`"x"`) are used instead of leaving cells empty.

In [None]:
# Add the two analysis columns, pre-filled with the placeholder "x" so that
# no cell in them is empty when the sheet is written out.
for analysis_column in ("included", "article_type"):
    df[analysis_column] = "x"

In [None]:
# Replace missing DOIs with the placeholder "x" so the column has no gaps.
doi_without_gaps = df["doi"].fillna("x")
df["doi"] = doi_without_gaps

## Creating Excel writer using xlsxwriter

In [None]:
# Destination file for the highlighted screening table.
output_file = "second_test.xlsx"

# pandas builds the underlying XlsxWriter workbook; keep a reference to it
# so that formats and worksheets can be created on it directly.
writer = pd.ExcelWriter(output_file, engine="xlsxwriter")
workbook = writer.book

# Empty sheet that the header and data rows are written into below.
worksheet = workbook.add_worksheet("Charlotte")

### Define formats for highlighting

In [None]:
# Two font-colour formats: red for highlighted keyword matches, black for
# all other text. Created in the same order as before (red, then black).
red_format, black_format = (
    workbook.add_format({"font_color": font_colour})
    for font_colour in ("red", "black")
)

### Writing header row

In [None]:
# The column names go into the first worksheet row (row 0), one per column,
# using the plain black format.
HEADER_ROW = 0
for header_position, header_text in enumerate(df.columns):
    worksheet.write(HEADER_ROW, header_position, header_text, black_format)

### Keywords to search for, as a regular expression

In [None]:
# Case-insensitive search for either keyword phrase; the single capture
# group holds whichever phrase matched.
pattern = re.compile(
    r'(environmental injustice|environmental justice)',
    flags=re.IGNORECASE,
)

### Function

In [None]:
def highlight_keywords(worksheet, row, col, text, *, keyword_pattern=None,
                       match_format=None, default_format=None):
    """
    Write 'text' into one worksheet cell with every keyword match coloured.

    The text is split around each regex match and written with XlsxWriter's
    write_rich_string(): matched substrings use the highlight format,
    everything else uses the default format.

    Parameters
    ----------
    worksheet : XlsxWriter worksheet (any object with ``write`` and
        ``write_rich_string`` methods taking the same arguments).
    row, col : zero-based cell coordinates.
    text : str
        The cell content to search and write.
    keyword_pattern : compiled regex, optional
        Pattern whose matches are highlighted. Defaults to the notebook-level
        ``pattern``.
    match_format : format, optional
        Format for matched substrings. Defaults to the notebook-level
        ``red_format``.
    default_format : format, optional
        Format for unmatched text. Defaults to the notebook-level
        ``black_format``.

    Returns
    -------
    None. The cell is written as a side effect.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so existing four-argument calls behave as before.
    if keyword_pattern is None:
        keyword_pattern = pattern
    if match_format is None:
        match_format = red_format
    if default_format is None:
        default_format = black_format

    # Alternating (format, string) tokens in the order
    # write_rich_string() expects them:
    #   default_format, 'text before first match',
    #   match_format,   'matched substring',
    #   default_format, 'text between matches', ...
    rich_segments = []
    last_end = 0

    for match in keyword_pattern.finditer(text):
        start, end = match.span()

        # A zero-width match has nothing to colour, and this function never
        # emits an empty string fragment.
        if start == end:
            continue

        # Text before this match
        if start > last_end:
            rich_segments += [default_format, text[last_end:start]]

        # The matched substring itself
        rich_segments += [match_format, text[start:end]]
        last_end = end

    if not rich_segments:
        # No matches, just write normally
        worksheet.write(row, col, text, default_format)
        return

    # Text after the last match
    if last_end < len(text):
        rich_segments += [default_format, text[last_end:]]

    if len(rich_segments) == 2:
        # The whole text is a single match. write_rich_string() needs more
        # than one format/string pair and would otherwise ignore the input,
        # leaving the cell empty, so write a plain cell in the highlight
        # format instead.
        worksheet.write(row, col, text, match_format)
        return

    # Now write the assembled rich string into the cell
    worksheet.write_rich_string(row, col, *rich_segments)

### Writing the data

In [None]:
# Write data rows. Worksheet row 0 holds the header, so data starts at row 1.
# Only string cells in these columns get keyword highlighting.
highlight_columns = {"title", "abstract"}

# itertuples() yields each row as a plain tuple, which avoids building a
# pandas Series per row the way df.iloc[row_idx] does.
for row_idx, row_values in enumerate(df.itertuples(index=False, name=None), start=1):
    for col_idx, value in enumerate(row_values):
        col_name = df.columns[col_idx]

        if isinstance(value, str) and col_name in highlight_columns:
            # Apply partial substring highlighting
            highlight_keywords(worksheet, row_idx, col_idx, value)
        elif pd.isna(value):
            # Missing value (NaN / NaT / None): write an empty formatted cell.
            # XlsxWriter raises on NaN numbers unless the workbook was created
            # with the 'nan_inf_to_errors' option.
            worksheet.write_blank(row_idx, col_idx, None, black_format)
        else:
            # Write other columns normally
            worksheet.write(row_idx, col_idx, value, black_format)

# Close through the pandas writer (this actually writes the file): it closes
# the underlying workbook and also releases any file handle pandas opened.
writer.close()