In [3]:
import os
import sys
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add the parent directory to sys.path so packages next to the notebook folder
# (e.g. src) can be imported.
# NOTE(review): assumes the kernel's working directory is the notebook's own folder.
notebook_dir = os.getcwd()  # Current working directory
parent_dir = os.path.dirname(notebook_dir)
if parent_dir not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(parent_dir)

PROJECT_ROOT = parent_dir

# The directory in file system is "agent-reflectrion", not "agent-reflection"
output_dir_path = os.path.join(parent_dir, "run_artifacts", "analyst_agent", "20250829_143213")

# Pick the newest prediction file of the run. `file_name` stays None when the
# directory or a matching file is missing.
file_name = None
if os.path.exists(output_dir_path):
    # Names carry a timestamp, so lexicographic order is chronological order.
    result_files = sorted(
        name for name in os.listdir(output_dir_path) if name.startswith("final_meta_analysis")
    )
    if result_files:
        file_name = result_files[-1]  # Latest file
        print(f"Using latest result file: {file_name}")
    else:
        print("File starting with 'final_meta_analysis' not found.")
else:
    print(f"Directory '{output_dir_path}' does not exist.")

# Search for the annotated file(s). Every match is reported; the last one in
# sorted order is kept, so the choice does not depend on the arbitrary order
# in which os.listdir returns entries.
eval_dir_path = os.path.join(PROJECT_ROOT, "data", "annotated_data_csvs")
found_annotated_files = None  # holds a single file name despite the plural; None when nothing was found
if os.path.exists(eval_dir_path):
    for name in sorted(os.listdir(eval_dir_path)):
        if name.startswith("vali_data"):
            print(f"Found annotated file: {name}")
            found_annotated_files = name
    if not found_annotated_files:
        print("No files starting with 'vali_data' found.")
else:
    print(f"Directory '{eval_dir_path}' does not exist.")

Using latest result file: final_meta_analysis_20250829_144751.csv
Found annotated file: vali_data_manully_extracted_V0.csv


In [4]:
import pandas as pd

# Load the predictions of the selected run and the manually annotated reference table.
df_predicted = pd.read_csv(os.path.join(output_dir_path, file_name))
df_annotated = pd.read_csv(os.path.join(eval_dir_path, found_annotated_files))

# Drop the paper-title column so both frames are compared on the same columns.
df_annotated = df_annotated.drop(columns=['Title_of paper'])

# Show both schemas for a visual check.
print(df_predicted.columns)
print(df_annotated.columns)

# True only when both frames have the same columns in the same order.
# Comparing plain lists also works when the column counts differ, whereas an
# element-wise `Index == Index` raises ValueError on a length mismatch.
list(df_predicted.columns) == list(df_annotated.columns)

Index(['Crop Type', 'Crop Yield', 'Crop Yield Unit', 'Climate Drivers',
       'Climate Drivers Value', 'Climate Drivers Unit', 'Experimental Design',
       'Location', 'Time', 'Source in paper'],
      dtype='object')
Index(['Crop Type', 'Crop Yield', 'Crop Yield Unit', 'Climate Drivers',
       'Climate Drivers Value', 'Climate Drivers Unit', 'Experimental Design',
       'Location', 'Time', 'Source in paper'],
      dtype='object')


True

In [5]:
def compare_values(val1, val2):
    """Tell whether two cell values are equal.

    Both values are compared numerically when each converts to ``float``;
    otherwise they are compared as stripped, lower-cased strings.

    Two values that both convert to NaN (e.g. a missing cell, ``"nan"`` or
    ``"NAN"``) count as equal. A plain ``==`` would report them as different
    because NaN never equals itself.

    Args:
        val1: First value (number, string, None, ...).
        val2: Second value.

    Returns:
        bool: True when the values are considered equal.
    """
    import math

    try:
        num1, num2 = float(val1), float(val2)
    except (ValueError, TypeError):
        # At least one side is not numeric: fall back to a text comparison.
        return str(val1).strip().lower() == str(val2).strip().lower()

    if math.isnan(num1) and math.isnan(num2):
        return True  # both missing
    return num1 == num2


def find_similar_rows(row_to_find: pd.Series, df_annotated: pd.DataFrame):
    """Collect the rows of a frame that agree with a reference row.

    A row qualifies when both its 'Crop Type' and its 'Crop Yield' are equal
    to those of ``row_to_find`` according to ``compare_values``.

    Args:
        row_to_find: Reference row providing 'Crop Type' and 'Crop Yield'.
        df_annotated: Frame that is searched. Despite its name it is simply
            the frame being scanned.

    Returns:
        list[pd.Series]: Matching rows in frame order (empty when none match).
    """
    def _same(candidate: pd.Series, column: str) -> bool:
        return compare_values(candidate[column], row_to_find[column])

    hits = []
    for _, candidate in df_annotated.iterrows():
        # Both columns are always evaluated, crop type first, then crop yield.
        checks = [_same(candidate, column) for column in ('Crop Type', 'Crop Yield')]
        if all(checks):
            hits.append(candidate)
    return hits

# Spot check: list the predicted rows whose crop type and yield match annotated row 2.
find_similar_rows(df_annotated.iloc[2], df_predicted)

[Crop Type                                                            maize
 Crop Yield                                                           -11.0
 Crop Yield Unit                                                          %
 Climate Drivers                                   atmospheric temperatures
 Climate Drivers Value                                                    2
 Climate Drivers Unit                                                    °C
 Experimental Design                                    simulation analysis
 Location                                                              UIGP
 Time                                                                   NaN
 Source in paper          The mean baseline yield of rainfed maize crop ...
 Name: 1, dtype: object]

In [10]:
import pandas as pd
from pretty_prompt_compare import PrettyCompare

def pretty_compare_rows(row_to_find: pd.Series, row_annotated: pd.Series):
    """Print a labelled comparison of the climate-driver fields of two rows.

    For each of 'Climate Drivers', 'Climate Drivers Value' and
    'Climate Drivers Unit' a heading is printed and the two cell values are
    handed to ``PrettyCompare`` through its infix ``|pretty_compare|`` form.
    The expression result is discarded.
    NOTE(review): presumably PrettyCompare renders the diff as a side effect.

    Every value is converted with ``str`` first, so a missing cell (float NaN)
    is compared as text just like the other fields.

    Args:
        row_to_find: Row whose values appear on the left-hand side.
        row_annotated: Row whose values appear on the right-hand side.
    """
    pretty_compare = PrettyCompare(compare_response=True)

    for column in ('Climate Drivers', 'Climate Drivers Value', 'Climate Drivers Unit'):
        print(f"Compare {column}:")
        str(row_to_find[column]) |pretty_compare| str(row_annotated[column])

# Compare annotated row `index` with its first matching predicted row.
index = 2
row_matched = find_similar_rows(df_annotated.iloc[index], df_predicted)
if row_matched:
    pretty_compare_rows(df_annotated.iloc[index], row_matched[0])
else:
    # Without this guard an unmatched row would fail with IndexError on row_matched[0].
    print(f"No predicted row matches annotated row {index}.")

Compare Climate Drivers:


Compare Climate Drivers Value:


Compare Climate Drivers Unit:


In [6]:
from IPython.display import HTML
import pandas as pd

def create_visual_comparison(df_annotated: pd.DataFrame, df_predicted: pd.DataFrame):
    """Render a colour-coded HTML grid comparing annotated and predicted rows.

    Each row of ``df_annotated`` is paired with the first row of
    ``df_predicted`` returned by ``find_similar_rows`` that has not already
    been paired with an earlier row. Cells are then coloured per column:

    * red    - no predicted row could be paired with the annotated row
    * green  - the two values are equal according to ``compare_values``
    * yellow - the values differ; both are shown ("A:" annotated, "P:" predicted)

    The grid has one column per column of ``df_annotated``, and all cell text
    is HTML-escaped before it is inserted into the markup.

    Args:
        df_annotated: Reference table; defines the rows and columns of the grid.
        df_predicted: Table searched for matching rows. It must contain the
            columns of ``df_annotated``.

    Returns:
        IPython.display.HTML: The rendered grid.
    """
    # Local import under an alias-free name: `html` itself is avoided as a
    # variable so the stdlib module is not shadowed.
    from html import escape

    # One grid column per data column (at least one so the CSS stays valid).
    n_columns = max(1, len(df_annotated.columns))

    # CSS styles for the grid
    styles = """
    <style>
        .comparison-grid {
            display: grid;
            grid-template-columns: repeat(N_COLUMNS, 1fr);
            gap: 4px;
            margin: 20px;
            font-family: Arial, sans-serif;
        }
        .header {
            background-color: #333;
            color: white;
            padding: 8px;
            font-weight: bold;
            text-align: center;
        }
        .cell {
            padding: 8px;
            border-radius: 4px;
            min-height: 50px;
            word-wrap: break-word;
            font-size: 12px;
        }
        .no-match { background-color: #fa3434; }
        .partial-match { background-color: #f7f73b; }
        .match { background-color: #3cc73c; }
        .value-pair {
            display: flex;
            flex-direction: column;
            gap: 4px;
        }
        .annotated-value { color: #666; }
        .predicted-value { color: #000; }
    </style>
    """.replace("N_COLUMNS", str(n_columns))

    # Fragments are collected in a list and joined once at the end.
    parts = [styles, '<div class="comparison-grid">']

    # Header row
    parts.extend(
        f'<div class="header">{escape(str(col))}</div>' for col in df_annotated.columns
    )

    # Index labels of predicted rows that are already paired, so that each
    # predicted row is used at most once.
    predicted_used_indices = set()

    for _, annotated_row in df_annotated.iterrows():
        candidates = [
            row
            for row in find_similar_rows(annotated_row, df_predicted)
            if row.name not in predicted_used_indices
        ]
        predicted_row = candidates[0] if candidates else None
        if predicted_row is not None:
            predicted_used_indices.add(predicted_row.name)

        for col in df_annotated.columns:
            annotated_val = str(annotated_row[col])
            annotated_html = escape(annotated_val)

            if predicted_row is None:
                # No match found - red background
                parts.append(f'<div class="cell no-match">{annotated_html}</div>')
                continue

            predicted_val = str(predicted_row[col])
            if compare_values(annotated_val, predicted_val):
                # Perfect match - green background
                parts.append(f'<div class="cell match">{annotated_html}</div>')
            else:
                # Partial match - yellow background with both values
                parts.append(
                    '<div class="cell partial-match">'
                    '<div class="value-pair">'
                    f'<span class="annotated-value">A: {annotated_html}</span>'
                    f'<span class="predicted-value">P: {escape(predicted_val)}</span>'
                    '</div>'
                    '</div>'
                )

    parts.append('</div>')
    return HTML("".join(parts))

# Create and display the visual comparison.
# Arguments follow the signature (annotated first, predicted second) so that
# the "A:" / "P:" labels in the grid refer to the right table.
visual_comparison = create_visual_comparison(df_annotated, df_predicted)
visual_comparison


In [None]:
def create_comparison_df(df_annotated: pd.DataFrame, df_predicted: pd.DataFrame):
    """Build a table that shows, per annotated row, how the prediction differs.

    Each row of ``df_annotated`` is paired with the first row of
    ``df_predicted`` returned by ``find_similar_rows`` that has not already
    been paired with an earlier annotated row (the same pairing rule as
    ``create_visual_comparison``). For every column the result holds:

    * the annotated value, when no predicted row was paired or when both
      values have the same string form;
    * ``"<annotated> -> <predicted>"`` when the values differ.

    Args:
        df_annotated: Reference table; defines the rows and columns of the result.
        df_predicted: Table searched for matching rows. It must contain the
            columns of ``df_annotated``.

    Returns:
        pd.DataFrame: One row per annotated row, with the columns of
        ``df_annotated``.
    """
    comparison_data = []
    # Index labels of predicted rows that are already paired.
    predicted_used_indices = set()

    for _, annotated_row in df_annotated.iterrows():
        # Candidate matches in the predicted frame that are still unused.
        candidates = [
            row
            for row in find_similar_rows(annotated_row, df_predicted)
            if row.name not in predicted_used_indices
        ]
        predicted_row = candidates[0] if candidates else None
        if predicted_row is not None:
            predicted_used_indices.add(predicted_row.name)

        result_row = {}
        for col in df_annotated.columns:
            annotated_val = annotated_row[col]
            if predicted_row is None:
                # No match was found: just show the annotated value.
                result_row[col] = annotated_val
                continue

            predicted_val = predicted_row[col]
            if str(annotated_val) == str(predicted_val):
                result_row[col] = annotated_val
            else:
                # Values differ: show both, annotated first.
                result_row[col] = f"{annotated_val} -> {predicted_val}"
        comparison_data.append(result_row)

    return pd.DataFrame(comparison_data)

def style_comparison_df(df_comparison: pd.DataFrame):
    """
    Styles the comparison dataframe to highlight differences.
    """
    def highlight_diff(val):
        """Highlights cells containing '->' to indicate a difference."""
        is_different = isinstance(val, str) and '->' in val
        return 'background-color: #ffcccb' if is_different else ''
    
    # Apply the styling function to the dataframe
    return df_comparison.style.applymap(highlight_diff)

# Create the comparison dataframe from the annotated and predicted tables
df_comparison = create_comparison_df(df_annotated, df_predicted)

# Style the dataframe (cells containing '->' are highlighted) and display it
# as the cell output
styled_df = style_comparison_df(df_comparison)
styled_df

  return df_comparison.style.applymap(highlight_diff)


Unnamed: 0,Crop Type,Crop Yield,Crop Yield Unit,Climate Drivers,Climate Drivers Value,Climate Drivers Unit,Experimental Design,Location,Time,Source in paper
0,maize,2,tons/ha,NAN,NAN,NAN,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
1,maize,-7,%,Atmospheric temperature,1,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
2,maize,-11,%,Atmospheric temperature,2,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
3,maize,-15,%,Atmospheric temperature,3,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
4,maize,-22,%,Atmospheric temperature,4,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
5,maize,-33,%,Atmospheric temperature,5,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,UIGP,baseline period-1970 to 1995,"1. The mean baseline yield of rainfed maize crop is about 2M gh a−1in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to base line yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
6,maize,-8–-35,%,Atmospheric temperature,+1–+5,°C,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,MIGP,baseline period-1970 to 1995,"1. In MIGP region, yield reduction of about 8 –35% with 1 –5°C rise in atmospheric temperature is projected. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
7,maize,-10,%,Atmospheric temperature,1,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,SP,baseline period-1970 to 1995,"1. The SP region also projected to experience adverse impact with −10,−15,−23,−27 and −35% reductions from the baseline yield levels at each 1°C rise in temperature. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
8,maize,-15,%,Atmospheric temperature,2,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,SP,baseline period-1970 to 1995,"1. The SP region also projected to experience adverse impact with −10,−15,−23,−27 and −35% reductions from the baseline yield levels at each 1°C rise in temperature. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."
9,maize,-23,%,Atmospheric temperature,3,℃,InfoCrop-MAIZE model simulations for past 25 years (1970 –1995) for different locations representing three agro-climatic zones,SP,baseline period-1970 to 1995,"1. The SP region also projected to experience adverse impact with −10,−15,−23,−27 and −35% reductions from the baseline yield levels at each 1°C rise in temperature. 2. Using validated InfoCrop-MAIZE model, simulations were carried out for past 25 years (1970–1995) for different locations representing three agro-climatic zones."


In [28]:
# Display the predictions frame for manual inspection.
df_predicted

Unnamed: 0,Crop Type,Crop Yield,Crop Yield Unit,Climate Drivers,Climate Drivers Value,Climate Drivers Unit,Experimental Design,Location,Time,Source in paper
0,maize,-7.0,%,temperature,1.0,°C,simulation analysis,UIGP,baseline,The mean baseline yield of rainfed maize crop ...
1,maize,-11.0,%,temperature,2.0,°C,simulation analysis,UIGP,baseline,The mean baseline yield of rainfed maize crop ...
2,maize,-15.0,%,temperature,3.0,°C,simulation analysis,UIGP,baseline,The mean baseline yield of rainfed maize crop ...
3,maize,-22.0,%,temperature,4.0,°C,simulation analysis,UIGP,baseline,The mean baseline yield of rainfed maize crop ...
4,maize,-33.0,%,temperature,5.0,°C,simulation analysis,UIGP,baseline,The mean baseline yield of rainfed maize crop ...
5,maize,-8.0,%,temperature,1.0,°C,simulation analysis,MIGP,,"In MIGP region, yield reduction of about 8–35%..."
6,maize,-35.0,%,temperature,5.0,°C,simulation analysis,MIGP,,"In MIGP region, yield reduction of about 8–35%..."
7,maize,-10.0,%,temperature,1.0,°C,simulation analysis,SP,baseline,The SP region is also projected to experience ...
8,maize,-15.0,%,temperature,1.0,°C,simulation analysis,SP,baseline,The SP region is also projected to experience ...
9,maize,-23.0,%,temperature,1.0,°C,simulation analysis,SP,baseline,The SP region is also projected to experience ...
