In [8]:
import pandas as pd
import ast
from pathlib import Path

def parse_confidence(cell):
    """
    Convert a stringified list like "[0.75, 0.85, '0.80', ...]" into a list of floats.

    The cell is evaluated with ``ast.literal_eval``. Each resulting entry is
    converted with ``float()``; a string entry that fails conversion (for
    example "2. 0.85") gets a second attempt using only its last
    whitespace-separated token, with commas removed. Entries that still
    cannot be converted are skipped.

    A cell that evaluates to a single scalar or a single string (for example
    "0.85" or "'0.85'") is treated as a one-element list.

    Returns an empty list when the cell cannot be evaluated as a Python
    literal at all (including non-string cells such as NaN or None).
    """
    try:
        raw = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Everything literal_eval is documented to raise on malformed input.
        return []

    # A bare literal is not a collection of scores: iterating a float raises
    # TypeError and iterating a string would walk it character by character.
    if isinstance(raw, (str, bytes)) or not hasattr(raw, "__iter__"):
        raw = [raw]

    floats = []
    for x in raw:
        try:
            floats.append(float(x))
        except (TypeError, ValueError, OverflowError):
            # Only strings get the "take the last token" rescue attempt.
            if not isinstance(x, str):
                continue
            # If it's a malformed string like "2. 0.85", try taking the last token
            parts = x.replace(',', '').split()
            if not parts:
                continue
            try:
                floats.append(float(parts[-1]))
            except ValueError:
                continue
    return floats

def filter_high_confidence(input_csv, output_csv, threshold=0.8):
    """
    Keep the rows of a CSV whose mean confidence is strictly above `threshold`.

    Reads `input_csv`, parses its 'confidence' column with `parse_confidence`,
    stores the per-row mean in a new 'avg_confidence' column (0 when nothing
    could be parsed), and writes the surviving rows to `output_csv` sorted by
    'avg_confidence' in descending order. The written file keeps the
    'avg_confidence' column but not the intermediate parsed list. A one-line
    summary is printed once the file has been written.
    """

    def _mean_or_zero(values):
        # An empty list means the cell had no usable scores.
        if not values:
            return 0
        return sum(values) / len(values)

    frame = pd.read_csv(input_csv)

    # Intermediate column of parsed scores, then its per-row mean.
    frame['conf_list'] = frame['confidence'].apply(parse_confidence)
    frame['avg_confidence'] = frame['conf_list'].apply(_mean_or_zero)

    # Strictly-greater filter, best rows first, parsed list removed.
    above = frame['avg_confidence'] > threshold
    selected = (
        frame.loc[above]
        .sort_values(by='avg_confidence', ascending=False)
        .drop(columns=['conf_list'])
    )

    selected.to_csv(output_csv, index=False)
    print(f"Filtered {len(selected)} tweets with avg_confidence > {threshold*100:.0f}%")
    
if __name__ == "__main__":
    # Tag embedded in both the input and the output file names.
    models = "A4M_A4M2_A4MT_A4M2T"
    # Rows are kept only when their mean confidence is strictly above this value.
    th = 0.80

    # Both files are resolved relative to the current working directory.
    data_dir = Path(".")
    infile = data_dir.joinpath(f"common_{models}.csv")
    outfile = data_dir.joinpath(f"filtered_high_confidence_{models}_{th}.csv")

    filter_high_confidence(input_csv=infile, output_csv=outfile, threshold=th)


Filtered 78 tweets with avg_confidence > 80%
