In [10]:
import pandas as pd
from typing import List

df = pd.read_csv("prod_tags_with_cost.csv")

In [32]:
def filter_by_any_tags(df: pd.DataFrame, tag_names: List[str]) -> pd.DataFrame:
    """
    Filter rows in a DataFrame where any of the specified tags are present in the 'tags' column.
    """
    if 'tags' not in df.columns:
        raise ValueError("DataFrame must contain a 'tags' column")

    # Ensure that tags column contains lists or similar iterable structures
    def contains_any_tag(tags: List[str], tag_names: List[str]) -> bool:
        return any(tag in tags for tag in tag_names)
    
    return df[df['tags'].apply(lambda x: contains_any_tag(x, tag_names))]

projects = {
    "scanner": ["scanner"],
    "auto_insights": ["auto_insights", "auto-insights", "autoinsights"],
    "whispers": ["rnd_whispers", "whispers"],
    "GRA": ["GRA"],
    "ci_platform": ["ci_platform"],
    "ailybot": ["ailybot"],
    "rnd_reputation": ["Reputation"],
    "anonymizer": ["anonymizer"],
    "fin_reports": ["investor_reports", "fin_annual_reports"],
    "google_agent": ["google_agent"],
}

df_scanner = filter_by_any_tags(df, projects["scanner"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_scanner["tags"])]

df_auto_insights = filter_by_any_tags(df, projects["auto_insights"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_auto_insights["tags"])]

df_rnd_whispers = filter_by_any_tags(df, projects["whispers"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_rnd_whispers["tags"])]

df_gra = filter_by_any_tags(df, projects["GRA"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_gra["tags"])]

df_ci_platform = filter_by_any_tags(df, projects["ci_platform"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_ci_platform["tags"])]

df_ailybot = filter_by_any_tags(df, projects["ailybot"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_ailybot["tags"])]

df_rnd_reputation = filter_by_any_tags(df, projects["rnd_reputation"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_rnd_reputation["tags"])]

df_anonymizer = filter_by_any_tags(df, projects["anonymizer"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_anonymizer["tags"])]

df_anonymizer = filter_by_any_tags(df, projects["anonymizer"]).sort_values(["total_cost"], ascending=False)
df = df[~df["tags"].isin(df_anonymizer["tags"])]

In [33]:
df

Unnamed: 0,tags,total_cost
11,"('prod',)",98.28634
28,"('investor_reports', 'prod', 'validation_conte...",40.56079
33,"('prod', 'sales_opportunities_recomms')",36.14678
44,"('prod', 'recommendations')",19.967635
48,"('ppl_recommendations_sanofi', 'prod')",16.237595
49,"('brain_example_langfuse', 'prod')",15.617334
55,"('ppl_rto_sanofi', 'prod')",11.812798
58,"('prod', 'reels_audio')",10.202944
62,"('prod', 'rnd_site_recommender', 'rnd_whatif')",9.740258
75,"('prod', 'pos-dataset-labelling', 'publication...",5.725896
