In [0]:
%pip install openpyxl

import ast

import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

## Read in Data

In [0]:
# Load the raw survey export; the cells below derive their working frames from raw_df.
raw_df = pd.read_csv(r'/Volumes/prd_mega/sdgreg25/vdgreg25/Documents/MA Survey/MA Survey Results.csv')

## Process Data

In [0]:
def _first_join_value(cell):
    """Return the text before the first '] ,' of a raw cell, without '[' and surrounding whitespace.

    Missing values become '' (the marker the rest of the notebook uses for
    "no answer"); other non-string values are converted with str() first so a
    numeric column cannot crash the clean-up.
    """
    if not isinstance(cell, str):
        if pd.isna(cell):
            return ''
        cell = str(cell)
    return cell.split('] ,')[0].replace('[', '').strip()


# Keep only the joined answer columns plus the case identifier.
join_cols = [col for col in raw_df.columns.to_list() if 'join_' in col] + ['caseid']
join_df = raw_df[join_cols]

# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# use whichever the installed pandas provides.
if hasattr(pd.DataFrame, 'map'):
    df = join_df.map(_first_join_value)
else:
    df = join_df.applymap(_first_join_value)
df.columns = [col.replace('join_', '') for col in df.columns]

# NOTE(review): this step was labelled "remove test cases", but no rows are
# dropped here — only the index is reset. The background summary cell slices
# raw_df[7:]; confirm whether the same rows should be excluded from df.
df = df.reset_index(drop=True)

## Separate the MIS systems that are integrated with e-Cohesion from those that are not

In [0]:
import numpy as np

# Every 'mis_financial_*' column has a joint-MIS twin named 'mis_*'.
financial_mis_cols = [col for col in df.columns if 'mis_financial_' in col]

# Column whose answer says how the respondent's systems are organised.
logic_col = 'single_or_separate_system'
mis_only_cols = []
for col in financial_mis_cols:
    joint_mis_col = col.replace('financial_', '')
    one_sys_col = col.replace('financial_', 'all_')
    mis_only_col = joint_mis_col.replace('mis_', 'mis_only_no_e_cohesion_')
    mis_only_cols.append(mis_only_col)

    joint_answers = df[joint_mis_col]
    # Joint-MIS answer kept only where the MIS is separate from e-Cohesion; '' elsewhere.
    separate_mask = df[logic_col] == 'one_mis_separate_e_cohesion'
    df[mis_only_col] = joint_answers.where(separate_mask, other='')
    # Joint-MIS answer kept only where everything is one integrated system; '' elsewhere.
    integrated_mask = df[logic_col] == 'all_integrated'
    df[one_sys_col] = joint_answers.where(integrated_mask, other='')



## Functions to Display Data

In [0]:
# Survey definition workbook; its 'choices' sheet is read into choice_df, which the
# helpers below query through the 'list_name', 'value' and 'label' columns.
survey_path = r'/Volumes/prd_mega/sdgreg25/vdgreg25/Documents/temp/MA_Survey_Final.xlsx'

choice_df = pd.read_excel(survey_path, sheet_name='choices', engine='openpyxl')

def find_list_name_for_values(values: list):
    """Return the list_name(s) in choice_df whose values overlap most with ``values``.

    Both sides are compared as strings. All list names tied for the largest
    overlap are returned, so the result may hold more than one name.
    Side effect: choice_df['value'] is converted to str in place.
    """
    choice_df['value'] = choice_df['value'].astype(str)
    wanted = {str(v) for v in values}

    values_by_list = choice_df.groupby('list_name')['value'].apply(set)
    hits = values_by_list.apply(lambda members: len(members & wanted))

    best = hits.max()
    return hits[hits == best].index.tolist()

def get_values_dict(choices: list):
    """Build a {choice value: display label} mapping for ``choices``.

    When exactly one choice list matches, labels come from choice_df and the
    empty string is mapped to 'No Answer'. Otherwise a warning is printed and
    each choice is labelled with its sentence-cased form.
    """
    matches = find_list_name_for_values(choices)

    if len(matches) != 1:
        # Ambiguous or missing list: fall back to labels derived from the raw values.
        dic = {choice: underscore_to_sentence(choice) for choice in choices}
        print('********* could not find unique list_name!!! **********')
        return dic

    rows = choice_df[choice_df['list_name'] == matches[0]]
    dic = dict(zip(rows['value'].astype(str), rows['label']))
    dic[''] = 'No Answer'
    return dic
    
def shorten_choices(values_dict, choices_dict):
    """Return values_dict with entries overridden or extended by choices_dict.

    Keys keep the order of values_dict, followed by keys that only exist in
    choices_dict. Neither input is modified.
    """
    return {**values_dict, **choices_dict}

In [0]:
def create_long_df(df, systems):
    """Stack the answer columns in ``systems`` into one long System/Barrier frame.

    ``systems`` maps a display label to a column of space-separated answers.
    Each answer token becomes one row; empty answers are labelled 'No Answer'.
    Returns ``(long_df, has_any_multiple)`` where the flag is True if any cell
    in any column held more than one token.
    """
    pieces = []
    has_any_multiple = False

    for system_label, col in systems.items():
        tokens = df[col].str.split(' ')

        # A cell with more than one token means the question allowed multiple selections.
        if tokens.apply(lambda t: isinstance(t, list) and len(t) > 1).any():
            has_any_multiple = True

        piece = tokens.explode().to_frame(name='Barrier')
        piece.insert(0, 'System', system_label)
        piece = piece.replace('', 'No Answer', regex=True)
        pieces.append(piece[['System', 'Barrier']])

    return pd.concat(pieces), has_any_multiple

def create_counts(long_df):
    """Count rows per (System, Barrier) pair and return them with a 'count' column."""
    pair_sizes = long_df.groupby(['System', 'Barrier']).size()
    counts = pair_sizes.reset_index(name='count')
    return counts

def create_percents(counts):
    """Add a 'percent' column: each row's share of its System's total count.

    The column is written onto ``counts`` itself, which is also returned.
    """
    per_system = counts.groupby('System')['count']
    share = per_system.transform(lambda grp: 100 * grp / grp.sum())
    counts['percent'] = share
    return counts

def underscore_to_sentence(s):
    """Turn 'some_value' into 'Some value' (underscores to spaces, first letter upper, rest lower)."""
    words = s.split('_')
    return ' '.join(words).capitalize()

def create_color_map(counts):
    """Map each sentence-cased 'Barrier' label to a palette colour.

    Labels are taken in order of first appearance; only as many labels as the
    palette has entries receive a colour. 'No answer' is always grey.
    """
    palette = (
        "#E03C31",  # Red (strong attention grabber)
        "#002C77",  # World Bank Blue (deep, highly legible)
        "#F17C36",  # Orange (high visual energy)
        "#A64FA0",  # Purple (vivid, distinct from warm/cool hues)
        "#1B9E77",  # Teal (medium-dark, clear)
        "#60BD68",  # Green (medium saturation)
        "#D95F02",  # Burnt Orange (distinct but softer than bright orange)
        "#FFC845",  # Yellow/Gold (high contrast on dark bg, weaker on white)
        "#4CA6D7",  # Sky Blue (clear but less intense)
        "#8DB600",  # Lime Green (lower contrast on white, more muted)
    )

    labels = counts['Barrier'].drop_duplicates().to_list()

    # zip() stops at the shorter sequence, so labels beyond the palette get no entry.
    # The guard compares the raw label; create_long_df emits 'No Answer' (capital A),
    # which is not caught here but is overwritten by the grey entry below.
    color_discrete_map = {
        underscore_to_sentence(raw): shade
        for raw, shade in zip(labels, palette)
        if raw != 'No answer'
    }
    color_discrete_map['No answer'] = '#b0b0b0'

    return color_discrete_map

def rename_columns(counts, legend=''):
    """Rename 'System' to 'Process' and 'Barrier' to ``legend`` in place; return ``counts``."""
    new_names = {'System': 'Process', 'Barrier': legend}
    counts.rename(columns=new_names, inplace=True)
    return counts

def sentence_case_column(counts, col):
    """Sentence-case every value of ``counts[col]`` in place and return ``counts``."""
    cased = counts[col].apply(underscore_to_sentence)
    counts[col] = cased
    return counts

In [0]:
def process_others(df, col1, col2, others):
    """Return a copy of ``df[[col1, col2]]`` with col1 rebuilt row by row by process_row.

    col1 is first split on spaces into a token list; process_row then produces
    the final space-separated string for each row.
    """
    sub_df = df.loc[:, [col1, col2]].copy()
    sub_df[col1] = sub_df[col1].str.split(' ')
    sub_df[col1] = sub_df.apply(process_row, axis=1, args=(col1, col2, others))
    return sub_df

def process_row(row, col1, col2, others):
    """Rebuild one row's answer string, swapping 'other'/blank tokens for a recoded value.

    ``row[col1]`` is a list of tokens. Tokens equal to 'other', '' or None are
    replaced by ``others.get(row[col2], '')`` (a string or list of strings);
    an empty token list is treated as a single blank token. Blank results are
    dropped and the remainder is joined with spaces.
    """
    selected = row[col1]
    # An empty selection behaves like one blank token: look up the free text once.
    candidates = selected if selected else [None]

    collected = []
    for item in candidates:
        resolved = others.get(row[col2], '') if item in ('other', '', None) else item
        if isinstance(resolved, list):
            collected.extend(resolved)
        elif isinstance(resolved, str) and resolved.strip():
            collected.append(resolved)

    return ' '.join(str(piece) for piece in collected if piece)


In [0]:
def stacked_bar(df,
    col,
    title,
    legend,
    color_discrete_map,
    has_any_multiple=False,
    orientation='h',
    height=300,
    width=600,
    barwidth=0.25,
    n=None,
    empty_labels=None,
    barmode='stack',
    category_orders = None,
    legend_format=None,
    xtitle='',
    ytitle='',
    note_size=10,
    note_y=-.1,
    note_x =.85
):
    """Draw a bar chart of ``df[col]`` per 'Process', coloured by the ``legend`` column.

    Parameters
    ----------
    df : frame with 'Process', ``legend`` and ``col`` columns. When
        ``empty_labels`` is given, its 'count' column is zeroed in place for
        those processes.
    col : value column to plot; 'percent' values are labelled like '42%'.
    title, xtitle, ytitle : figure and axis titles.
    legend : name of the colour/legend column.
    color_discrete_map : label -> colour mapping passed to plotly.
    has_any_multiple : add a "multiple answers" footnote above the plot.
    orientation : 'h' puts values on x; anything else puts 'Process' on x.
    height, width, barwidth, barmode : layout settings.
    n : if not None, a 'Sample Size = n' annotation is added at (note_x, note_y).
    empty_labels : processes whose bars are zeroed; also hides the "No answer" legend entry.
    category_orders : explicit legend order; defaults to sorted unique legend values.
    legend_format : optional plotly legend dict.
    note_size : font size of the sample-size annotation.

    Returns the plotly figure.
    """
    x_axis, y_axis = (col, 'Process') if orientation == 'h' else ('Process', col)

    if category_orders is None:
        legend_order = sorted(df[legend].dropna().unique().tolist())
    else:
        legend_order = category_orders
    category_orders = {legend: legend_order}

    # Bar labels are taken before any zeroing below.
    text = df[col].map('{:.0f}%'.format) if col == 'percent' else df[col]

    if empty_labels:
        df.loc[df['Process'].isin(empty_labels), 'count'] = 0

    fig = px.bar(
        df,
        x=x_axis,
        y=y_axis,
        color=legend,
        orientation=orientation,
        text=text,
        category_orders=category_orders,
        color_discrete_map=color_discrete_map,
        title=title
    )

    if empty_labels:
        for trace in fig.data:
            if trace.name == "No answer":
                trace.showlegend = False

    layout_settings = dict(
        barmode=barmode,
        plot_bgcolor='white',
        boxmode='group',
        paper_bgcolor='white',
        height=height,
        width=width,
        bargap=0.25,
        font=dict(size=16),
        margin=dict(t=80, b=80),
        yaxis_title=ytitle,
        xaxis_title=xtitle,
        legend_traceorder='grouped',
    )
    # Only override the legend layout when the caller supplied one.
    if legend_format is not None:
        layout_settings['legend'] = legend_format
    fig.update_layout(**layout_settings)

    fig.update_traces(
        textposition='inside',
        insidetextanchor='middle',
        textangle=0,
        width=barwidth
    )

    if has_any_multiple:
        fig.add_annotation(
            text="*respondents could have multiple answers",
            xref="paper",
            yref="paper",
            x=0,
            y=1.07,
            showarrow=False,
            font=dict(size=10, color="dimgray"),
        )

    if n is not None:
        fig.add_annotation(
            text=f'Sample Size = {n}',
            x=note_x,
            y=note_y,
            xref='paper',
            yref='paper',
            showarrow=False,
            font=dict(size=note_size),
            align='right'
        )

    return fig

In [0]:
def vertical_bar_chart(df, title, xtitle='', ytitle=None, has_any_multiple=False, mode='percent', height=700, width=600, n=None,note_x=0.5,note_y=-.5,note_text='',note_xanchor='left',note_yanchor='bottom'):
    """Show a vertical bar chart of answer counts or percentages.

    Parameters
    ----------
    df : frame indexed by 'answer' with 'count' and/or 'percent' columns.
    title, xtitle : figure and x-axis titles.
    ytitle : y-axis title; None selects '% of respondents' or
        'Number of respondents' depending on ``mode``.
    has_any_multiple : mention in the footnote that multiple answers were possible.
    mode : 'percent' or 'count' — the column plotted on the y axis.
    height, width : figure size.
    n : sample size shown in the footnote when not None.
    note_x, note_y, note_xanchor, note_yanchor : footnote position (paper coordinates).
    note_text : extra text appended to the footnote.

    Raises
    ------
    ValueError : if ``mode`` is neither 'percent' nor 'count'.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    plot_df = df.reset_index()
    if mode == 'percent':
        bar_text = plot_df[mode].map('{:.0f}%'.format)
        prfx = '%'
    elif mode == 'count':
        bar_text = plot_df[mode]
        prfx = 'Number'
    else:
        # Previously an unknown mode fell through and failed later on an unbound `fig`.
        raise ValueError(f"mode must be 'percent' or 'count', got {mode!r}")

    fig = px.bar(
        plot_df,
        x='answer',
        y=mode,
        text=bar_text,
        title=title
    )

    if ytitle is None:
        # Plain string: a trailing comma here used to turn the title into a tuple.
        ytitle = f"{prfx} of respondents"

    fig.update_traces(marker_color='steelblue', textposition='outside')
    fig.update_layout(
        yaxis_title=ytitle,
        xaxis_title=xtitle,
        xaxis_tickangle=-45,
        uniformtext_minsize=20,
        margin=dict(t=60, b=120),
        height=height,
        width=width,
        bargap=0.3,
        plot_bgcolor='white',
        font=dict(size=16, color='black'),
    )

    fig.update_xaxes(
        mirror=False,
        showgrid=True,
        ticks='outside',
        showline=True,
        linecolor='black'
    )

    fig.update_yaxes(
        showgrid=True,
        showline=False,
        showticklabels=False,
        gridcolor='lightgray',
        linecolor='black',
        ticks=''
    )

    # Build the footnote: "* (n=..)", optional multiple-answer remark, optional custom text.
    note = '*'
    if n is not None:
        note += f' (n={n})'
    if has_any_multiple:
        # The remark goes on its own line only when something precedes it.
        separator = '<br>' if note != '*' else ''
        note += f'{separator} respondents could have multiple answers;'
    if note_text != '':
        note += f' {note_text}'

    # A bare '*' means there is nothing to say, so no annotation is added.
    if note != '*':
        fig.add_annotation(
            text=note,
            xref="paper",
            yref="paper",
            x=note_x,
            y=note_y,
            xanchor=note_xanchor,
            yanchor=note_yanchor,
            showarrow=False,
            font=dict(size=15, color="#333"),
        )
    fig.show()

In [0]:
def bar_chart(df, col, title, xtitle='', ytitle='', mode='percent', no_show_no_answer=True, change_values_dict=None, width=600, height=700, note_x=0,note_y=-.5,note_text='',note_xanchor='left',note_yanchor='bottom',orientation='v'):
    """Plot the answer distribution of a space-separated choice column.

    Parameters
    ----------
    df, col : frame and column holding space-separated answer codes.
    title, xtitle, ytitle : chart titles.
    mode : 'percent' or 'count'.
    no_show_no_answer : drop rows whose answer is '' before counting.
    change_values_dict : optional label overrides applied on top of the survey labels.
    width, height : figure size.
    note_x, note_y, note_xanchor, note_yanchor : footnote placement for the
        vertical chart. They are not forwarded to the horizontal chart, which
        keeps its own placement defaults.
    note_text : extra footnote text (forwarded to both chart types).
    orientation : 'v' for vertical bars, 'h' for horizontal bars.

    Raises
    ------
    ValueError : if ``orientation`` is neither 'v' nor 'h'.
    """
    if orientation not in ('v', 'h'):
        raise ValueError(f"orientation must be 'v' or 'h', got {orientation!r}")

    if orientation == 'h':
        # horizontal_bar_chart works from the raw frame and column and does its own
        # counting, so hand it the original inputs (the old call passed the counts
        # table positionally plus an unsupported keyword and raised TypeError).
        horizontal_bar_chart(
            df, col, title, xtitle,
            mode=mode,
            no_show_no_answer=no_show_no_answer,
            change_values_dict=change_values_dict,
            width=width,
            height=height,
            note_text=note_text,
        )
        return

    if no_show_no_answer:
        df = df[df[col] != '']

    answers = df[col].str.split(' ')
    has_any_multiple = bool(answers.apply(lambda x: isinstance(x, list) and len(x) > 1).any())

    # One row per distinct answer code.
    counts = answers.explode().value_counts().to_frame(name='count')
    choices = [choice for choice in counts.index.to_list() if choice]
    value_dict = get_values_dict(choices)
    if change_values_dict is not None:
        value_dict = shorten_choices(value_dict, change_values_dict)

    counts.index.name = 'answer'
    counts = counts.rename(index=value_dict)
    # Percentages are relative to respondents, not to the number of selections.
    n_respondents = df[col].notna().sum()
    counts['percent'] = (counts['count'] / n_respondents) * 100

    vertical_bar_chart(counts, title, xtitle, ytitle, has_any_multiple=has_any_multiple, mode=mode, width=width, height=height, n=n_respondents, note_x=note_x, note_y=note_y, note_text=note_text, note_xanchor=note_xanchor, note_yanchor=note_yanchor)

In [0]:
def horizontal_bar_chart(
    df, col, title, xtitle='', mode='percent', no_show_no_answer=True, change_values_dict=None, max_x=None,
    height=250, width=900, note_y=0, note_x=1, note_text='', note_xanchor='right', note_yanchor='bottom',
):
    """Show a horizontal bar chart of the answers in a space-separated choice column.

    Parameters
    ----------
    df, col : frame and column holding space-separated answer codes.
    title : figure title.
    xtitle : accepted for signature compatibility; the x axis is left untitled.
    mode : 'percent' or 'count' — the measure used for bar length, order and labels.
    no_show_no_answer : drop rows whose answer is '' before counting.
    change_values_dict : optional label overrides applied on top of the survey labels.
    max_x : upper bound of the x axis; None lets plotly choose.
    height, width : figure size.
    note_x, note_y, note_xanchor, note_yanchor : footnote position (paper coordinates).
    note_text : extra text appended after the sample size.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if no_show_no_answer:
        df = df[df[col] != '']
    n = len(df)

    counts = df[col].str.split(' ').explode().value_counts().to_frame(name='count')
    choices = [choice for choice in counts.index.to_list() if choice]
    value_dict = get_values_dict(choices)

    if change_values_dict is not None:
        value_dict = shorten_choices(value_dict, change_values_dict)
    # Trailing space keeps tick labels off the axis; str() guards against
    # non-string labels coming from the choices sheet.
    value_dict = {k: str(v).strip() + ' ' for k, v in value_dict.items()}

    counts.index.name = 'answer'
    counts = counts.rename(index=value_dict)
    # Percentages are relative to respondents, not to the number of selections.
    n_respondents = df[col].notna().sum()
    counts['percent'] = (counts['count'] / n_respondents) * 100

    # Ascending sort puts the largest bar at the top of a horizontal chart.
    counts = counts.sort_values(by=mode, ascending=True)

    if mode == 'percent':
        bar_text = counts['percent'].round().astype(int).astype(str) + '%'
        hovertext = bar_text
    else:
        bar_text = counts['count'].astype(int).astype(str)
        hovertext = bar_text + ' responses'

    fig = go.Figure(
        go.Bar(
            x=counts[mode],
            y=counts.index,
            orientation='h',
            text=bar_text,
            hoverinfo='text',
            hovertext=hovertext,
            marker=dict(color='steelblue')
        )
    )

    fig.update_layout(
        title=f"{title}",
        xaxis_title=None,
        yaxis_title=None,
        height=height,
        width=width,
        margin=dict(l=150, r=20, t=50, b=50),
        yaxis=dict(automargin=True),
        xaxis=dict(showticklabels=False, range=[0, max_x] if max_x is not None else None),
        plot_bgcolor='white',
    )

    # The sample size is always known here, so the footnote is always drawn.
    note = f'Sample Size = {n}'
    if note_text != '':
        note = note + f' {note_text}'
    fig.add_annotation(
        text=note,
        xref="paper",
        yref="paper",
        x=note_x,
        y=note_y,
        xanchor=note_xanchor,
        yanchor=note_yanchor,
        showarrow=False,
        font=dict(size=10, color="black"),
    )

    fig.show()

In [0]:
def invert_df(df, cols):
    """Melt the answer columns into one row per (caseid, distinct answer).

    The source column name is stored under 'Barrier' and the individual answer
    token under 'System' (sentence-cased). Blank answers are dropped, and each
    answer is kept once per caseid even if it was given for several columns.
    """
    wide = df[list(cols) + ['caseid']]
    melted = wide.melt(id_vars='caseid', value_vars=cols, var_name='Barrier', value_name='System')

    answered = melted[melted['System'].str.strip() != '']
    per_token = (
        answered
        .assign(System=answered['System'].str.split(' '))
        .explode('System')
        .drop_duplicates(subset=['caseid', 'System'])
    )
    per_token['System'] = per_token['System'].apply(underscore_to_sentence)
    return per_token

def create_stacked_plots(df,
                         systems,
                         title,
                         legend='',
                         mode='count',
                         height=300,
                         width=600,
                         barwidth=0.25,
                         orientation='v',
                         show_blank_cols=True,
                         barmode='stack',
                         swap=False,
                         category_orders=None,
                         legend_format=None,
                         xtitle='',
                         ytitle='',
                         note_y=-.1,
                         note_x=.5
                            ):
    """Count the answers of several columns and return them as a stacked bar figure.

    Parameters
    ----------
    df : frame with one space-separated answer column per system (and
        'caseid' when ``swap`` is True).
    systems : {display label: column name}.
    title, xtitle, ytitle : chart titles.
    legend : legend title; '' becomes 'Legend'.
    mode : 'count' or 'percent' (share within each bar).
    height, width, barwidth, orientation, barmode, legend_format,
    category_orders, note_x, note_y : passed through to stacked_bar.
    show_blank_cols : keep bars for systems nobody answered (labelled '(n=0)');
        'No answer' rows of answered systems are dropped either way.
    swap : build the chart from invert_df instead, i.e. one bar per answer,
        stacked by source column.

    Returns the plotly figure produced by stacked_bar.
    """
    if legend == '':
        legend = 'Legend'

    cols = list(systems.values())
    if swap:
        long_df = invert_df(df, cols)
        has_any_multiple = False
        n = len(long_df)
    else:
        # Work on a copy restricted to rows with at least one non-blank answer.
        sub_df = df[cols].copy()
        sub_df = sub_df[~(sub_df == "").all(axis=1)]

        if len(cols) > 1:
            # Append each column's number of non-blank answers to its bar label.
            renamed_cols = {col: f"{col}_(n={(df[col] != '').sum()})" for col in cols}
            sub_df = sub_df.rename(columns=renamed_cols)
            systems = {label + ' ' + renamed_cols[col].split('_')[-1]: renamed_cols[col] for label, col in systems.items()}

        long_df, has_any_multiple = create_long_df(sub_df, systems)

        n = len(sub_df)
    counts = create_counts(long_df)

    color_discrete_map = create_color_map(counts)
    counts = rename_columns(counts, legend)
    counts = sentence_case_column(counts, legend)
    if show_blank_cols:
        # Drop 'No answer' rows except for systems that received no answers at all.
        answered_no_answer = ~counts['Process'].str.contains('(n=0)', regex=False) & (counts[legend] == 'No answer')
        counts = counts[~answered_no_answer].copy()
        empty_labels = [label for label in counts['Process'] if '(n=0)' in label]
    else:
        counts = counts[counts[legend] != 'No answer'].copy()
        empty_labels = None

    if mode == 'percent':
        counts['percent'] = counts['count'] / counts.groupby('Process')['count'].transform('sum') * 100

    fig = stacked_bar(counts, col=mode, title=title, legend=legend, has_any_multiple=has_any_multiple, color_discrete_map=color_discrete_map, orientation=orientation, height=height, width=width, n=n, empty_labels=empty_labels, barmode=barmode, barwidth=barwidth, category_orders=category_orders, legend_format=legend_format, xtitle=xtitle, ytitle=ytitle, note_y=note_y, note_x=note_x)

    return fig

In [0]:
## Display other breakdown
def print_other(col):
    """Print every non-blank free-text answer of the '<col>_other' column of df.

    ``col`` may be given with or without the '_other' suffix. Each answer is
    followed by an empty line.
    """
    other_col = col if '_other' in col else col + '_other'
    filled = df.loc[df[other_col].str.strip() != '', other_col]
    for answer in filled:
        print(answer)
        print()

In [0]:
# Columns representing each MIS system (display label -> answer column in df)
systems = {
    'All Integrated': 'mis_all_barriers_to_api_implementation',
    'Joint MIS no e-Cohesion': 'mis_only_no_e_cohesion_barriers_to_api_implementation',
    'Financial MIS': 'mis_financial_barriers_to_api_implementation',
    'M&E MIS': 'mis_me_barriers_to_api_implementation',
    'e-Cohesion': 'e_cohesion_barriers_to_api_implementation'
}

title='Challenges to API Adoption'
legend='Specific Challenges'
mode='count'
# swap=True: one bar per challenge, stacked by the system column it was reported for.
fig = create_stacked_plots(df, systems, title, legend, mode=mode, width=900, height=400, barwidth=0.3, show_blank_cols=True, barmode='stack', swap=True)
display(fig)

# TODO: aggregate by specific challenge
# TODO: do not double count votes
# TODO: consolidate systems into a single category


# Slide 1/4 - Background

In [0]:
"""
Total number of respondents
Percent of survey completed
Number of countries represented
number of respondents per country

"""
# NOTE(review): the first 7 rows of raw_df are skipped — presumably test submissions; confirm.
summ_df = raw_df[7:]
total = len(summ_df)
total_countries = summ_df['ms_name'].nunique()
countries = summ_df['ms_name'].value_counts().to_frame(name='count')
resp_df = summ_df.copy()

# --- Unique emails per country ---
# One row per e-mail address; exploded_df is not used again within this cell.
exploded_df = (
    resp_df
    .assign(join_email=resp_df['join_email'].str.split(' '))
    .explode('join_email')
)

exploded_df['join_email'] = exploded_df['join_email'].str.strip()
exploded_df = exploded_df[exploded_df['join_email'] != '']

# Number of distinct (ms_name, ma_name) pairs per ms_name, largest first.
unique_mas_per_country = (
    resp_df
    .drop_duplicates(subset=['ms_name', 'ma_name']) 
    .groupby('ms_name')['ma_name']
    .count()
    .reset_index(name='unique_ma_count')
).sort_values(by='unique_ma_count', ascending=False)

print(f'Total number of respondents: {total}')
print(f'Number of countries represented: {total_countries}')
# The table shown under this heading counts distinct ma_name values per ms_name, not rows.
print(f'Number of respondents per country:')
display(unique_mas_per_country)
print('checksum: ', sum(unique_mas_per_country['unique_ma_count']))

In [0]:
import plotly.express as px
import plotly.graph_objs as go


# Choropleth of unique_ma_count per ms_name, matched to countries by name.
fig = px.choropleth(
    unique_mas_per_country,
    locations='ms_name',
    locationmode='country names',
    color='unique_ma_count',
    hover_name='ms_name',
    color_continuous_scale= ['#e6f5d0', '#a1d99b', '#41ab5d'],
    labels={'unique_ma_count': 'Responses'},
    title='Responses by EU Country'
)


# Overlay the count as a text label on each country.
fig.add_trace(
    go.Scattergeo(
        locations=unique_mas_per_country['ms_name'],
        locationmode='country names',
        text=unique_mas_per_country['unique_ma_count'],
        mode='text',
        textfont=dict(size=10),
        showlegend=False
    )
)


# Restrict the visible map to latitudes 34..72 and longitudes -25..40.
fig.update_geos(
    projection_type='natural earth',
    showcountries=True,
    showcoastlines=True,
    coastlinecolor='Black',
    lataxis_range=[34, 72],
    lonaxis_range=[-25, 40]
)


#fig.write_image("/dbfs/tmp/eu_map.png", width=1000, height=800)
#displayHTML(f'<a href="/files/tmp/eu_map.png" target="_blank">Download EU Map</a>')


# Slide 2/4 - Use of Data and Reporting Systems

### Systems and Integrations - Systems per Managing Authority

In [0]:
# Translate the raw answer codes into display labels; both "one MIS integrated with
# e-Cohesion" codes share a single label. The new column is added to df itself.
df['system_integration_label'] = df['single_or_separate_system'].replace({
    'all_integrated': 'One system: integrated MIS and e-Cohesion',
    'one_mis_separate_e_cohesion': 'Two systems: one e-Cohesion, one MIS',
    'results_integrated_e_cohesion': 'Two systems: one MIS integrated with e-Cohesion, one additional MIS',
    'financial_integrated_e_cohesion': 'Two systems: one MIS integrated with e-Cohesion, one additional MIS',
    'all_separate': 'Multiple Systems',
})

# Blank answers are excluded, so sample_size is the number of answered rows.
filtered = df['system_integration_label'].replace('', pd.NA).dropna()
counts = filtered.value_counts()
sample_size = counts.sum()

def create_pie_chart(counts, title, sample_size,order=None,note_size=10,note_align='left', height=600,width=600,note_x=0.5,note_y=-0.35,legend_y=-.35):
    """Return a pie chart of ``counts`` with a horizontal legend and a sample-size note.

    Parameters
    ----------
    counts : Series of values indexed by slice label.
    title : figure title (centred).
    sample_size : number shown in the 'Sample Size = ...' annotation.
    order : optional explicit slice/legend order.
    note_size : font size of the annotation.
    note_align : horizontal alignment of the annotation text; per the plotly
        reference this only has an effect for multi-line text or an explicit
        annotation width.
    height, width : figure size.
    note_x, note_y : annotation position in paper coordinates.
    legend_y : vertical position of the legend in paper coordinates.
    """
    pie_df = pd.DataFrame({
        'label': counts.index.tolist(),
        'value': counts.values.tolist()
    })

    fig = px.pie(
        pie_df,
        names='label',
        values='value',
        color_discrete_sequence=['#2ca02c', '#1f77b4', '#ff7f0e', 'grey'],
        # Use the caller's title; a hard-coded one used to be set here and then overwritten.
        title=title,
        category_orders={'label': order} if order else None
    )

    fig.update_traces(
        textposition='inside',
        textinfo='percent',
        textfont_size=18,
        texttemplate='%{percent:.0%}',
        showlegend=True
    )

    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(size=20, color="#333")
        ),
        font=dict(size=12, color="#333"),
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=legend_y,
            xanchor='center',
            x=0.5
        ),
        height=height,
        width=width,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(t=80, b=50, l=20, r=20),
        annotations=[
            dict(
                text=f'Sample Size = {sample_size}',
                x=note_x,
                y=note_y,
                xref='paper',
                yref='paper',
                showarrow=False,
                font=dict(size=note_size),
                # Honour the note_align parameter, which was previously ignored.
                align=note_align
            )
        ]
    )

    return fig

# Show the underlying counts, then the pie chart (the figure is the cell's output).
title = 'System Integration Level'
display(counts)
create_pie_chart(counts, title, sample_size)

## Slide (3/4) - Integrations & Data Processes​

In [0]:
# Display label -> column holding each system's answer on API data transmission.
systems = {
    'All Integrated': 'mis_all_api_data_transmission',
    'MIS Only': 'mis_only_no_e_cohesion_api_data_transmission',
    'Financial MIS': 'mis_financial_api_data_transmission',
    'M&E MIS': 'mis_me_api_data_transmission',
    'e-Cohesion': 'e_cohesion_api_data_transmission'
}

value_change_dict={'yes': 'Yes', 'no': 'No', 'no_but_pursuing': 'Planned','does_not_know': "I don't know"}


# Title typo fixed ("Tranmission" -> "Transmission").
title='API Data Transmission <br>Capabilities for MA Systems'
legend = 'Answers'
# Stacked-bar variant; not displayed — fig is replaced by the pie chart below.
fig = create_stacked_plots(df, systems, title, legend, height=400,width=600)
# display(fig)

cols = systems.values()
sub_df = df[list(cols) + ['caseid']]
temp = df[list(cols)]
temp = temp[~(temp == '').all(axis=1)]
# Alternative sample-size note (per Managing Authority); overwritten by the per-system count below.
n = str(len(temp)) + ' Managing Authorities. MAs can have multiple systems'

# One row per (caseid, system) answer.
sub_df = sub_df.melt(id_vars='caseid', value_vars=cols, var_name='system', value_name='answer')

# sub_df = sub_df.drop_duplicates(subset=['caseid','answer'])
# Blank and "does not know" answers are excluded from the pie and from n.
sub_df = sub_df[~sub_df['answer'].isin(['', 'does_not_know'])]
n = len(sub_df)
# Rows of cases that answered for more than one system (kept for inspection).
duplicate_rows = sub_df[sub_df.duplicated(subset=['caseid'], keep=False)]
duplicate_rows = duplicate_rows.sort_values(by='caseid')

counts = sub_df['answer'].value_counts()
counts.index=counts.index.map(value_change_dict)
# display(sub_df)

order = ['Yes', 'Planned', 'No', "I don't know"]
note_x = .5
note_y = -.25
height = 350
legend_y= -.2
width = height
fig = create_pie_chart(counts, title, n, order, height = height, width = width,note_x=note_x,note_y=note_y,legend_y=legend_y)
display(fig)

In [0]:
# Single-column stacked bar of the raw system-integration answers.
systems = {
    'System Integration': 'single_or_separate_system',
}

title='System Integration Level'
# An empty legend title makes create_stacked_plots fall back to 'Legend'.
legend = ''
fig = create_stacked_plots(df,systems,title,legend,height=400,width=600,barmode='stack')
fig

In [0]:
# Display label -> column with the automation level of each data process.
systems = {
    'SFC': 'sfc_automation_level',
    'Arachne': 'arachne_submission',
    # 'MA Website': 'operations_publishing_pipeline',
}
title='Data Processes Automation'

sub_df = df[['sfc_automation_level','arachne_submission','operations_publishing_pipeline']]

# Collapse the question-specific answer codes into three shared automation levels.
value_change_dict = {'fully':'fully_automatic',
                        'mostly_automatic':'partially_automatic',
                        'mostly_manual':'partially_automatic',
                        'partially':'partially_automatic',
                        'not_at_all':'fully_manual',
                    }


def _recode_automation(value):
    """Map an answer code to its shared automation level; unknown codes pass through."""
    return value_change_dict.get(value, value)


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# use whichever the installed pandas provides.
if hasattr(pd.DataFrame, 'map'):
    sub_df = sub_df.map(_recode_automation)
else:
    sub_df = sub_df.applymap(_recode_automation)
legend = 'Automation Level'
category_orders=['Fully automatic','Partially automatic','Fully manual','Other']
legend_format=dict(
    title='',
    orientation="h",
    yanchor="bottom",
    y=-1,  # try -0.2 to -0.4 depending on how low you want it
    xanchor="center",
    x=.5
)
ytitle='% of Respondents'
note_x = .5
note_y = -.35
# legend_format=None

fig = create_stacked_plots(sub_df, systems, title, legend, mode='percent', height=400, barwidth=.5, width=500, orientation='v',category_orders=category_orders,legend_format=legend_format,ytitle=ytitle,note_x=note_x,note_y=note_y)
fig


## change to percent
## Remove MA Website
## logical progression of legend

## Slide (4/4) - Constraints

In [0]:
# Inspect the value -> label mapping for the distinct non-blank answer codes of this question.
choices = df['reason_for_no_api_use'].str.split(' ').explode().value_counts().to_frame(name='choices')
choices = list(filter(None, choices.index.to_list()))
value_dict = get_values_dict(choices)
value_dict

In [0]:
# Short aliases for the answer codes that free-text "other" responses are recoded to.
ud= 'under_development'
idk = "i_do_not_know"
lr = 'budgetary_restrictions'
not_decision_maker = 'not_decision_maker'
insufficient_info = 'insufficient_info'
# Manual recoding: exact free-text answer (as stored in the *_other column) -> answer code.
# Keys must match the survey text character for character, so they are left verbatim.
others = {
  "Communication between MIS and SFC needs to be developed": ud,
  "Automated system-to-system submission via API under developement":ud,
  "I have no idea. It's a technical question, same the question above. I am in the MA responsible for preparing reports and I have no in-depth knowledge on technical aspects of the process.": not_decision_maker,
  'Fewer and simpler programmes.': 'too_difficult',
  "we are currently exploring the development of a method to automate the data transmission to SFC. For the moment we've implemented a new local application which is able to work with APIs and we are now at the beginning of implementing imports of APIs from":ud,
  "As an end user, our organization is not involved in the technical decisions regarding web services or APIs for the SFC":not_decision_maker,
  "New eCohesion system, development has focused on must-have features and data integrity. Development on SFC web services / API will follow.":ud,
  "Limited IT resources":lr,
  "Zu wenig Informationen über API. Keine Werbung über die Möglichkeiten.": insufficient_info,
  "lack of recources":lr,
  "Non siamo adeguatamente informati di tali possibilità": insufficient_info,
  """Implementation of API is hard to facillitate in our JEMS system as it is run on goverment servers and external "links" are difficult to set up due to security concerns.""":lr,
  "insufficient information about API capabilities  of SFC available": insufficient_info,
  "Transmission of data should be incorporated at the central MIS level. Currently under development.":ud,
  "We don't know": idk,
  "too extensive and time-consuming programming":lr,
  '"An integration with SFC web services is planned in the near future."': ud,
  """the interoperability has not been ensured yet by SFC at the moment of the development of our system, so for this programming period we do not plan to change the existing system""": lr,
  "didn't know until now this was an option, we informed a couple years ago but then it wasn't an option": insufficient_info,
  """Decisions regarding the development of integration and their implementation are made at the level of the administrator - the Ministry of Funds and Regional Policy (MFiPR).""": not_decision_maker,
  "The system is designed for the program activity covered by national funds":idk,
  "There is no entry.":idk,
  "We don't information in this regard. MySMIS 2021/SMIS2021+ is operated by Special Telecommunications Service.":idk,
  "I don't know, data transfer is not under the responsibility of the MA, but national coordinating body":idk
}


# Chart settings; col1 holds the coded answers and col2 the free-text "other" answer.
ytitle=''
xtitle=None
title = f"Challenges to SFC API Implementation"
mode='percent'
col1 = 'reason_for_no_api_use'
col2 = 'reason_for_no_api_use_other'

def process_value(row, others):
    """Recode the 'other' token of a multi-select answer using a lookup table.

    Reads the module-level column names ``col1`` (coded answers) and ``col2``
    (the text looked up in ``others``).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by ``col1`` and ``col2``.
        others: Dict mapping a ``col2`` text to a replacement: a list of codes,
            a single code string, or an empty string (meaning "drop the token").

    Returns:
        The recoded tokens joined with single spaces.
    """
    raw_answer = row[col1]
    # A string answer is a space-separated list of codes; anything else is iterated as-is.
    tokens = raw_answer.split(' ') if isinstance(raw_answer, str) else raw_answer

    recoded = []
    for token in tokens:
        # Only an 'other' token whose text has a mapping is rewritten.
        if token != 'other' or row[col2] not in others:
            recoded.append(token)
            continue

        mapped = others[row[col2]]
        if isinstance(mapped, list):
            recoded.extend(mapped)
        elif isinstance(mapped, str) and mapped:
            recoded.append(mapped)
        # Empty strings (and any other type) drop the 'other' token entirely.

    return ' '.join(recoded)
  
# Work on an explicit copy: assigning into a bare slice of `df` raises
# pandas' SettingWithCopyWarning and leaves it ambiguous which frame is modified.
sub_df = df[[col1, col2]].copy()
sub_df[col1] = sub_df.apply(lambda row: process_value(row, others), axis=1)

# Drop rows whose recoded answer contains any of these codes (regex alternation).
excluded_codes = ['i_do_not_know', 'under_development', 'likes_as_is']
sub_df = sub_df[~sub_df[col1].str.contains('|'.join(excluded_codes), na=False)]

# Display labels for codes that need a friendlier name on the chart.
change_values_dict = {
  'not_decision_maker': 'Not the decision maker',
  'insufficient_info': 'Insufficient information on API',
}
width = 500

horizontal_bar_chart(sub_df, col1, title, xtitle, mode=mode, change_values_dict=change_values_dict, max_x=50, width=width)

In [0]:
# Distinct recoded answer strings left in the filtered frame.
sub_df['reason_for_no_api_use'].unique()

In [0]:
# Distinct values of the companion "other" column, as a plain list.
df['resources_to_implement_api_other'].unique().tolist()

In [0]:
# Distinct non-empty answer codes, ordered by how often they were selected.
token_counts = df['resources_to_implement_api'].str.split(' ').explode().value_counts()
choices = [token for token in token_counts.index.to_list() if token]

# Resolve the codes through the survey's choice list.
value_dict = get_values_dict(choices)
value_dict

In [0]:
# Chart: which resources respondents say they need to adopt the API.
# col1 holds the coded answers; col2 holds the text that `others` is keyed by.
col1 = 'resources_to_implement_api'
col2 = 'resources_to_implement_api_other'

title='Resources needed to facilitate broader API adoption'
xtitle = ''
ytitle=''
mode='percent'

# Short aliases for the answer codes used in the recoding table below.
bg = 'budget'
tr = 'training'
rg = 'regulatory_changes'
ta = 'technical_assistance'
idoc = 'improved_documentation'
pe = 'peer_experience'
mbi = 'member_state_buy_in'

# Named `all_resources` rather than `all`: binding `all` at notebook level
# shadows the builtin for every later cell and helper in the kernel.
all_resources = [bg, tr, rg, ta, idoc, pe, mbi]

# Manual recoding of free-text answers. Keys must match the survey text
# exactly (typos included). '' means the answer maps to no code.
others = {
    "I don't know": '',
    'provisions in EU regulations obliging the MAs to develop API connection with SFC': rg,
    'We have no interest in API-based reporting at this point.': '',
    'all of the above': all_resources,
    'As an end user of the SFC, our organization is not involved in the technical decisions regarding the adoption of API-based reporting. This would be a matter for our system administrators to evaluate and implement': '',
    'MA relies on internal non-IT specilaists wokring with external IT specialists to develop our eCoesion system. We made some early tentative steps towards web services at early development phase before decision to defer this due to the technical leap requi': '',
    'Need of improved IT resources': [bg, tr, ta, idoc],
    'CST2021 is not a MA system but a Member State system': mbi,
    'Data transfer runs fine even with the current set up': '',
    'NA': '',
    'We are satisfied with current situation.': '',
    'Currently under development.': '',
    'no further support needed': '',
    'Do not know': '',
    'We have to explore this now': '',
    'Decisions regarding the development of integration and their implementation are made at the level of the administrator - the Ministry of Funds and Regional Policy (MFiPR).': mbi,
    'Maggiori informazioni sulla disponibilità e modalità di utilizzo': [tr, ta, idoc],
    'dedicated infrastructure': ta,
    'Dedicated infrastructure': ta,
    'Does not fall under the responsibilities of the MA. MySMIS 2021/SMIS2021+ is operated by Special Telecommunications Service.': '',
    'Nie znamy odpowiedzi, informacji powinien udzielić właściciel systemu.': '',
    'I don´t know, I am not expert in this area.': '',
}


sub_df = process_others(df, col1, col2, others)

# Display labels for codes that need a friendlier name on the chart.
change_values_dict = {
  'member_state_buy_in': 'Central MS agency buy-in',
  'technical_assistance': 'IT support / technical assistance'
}

width = 550
horizontal_bar_chart(sub_df, col1, title, xtitle, mode=mode, width=width, change_values_dict=change_values_dict, max_x=70)

### Operational Needs Alignment

In [0]:
# Display label -> column with the "meeting needs" answer for that system.
systems = {
    "Joint MIS": "mis_meeting_needs",
    "Financial MIS": "mis_financial_meeting_needs",
    "M&E MIS": "mis_me_meeting_needs",
    "e-Cohesion": "e_cohesion_meeting_needs",
}

title = "How Well Systems Meet Operational Needs"
legend = ""

# Position passed to the helper for the figure note.
note_x = 0.5
note_y = -0.75

fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

In [0]:
# Display label -> column with the "programmes managed" answer for that system.
systems = {
    "Joint MIS": "mis_programmes_managed",
    "Financial MIS": "mis_financial_programmes_managed",
    "M&E MIS": "mis_me_programmes_managed",
    "e-Cohesion": "e_cohesion_programmes_managed",
}

title = "Programmes Managed In System"
legend = ""

fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
)
fig

In [0]:
# Display label -> barrier column for each system type.
# The next cell reuses this `systems` mapping to build a combined column.
systems = {
    "All Integrated": "mis_all_barriers_to_api_implementation",
    "Joint MIS no e-Cohesion": "mis_only_no_e_cohesion_barriers_to_api_implementation",
    "Financial MIS": "mis_financial_barriers_to_api_implementation",
    "M&E MIS": "mis_me_barriers_to_api_implementation",
    "e-Cohesion": "e_cohesion_barriers_to_api_implementation",
}

title = "Challenges to API Adoption"
legend = "Specific Challenges"

fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    width=900,
    height=500,
    barwidth=0.1,
    show_blank_cols=True,
    barmode="stack",
)
fig

In [0]:
title = 'Most mentioned barriers to API adoption'
legend = 'Most mentioned barriers to API adoption'

# Merge every barrier column into one space-separated answer per respondent.
# Relies on `systems` as defined by the preceding cell; blank cells are skipped.
barrier_cols = list(systems.values())
df['All systems'] = df[barrier_cols].apply(
    lambda row: ' '.join(text for text in row.astype(str) if text),
    axis=1,
)

combined_systems = {'All systems': 'All systems'}
note_x = 1
note_y = -0.1
fig = create_stacked_plots(
    df, combined_systems, title, legend,
    width=900, height=500, barwidth=0.1,
    show_blank_cols=True, barmode='stack',
    note_y=note_y, note_x=note_x,
)
fig

In [0]:
# Distinct non-empty answer codes, ordered by how often they were selected.
token_counts = df['reason_for_no_api_use'].str.split(' ').explode().value_counts()
choices = [token for token in token_counts.index.to_list() if token]

# Resolve the codes through the survey's choice list.
value_dict = get_values_dict(choices)

## Others

In [0]:
# Short aliases for the answer codes used in the recoding table below.
bg='budget'
tr='training'
rg='regulatory_changes'
ta='technical_assistance'
_id='improved_documentation'
pe='peer_experience'
al = 'All'
by = "buy-in_from_member_state_central_agency"


# Manual recoding of free-text answers. Keys must match the survey text exactly.
others = {"provisions in EU regulations obliging the MAs to develop API connection with SFC":  rg,
"all of the above":al,
"Need of improved IT resources": tr,
"CST2021 is not a MA system but a Member State system":by,
"Decisions regarding the development of integration and their implementation are made at the level of the administrator - the Ministry of Funds and Regional Policy (MFiPR).":by,
"Maggiori informazioni sulla disponibilità e modalità di utilizzo" : tr
 
}

col1 = 'resources_to_implement_api'
col2 = 'resources_to_implement_api_other'
title = "Reported Resources Needed for API Adoption"
xtitle=''

sub_df = process_others(df,col1,col2,others)
mode='percent'
ytitle=None
# Chart the recoded frame and the column defined in this cell. The previous
# call used `df` and `col`, so the recoding above was ignored and the cell
# depended on a `col` left over from some other cell.
bar_chart(sub_df, col1, title=title, xtitle=xtitle, mode=mode,height=550)

In [0]:
# Display label -> barrier column for each system type.
systems = {
    "All Integrated": "mis_all_barriers_to_api_implementation",
    "Joint MIS no e-Cohesion": "mis_only_no_e_cohesion_barriers_to_api_implementation",
    "Financial MIS": "mis_financial_barriers_to_api_implementation",
    "M&E MIS": "mis_me_barriers_to_api_implementation",
    "e-Cohesion": "e_cohesion_barriers_to_api_implementation",
}
cols = list(systems.values())

# Long format: one row per (caseid, system) pair, keeping only non-blank answers.
sub_df = df[cols + ['caseid']].melt(
    id_vars='caseid',
    value_vars=cols,
    var_name='System',
    value_name='Barrier',
)
sub_df = sub_df[sub_df['Barrier'] != '']

# n is the number of non-blank (caseid, system) answers, used as the percentage base.
n = len(sub_df)

# Count each barrier code across all answers.
exploded = sub_df['Barrier'].str.split(' ').explode()
counts = exploded.value_counts().to_frame().rename(columns={'Barrier': 'count'})
counts.index.name = 'answer'
counts = counts.reset_index()
counts['percent'] = counts['count'] / n * 100

counts['answer'] = counts['answer'].apply(underscore_to_sentence)
display(counts)

title = f'Challenges to Broad API Adoption<br> (n={n})'
xtitle = ''
ytitle = ''
has_any_multiple = False

fig = vertical_bar_chart(
    counts,
    title=title,
    xtitle=xtitle,
    ytitle=ytitle,
    has_any_multiple=has_any_multiple,
    width=400,
    height=500,
)
display(fig)

In [0]:
systems = {"System Integration": "system_integration_level"}
title = "Integration Level of Systems"
legend = "Integration Level"

# Exclude respondents whose 'single_or_separate_system' answer is 'all_integrated'.
sub_df = df[df['single_or_separate_system'] != 'all_integrated']

note_x = 0.5
note_y = -0.2
fig = create_stacked_plots(
    sub_df,
    systems,
    title,
    legend,
    barwidth=0.1,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

In [0]:
# Single-column stacked chart for the 'arachne_submission' answers.
systems = {"Arachne": "arachne_submission"}
title = "Arachne Data Submission"
legend = ""

note_x = 0.5
note_y = -0.2
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

In [0]:
# Single-column stacked chart for the 'arachne_information_origin' answers.
systems = {"Arachne": "arachne_information_origin"}
title = "Where Arachne Data Comes From"
legend = ""

note_x = 0.5
note_y = -0.2
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

In [0]:
# Single-column stacked chart for the 'sfc_submission_types' answers, in percent mode.
systems = {"SFC": "sfc_submission_types"}
title = "How Data is Submitted to SFC"
legend = "Submission types"

note_x = 2
note_y = -0.05
fig = create_stacked_plots(
    df,
    systems,
    title,
    mode='percent',
    legend=legend,
    height=400,
    width=400,
    note_y=note_y,
    note_x=note_x,
)
display(fig)

In [0]:
# Show the raw column behind the chart above.
df.loc[:, 'sfc_submission_types']

In [0]:
#  def classify(row):
#     options = len(row)
#     row = row.replace('',np.nan).dropna()
#     unique_vals = len(row)
#     if unique_vals < options:
#         return f'Answered {unique_vals}'
#     else:
#         return 'Answered All {unique_vals}'
    
# sub_df = df[list(systems.values()) + ['caseid']]
# sub_df['match_type'] = sub_df.apply(lambda row: classify(row), axis=1)
# # display(sub_df)

# percentages = sub_df['caseid'].value_counts().to_frame()
# # display(percentages)
# percentages['match_type'] = 'Answered ' + percentages.index.astype(str)
# percentages = percentages.round(1).reset_index()
# percentages.columns = ['match_type', 'percent']

# for i,row in percentages.iterrows():
#     print(row['match_type'], ': ', row['percent'],'%')

# ## explore how much overlap there is 

In [0]:
# Single-column stacked chart for the 'no_excel_upload_reason' answers.
systems = {"No Excel Reason": "no_excel_upload_reason"}
title = "Reason to Not Use Excel Upload"
legend = ""

note_x = 2
note_y = -0.05
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

## Dig into Other

In [0]:
# Inspect the 'no_excel_upload_other_reason' column through the notebook's print_other helper
# (defined outside this section; presumably prints the free-text answers — TODO confirm).
print_other('no_excel_upload_other_reason')

In [0]:
# Display label -> column with the data-origin answer for each group of tables.
systems = {
    "Tables 1 & 2": "table_1_to_2_data_origin",
    "Tables 5-10": "table_5_to_10_data_origin",
    "Table 12": "table_12_data_origin",
}

title = "Data Origins for Different Tables"
legend = ""
mode = "percent"
note_x = 2
note_y = -0.05
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    mode=mode,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
display(fig)

# Keep only the three data-origin columns for the answer-count breakdown below.
sub_df = df[list(systems.values())]
import numpy as np
def classify(row):
    """Label a respondent by how many of the table questions they answered.

    Args:
        row: pandas Series of answers; empty strings and NaN count as unanswered.

    Returns:
        'Answered 0', 'Answered 1', 'Answered 2', or 'Answered All 3'
        (the last for three or more non-blank values).
    """
    answered = row.replace('', np.nan).dropna()
    n_answered = len(answered)
    # A fully blank row used to fall through to the final branch and be
    # reported as 'Answered All 3', which inflated that share.
    if n_answered == 0:
        return 'Answered 0'
    if n_answered == 1:
        return 'Answered 1'
    elif n_answered == 2:
        return 'Answered 2'
    else:
        return 'Answered All 3'

# Build a new frame with .assign rather than writing a column into `sub_df`,
# which is a slice of `df` and would raise SettingWithCopyWarning.
sub_df = sub_df.assign(match_type=sub_df.apply(classify, axis=1))

# Share of respondents per label, in percent, rounded to one decimal.
percentages = sub_df['match_type'].value_counts(normalize=True) * 100
percentages = percentages.round(1).reset_index()
percentages.columns = ['match_type', 'percent']

for i, row in percentages.iterrows():
    print(row['match_type'], ': ', row['percent'], '%')
## Show overlap
## % overlap

In [0]:
# Single-column stacked chart for the 'data_validation_procedure' answers.
systems = {"Manual Steps": "data_validation_procedure"}

# str.title() capitalises every word of the heading.
title = 'How to ensure Data Quality before submission'.title()
legend = ""

note_x = 2
note_y = -0.05
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

In [0]:
# Single-column stacked chart for the 'reason_for_no_api_use' answers.
systems = {"Reason": "reason_for_no_api_use"}

# str.title() capitalises every word of the heading.
title = 'Reasons for not implementing web services api'.title()
legend = ""

note_x = 2
note_y = -0.05
fig = create_stacked_plots(
    df,
    systems,
    title,
    legend,
    height=400,
    width=600,
    note_y=note_y,
    note_x=note_x,
)
fig

## Dig into other

In [0]:
# Inspect 'reason_for_no_api_use' through the notebook's print_other helper (defined outside this section).
# NOTE(review): other print_other calls in this notebook pass a column ending in
# '_other' / '_other_reason'; this one passes the base column — confirm which the helper expects.
print_other('reason_for_no_api_use')

In [0]:
# Percent bar chart of the coded reasons for not using the API.
col = 'reason_for_no_api_use'
title = 'Reasons for not implementing web services api'
xtitle = ''
ytitle = None
mode = 'percent'

# Earlier variants of this cell, kept for reference:
# xtitle = 'Reasons'
# ytitle=None
# mode='count'
# bar_chart(df, col, title, xtitle, ytitle, mode)
#
# col = 'resources_to_implement_api'
# title='Resources needed to facilitate broader API access'

bar_chart(df, col, title, xtitle=xtitle, mode=mode)

In [0]:
# Inspect 'resources_to_implement_api' through the notebook's print_other helper (defined outside this section).
# NOTE(review): other print_other calls in this notebook pass a column ending in
# '_other' / '_other_reason'; this one passes the base column — confirm which the helper expects.
print_other('resources_to_implement_api')

In [0]:
# Percent bar chart of the 'reporting_clarity' answers.
col = 'reporting_clarity'
title = 'Reporting Requirements clarity'
xtitle = ''
ytitle = ''
mode = 'percent'
bar_chart(
    df,
    col,
    title,
    xtitle,
    mode=mode,
    height=500,
)

In [0]:
# Inspect the 'reason_for_no_api_use_other' column through the notebook's print_other helper
# (defined outside this section; presumably prints the free-text answers — TODO confirm).
print_other('reason_for_no_api_use_other')

In [0]:
# Percent bar chart of the 'prior_api_considerations' answers.
col = 'prior_api_considerations'
title = "Current or prior interest in API implementation"
xtitle = ''
ytitle = None
mode = 'percent'

# Long answer text -> short chart label.
# NOTE(review): these keys read like barrier answers, not answers to this question —
# confirm the mapping belongs to this column.
change_values_dict = {
    'Lack of system integration — data extraction from multiple sources': 'Too many sources',
    'Differences in data formats — conversion and reconciliation': 'Different data formats',
    'Budgetary restrictions': 'Budget',
    'Manual deduplication of entries before submission': 'Manual deduplication',
    'Data quality checks requiring human review': 'Manual data quality checks',
    'No need for automation, system runs fine as is\xa0': 'No need',
    'Too difficult to implement': 'Too difficult',
}

# Show the raw column before charting it.
display(df[col])
bar_chart(
    df,
    col,
    title,
    xtitle=xtitle,
    mode=mode,
    change_values_dict=change_values_dict,
)

In [0]:
# Load the 'survey' sheet of the survey workbook.
fields_df = pd.read_excel(
    survey_path,
    sheet_name='survey',
    engine='openpyxl',
)
fields_df

In [0]:
# Keep only survey fields whose 'type' contains 'select'; rows with a missing type are excluded.
is_select_field = fields_df['type'].str.contains('select', na=False)
roi = fields_df[is_select_field]
roi.head()


In [0]:
# One stacked chart per select-type survey field that exists in `df`.
for name in roi['name'].to_list():
    if name not in df.columns:
        continue

    title_name = name.replace('_', ' ').capitalize()
    systems = {
        title_name: name,
    }
    title = title_name
    legend = 'Legend'

    try:
        # Pass the full frame, as every other call in this notebook does.
        # The earlier version passed the Series df[name].
        fig = create_stacked_plots(df, systems, title, legend, barwidth=.1, height=400, width=600)
    except Exception as exc:
        # Keep going, but report the failure instead of hiding it.
        print(f"Skipping {name!r}: {type(exc).__name__}: {exc}")
        continue

    # A bare `fig` expression inside a loop renders nothing; display it explicitly.
    display(fig)

In [0]:
# Select-type survey fields that are also columns of `df`, in survey order.
# The earlier union-then-filter returned every survey name, including ones
# missing from `df`, in an order that changed between runs.
roi_names = roi['name'].dropna().to_list()
names = [name for name in dict.fromkeys(roi_names) if name in df.columns]
names

In [0]:
# One stacked chart per field in `names`, titled with a humanised field name.
for name in names:
    title_name = name.replace('_', ' ').capitalize()
    systems = {title_name: name}
    title = title_name
    legend = ''

    note_x = 1.3
    note_y = -0.28
    fig = create_stacked_plots(
        df,
        systems,
        title,
        legend,
        height=400,
        width=600,
        note_x=note_x,
        note_y=note_y,
    )
    display(fig)

In [0]:
# Collect the non-blank regulatory improvement suggestions, show them, and export them.
# The earlier version stored display()'s return value (None) and then called
# to_excel() on an undefined name with no target path.
suggestions_col = 'regulatory_improvement_suggestions'
expl_df = df.loc[df[suggestions_col] != '', [suggestions_col]]
display(expl_df)

# Written relative to the working directory; change the path if the export belongs elsewhere.
expl_df.to_excel('regulatory_improvement_suggestions.xlsx')