In [0]:
%pip install openpyxl

import pandas as pd
import numpy as np

## Bring in Cleaned Data

In [0]:
# Catalog and schema that hold the survey tables.
catalog = 'prd_mega'
schema = 'sdgreg25'

# Names of the three tables read below.
clean_name = 'ma_survey_clean'
labels_name = 'ma_survey_labels'
variables_name = 'ma_survey_variables'

# Read each table through the Spark session:
#   rdf -> responses, ldf -> labels, vdf -> variables
rdf = spark.read.table('.'.join([catalog, schema, clean_name]))
ldf = spark.read.table('.'.join([catalog, schema, labels_name]))
vdf = spark.read.table('.'.join([catalog, schema, variables_name]))



In [0]:
# display(rdf)
# display(ldf)
# display(vdf.toPandas()['standalone_mis_name'])

In [0]:
# Raw survey responses (CSV export).
responses_path = r'/Volumes/prd_mega/sdgreg25/vdgreg25/Documents/MA Survey/MA Survey Results.csv'
responses_df_raw = pd.read_csv(responses_path)

# Survey definition workbook: the 'choices' and 'survey' sheets are read in a
# single call, which returns a dict of DataFrames keyed by sheet name.
survey_path = r'/Volumes/prd_mega/sdgreg25/vdgreg25/Documents/temp/MA_Survey_Final.xlsx'
survey_sheets = pd.read_excel(survey_path, sheet_name=['choices', 'survey'], engine='openpyxl')

choice_df_raw = survey_sheets['choices']
question_df_raw = survey_sheets['survey']

## Adding the `type` column to `ma_survey_variables`

In [0]:
# survey_path = r'/Volumes/prd_mega/sdgreg25/vdgreg25/Documents/temp/MA_Survey_Final.xlsx'
# question_df_raw = pd.read_excel(survey_path, sheet_name='survey', engine='openpyxl')

# question_df_raw[['name','type']]
# vdf_temp = vdf.toPandas()
# vdf_temp['column_name'] = vdf_temp['column_name'].str.replace('_clean_consol','').str.replace('_clean','')
# vdf_temp = vdf_temp.drop(columns=['type'])

# temp = pd.merge(vdf_temp, question_df_raw[['name', 'type']], left_on='column_name', right_on='name', how='left').drop(columns=['name'])

# sdf = spark.createDataFrame(temp)

# sdf.write \
#   .option("mergeSchema", "true") \
#   .mode("overwrite") \
#   .saveAsTable(f"{catalog}.{schema}.{variables_name}")

In [0]:
# Keep only the variables whose `type` contains 'select' or 'text' (the filter
# runs in Spark), then bring the filtered rows into pandas.
qdf = vdf[vdf['type'].contains('select') | vdf['type'].contains('text')].toPandas()

# Show which cells of the variables table are missing. `vdf` is a Spark
# DataFrame, which has no `isna()` method, so it is converted to pandas first.
display(vdf.toPandas().isna())

In [0]:
# score dataframe: a pandas copy of the labels table
sdf = ldf.toPandas().copy()

sections = sdf['list_name'].drop_duplicates().to_list()

# Within each `list_name` group, number the rows 0, 1, 2, ... in their
# existing order and store that position in the `score` column.
for section in sections:
    section_labels = sdf[sdf['list_name'] == section].index
    for position, row_label in enumerate(section_labels):
        sdf.loc[row_label, 'score'] = position

display(rdf[rdf['ma_name']=='Resources, programming, culture, tourism'])

## Scoring Functions

In [0]:
def score(row, questions, dicts: list = None, responses=None):
    """Score one respondent's answers to a set of survey questions.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['caseid']``; it selects the respondent's answers.
    questions : iterable of str
        Column names of the responses table to score.
    dicts : list of dict, optional
        Scoring dictionaries keyed by the question name with the
        ``mis_financial_`` / ``mis_me_`` / ``mis_`` / ``e_cohesion_`` parts
        removed. Each value is either a dict mapping answer code -> points
        (choice question) or a single number (free-text question).
        Defaults to no dictionaries, i.e. nothing is scored.
    responses : pandas.DataFrame, optional
        Responses table with a ``caseid`` column. When omitted, the global
        Spark frame ``rdf`` is converted with ``toPandas()`` on every call,
        which is expensive; pass a pre-converted frame when scoring many rows.

    Returns
    -------
    list
        ``[score, total_questions]``. ``total_questions`` grows by one per
        selected option of a choice question and by one per answered
        free-text question. ``[0, 0]`` is returned when ``caseid`` is not
        found in ``responses``; if it occurs more than once, the first
        matching row is used.

    Raises
    ------
    KeyError
        If a selected answer code is missing from its scoring dictionary, or
        a question is not a column of ``responses``.
    """
    if dicts is None:
        dicts = []
    if responses is None:
        # Convert once per call instead of once per lookup.
        responses = rdf.toPandas()

    matches = responses[responses['caseid'] == row['caseid']]
    if len(matches) == 0:
        return [0, 0]
    answer_row = matches.iloc[0]

    total = 0
    total_questions = 0
    for q in questions:
        all_answers = answer_row[q]

        # Skip questions the respondent did not answer.
        if pd.isna(all_answers) or all_answers == '':
            continue

        # Scoring dictionaries are keyed by the question name without the
        # system-specific parts.
        key = q.replace('mis_financial_','').replace('mis_me_','').replace('mis_', '').replace('e_cohesion_', '')

        for dic in dicts:
            if key not in dic:
                continue
            points = dic[key]
            if isinstance(points, dict):
                # Choice question: one score per selected option. split()
                # without an argument ignores repeated or trailing spaces.
                for answer in all_answers.split():
                    total_questions += 1
                    total += points[answer]
            else:
                # Free-text question: a flat score for having answered at
                # all, not one per word (consistent with score_text).
                total_questions += 1
                total += points

    return [total, total_questions]

In [0]:
def score_choices(row, col_ans_dict,score=0):
    """Sum the points of every answer code selected in the scored columns.

    `col_ans_dict` maps a column name to a dict of answer code -> points.
    Each column value in `row` is a space-separated string of answer codes;
    missing or empty values and empty tokens contribute nothing. The sum is
    added to the starting `score` and returned. An answer code absent from
    its column's dict raises KeyError.
    """
    for col, answer_points in col_ans_dict.items():
        selected = row[col]
        if pd.isna(selected) or selected == '':
            continue
        for code in selected.split(' '):
            if pd.isna(code) or code == '':
                continue
            score += answer_points[code]

    return score

def count_questions_answered(row, col_ans_dict,score=0):
    """Count how many of the columns in `col_ans_dict` have an answer in `row`.

    A column counts as answered when its value is neither missing nor the
    empty string. The `score` argument is accepted but not used.
    """
    answered = [
        col for col in col_ans_dict.keys()
        if not (pd.isna(row[col]) or row[col] == '')
    ]
    return len(answered)

def score_text(row,text_ans_dict,score=0):
    """Add a flat number of points for every free-text column that was filled in.

    `text_ans_dict` maps a column name to the points granted when that
    column's value in `row` is neither missing nor the empty string. The
    points are added to the starting `score`, which is returned.
    """
    for col, points in text_ans_dict.items():
        answer = row[col]
        if pd.isna(answer) or answer == '':
            continue
        score += points

    return score

In [0]:
import pandas as pd
import plotly.express as px
import numpy as np

def plot_df(df):
    """Render the standard set of score visualisations for one scored frame.

    `df` must contain the columns `ma_name`, `ms_name` and `total_score`.
    Shown in order: a horizontal bar chart and a table of the ten
    highest-scoring rows, a histogram of all scores with 25th/50th/75th
    percentile markers, a country choropleth of the mean score per
    `ms_name`, and a bar chart of distinct `ma_name` values per `ms_name`.
    Nothing is returned.
    """
    # --- Ten highest scores: horizontal bar chart ---
    top_ten = df.sort_values("total_score", ascending=False).head(10)
    bar_order = top_ten.sort_values("total_score")['ma_name'].tolist()
    top_bar = px.bar(
        top_ten,
        x="total_score",
        y="ma_name",
        orientation='h',
        title="Total Score by Managing Authority",
        category_orders={"ma_name": bar_order},
    )
    top_bar.update_layout(xaxis_title="Total Score", yaxis_title="Managing Authority")
    top_bar.show()

    # --- The same ten rows as a table ---
    display(top_ten[['ma_name','ms_name','total_score']])

    # --- Histogram of all scores with percentile markers ---
    quartiles = np.percentile(df["total_score"], [25, 50, 75])
    score_hist = px.histogram(df, x="total_score", nbins=10, title="Distribution of Scores with Percentiles")
    for label, value in zip(["25th", "50th", "75th"], quartiles):
        score_hist.add_vline(
            x=value,
            line_dash="dash",
            line_color="red",
            annotation_text=f"{label}: {value:.1f}",
            annotation_position="top right",
        )
    score_hist.update_layout(xaxis_title="Total Score", yaxis_title="Frequency")
    score_hist.show()

    # --- Mean score per member state on a map ---
    mean_by_country = df.groupby("ms_name", as_index=False)["total_score"].mean()
    country_map = px.choropleth(
        mean_by_country,
        locations="ms_name",
        locationmode="country names",
        color="total_score",
        color_continuous_scale="Viridis",
        title="Average Total Score by Country",
    )
    country_map.show()

    # --- Number of distinct managing authorities per member state ---
    ma_counts = (
        df.groupby("ms_name")["ma_name"]
        .nunique()
        .reset_index()
        .rename(columns={"ms_name": "Member State", "ma_name": "Number of MAs"})
        .sort_values("Number of MAs", ascending=False)
    )
    count_bar = px.bar(
        ma_counts,
        x="Member State",
        y="Number of MAs",
        title="Number of Managing Authorities per Member State",
        text="Number of MAs",
    )
    count_bar.update_traces(textposition="outside")
    count_bar.update_layout(yaxis_title="Number of MAs", xaxis_title="Member State")
    count_bar.show()


## Technological Foresight and Engagement


In [0]:
# Points per selected answer code, keyed by response column.
clean_scoring = {
    'prior_api_considerations': {
        'financial_information': 1,
        'output_and_result_indicators': 1,
        'financial_instrument_information': 1,
        'no': 0,
    },
    'reason_for_no_api_use': {
        'unaware': -1,
        'under_development': 1,  # proactive measure in favor of
        'lack_of_information': -1,
        'no_integration_developed': -1,
        'managed_by_higher_level': 0,  # not in their court
        'lack_of_systems_integration': -1,
        'diffs_in_data_formats': -1,
        'budgetary_restrictions': -1,
        'manual_deduplication': -1,
        'data_quality_checks': -1,
        'likes_as_is': -1,
        'too_difficult': -1,
        'other': -1,
    },
    'resources_to_implement_api': {
        'external_buy_in': 1,
        'no_interest': 0,
        'field_compatibility': 1,
        'no_support_needed': 0,
        'unaware': 0,
        'budget': 1,
        'training': 1,
        'regulatory_changes': 1,
        'technical_assistance': 1,
        'improved_documentation': 1,
        'peer_experience': 1,
        'other': 1,
    },
    'reporting_clarity': {
        'yes': 2,
        'somewhat': 1,
        'no': 0,
        'does_not_know': 0,
    },
    'arachne_use': {
        'yes': 1,
        'no': 0,
        'unsure': 0,
    },
}

# Flat points granted when the free-text column has any answer.
clean_texts = {
    'regional_national_integration_wants': 1,
    'regulatory_improvement_suggestions': 1,
    'missed_automation_opportunities': 1,
    'reporting_integration_challenges': 1,
}

In [0]:
# Responses as a pandas frame; every row is scored against the dictionaries
# for this section.
pd_rdf = rdf.toPandas()

# Identifying columns (missing values blanked) plus the choice score, the
# free-text score and their sum.
mas_clean = (
    pd_rdf.fillna('')
    .loc[:, ['ma_name', 'ms_name', 'caseid']]
    .assign(
        clean_score=pd_rdf.apply(score_choices, axis=1, args=(clean_scoring,)),
        clean_text_score=pd_rdf.apply(score_text, axis=1, args=(clean_texts,)),
    )
    .assign(total_score=lambda scored: scored['clean_text_score'] + scored['clean_score'])
)

plot_df(mas_clean)

## Technology Sophistication of Managing Authorities

In [0]:
# Points for the single/separate-system question on its own.
# Not referenced by the scoring cells visible in this notebook.
tsma_systems = {
    'single_or_separate_system': {
        'results_integrated_e_cohesion': 1,
        'financial_integrated_e_cohesion': 1,
        'one_mis_separate_e_cohesion': 1,
        'all_integrated': 5,
        'all_separate': 0,
    },
}

# Choice questions scored for this section (answer code -> points).
tsma_scores = {
    'single_or_separate_system': {
        'results_integrated_e_cohesion': 1,
        'financial_integrated_e_cohesion': 1,
        'one_mis_separate_e_cohesion': 1,
        'all_integrated': 5,
        'all_separate': 0,
    },
    'system_integration_level': {
        'fully': 3,
        'partially': 2,
        'barely': 1,
        'not_at_all': 0,
        'na': 0,
    },
}

# Choice questions keyed WITHOUT a system prefix; apply_hdrs adds one copy
# per prefix.
tsma_boiler_scores = {
    'meeting_needs': {
        'fully': 2,
        'partially': 1,
        'minimally': 0,
    },
    'updates': {
        'yes_maintenance': 2,
        'yes_maintenance_and_improvements': 3,
        'no_but_adds_regulation': 1,
        'not_at_all': 0,
    },
    'api_data_transmission': {
        'yes': 2,
        'no': 0,
        'no_but_pursuing': 1,
        'does_not_know': 0,
    },
    'barriers_to_api_implementation': {
        'lack_of_technical_capacity': -1,
        'budget_constraints': -1,
        'regulatory_restrictions': -1,
        'legacy_issues': -1,
        'security_concerns': -1,
        'satisfied': -1,
        'other': -1,
    },
}

# Free-text questions keyed without a system prefix (flat points if answered).
tsma_boiler_texts = {
    'explain_partially': 0,
    'other_api_barriers': 1,
}

# Free-text questions scored for this section.
tsma_texts = {
    'system_integration_level_partial_barely': 0,
}

def apply_hdrs(boiler_dict,parent_dict):
    """Add every entry of `boiler_dict` to `parent_dict` once per system prefix.

    For each prefix in 'mis_', 'mis_financial_', 'mis_me_' and 'e_cohesion_',
    and each key of `boiler_dict`, sets `parent_dict[prefix + key]` to the
    boilerplate value (the same object, not a copy). `parent_dict` is
    modified in place and also returned.
    """
    tsma_boiler_headers = ['mis_','mis_financial_','mis_me_','e_cohesion_']
    parent_dict.update({
        header + q: value
        for header in tsma_boiler_headers
        for q, value in boiler_dict.items()
    })

    return parent_dict

# Expand the prefix-less boilerplate questions into one entry per system
# prefix and fold them into this section's scoring dictionaries.
tsma_scores = apply_hdrs(boiler_dict=tsma_boiler_scores, parent_dict=tsma_scores)
tsma_texts = apply_hdrs(boiler_dict=tsma_boiler_texts, parent_dict=tsma_texts)

In [0]:
# Responses as a pandas frame; every row is scored against the dictionaries
# for this section.
pd_rdf = rdf.toPandas()

# Identifying columns (missing values blanked) plus the choice score, the
# free-text score and their sum.
mas_tsma = (
    pd_rdf.fillna('')
    .loc[:, ['ma_name', 'ms_name', 'caseid']]
    .assign(
        tsma_score=pd_rdf.apply(score_choices, axis=1, args=(tsma_scores,)),
        tsma_text_score=pd_rdf.apply(score_text, axis=1, args=(tsma_texts,)),
    )
    .assign(total_score=lambda scored: scored['tsma_score'] + scored['tsma_text_score'])
)

plot_df(mas_tsma)

## MA Data Processing Capability

In [0]:
# Points per selected answer code, keyed by response column.
mdpc_scores = {
    'sfc_submission_types': {
        'manual': 0,
        'api': 2,
        'excel': 1,
        'other': 0,
    },
    'no_excel_upload': {
        'api': 1,
        'mis_does_not_generate': 0,
        'unaware': 0,
        'error': 0,
    },
    'data_validation_procedure': {
        'training': 0,
        'audits': 0,
        'double_entry': 0,
        'manual_check': 0,
        'other': 0,
    },
    'operations_publishing_pipeline': {
        'fully_automatic': 3,
        'mostly_automatic': 2,
        'mostly_manual': 1,
        'fully_manual': 0,
    },
    'operations_publishing_manual_steps': {
        'manual_creation': 0,
        'it_system_download': 0,
        'manual_publishing': 0,
        'api_planned': 0,
        'external_department_involved': 0,
        'manual_check': 0,
        'publication_not_working': 0,
    },
    'arachne_submission': {
        'fully_automatic': 3,
        'mostly_automatic': 2,
        'mostly_manual': 1,
        'fully_manual': 0,
    },
}

# Flat points granted when the free-text column has any answer.
mdpc_texts = {
    'manual_steps_explanation': 0,
}

In [0]:
# Responses as a pandas frame; every row is scored against the dictionaries
# for this section.
pd_rdf = rdf.toPandas()

# Identifying columns (missing values blanked) plus the choice score, the
# free-text score and their sum.
mas_mdpc = (
    pd_rdf.fillna('')
    .loc[:, ['ma_name', 'ms_name', 'caseid']]
    .assign(
        mdpc_score=pd_rdf.apply(score_choices, axis=1, args=(mdpc_scores,)),
        mdpc_text_score=pd_rdf.apply(score_text, axis=1, args=(mdpc_texts,)),
    )
    .assign(total_score=lambda scored: scored['mdpc_score'] + scored['mdpc_text_score'])
)

plot_df(mas_mdpc)

## Cross-System Integration Within Country

In [0]:
# Points per selected answer code, keyed by response column. Only the
# existing regional/national integrations carry points; every other answer
# in this section is worth 0.
csi_scores = {
    'regional_national_integration_existing': {
        'population_registry': 1,
        'other_mis': 1,
        'public_procurement_system': 1,
        'e_government': 1,
        'keep': 1,
        'national_data_bank': 1,
        'funding_management': 1,
        'index': 1,
        'specific_national_registry': 1,
        'financial_management_system': 1,
    },
    'eu_integration_existing': {
        'public_procurement_system': 0,
        'other_mis': 0,
        'arachne': 0,
        'keep': 0,
        'sfc': 0,
        'index': 0,
    },
    'eu_integration_wants': {
        'ims': 0,
        'public_procurement_system': 0,
        'other_mis': 0,
        'arachne': 0,
        'keep': 0,
        'funding_management': 0,
        'sfc': 0,
        'index': 0,
        'specific_national_registry': 0,
    },
    'table_1_to_2_data_origin': {'mis': 0, 'e_cohesion': 0, 'other': 0},
    'table_5_to_10_data_origin': {'mis': 0, 'e_cohesion': 0, 'other': 0},
    'table_12_data_origin': {'mis': 0, 'e_cohesion': 0, 'other': 0},
    'sfc_automation_level': {'fully': 0, 'partially': 0, 'not_at_all': 0, 'other': 0},
    'operation_data_origin': {'me_mis': 0, 'financial_mis': 0, 'e_cohesion': 0, 'other': 0},
    'arachne_information_origin': {'me_mis': 0, 'financial_mis': 0, 'e_cohesion': 0, 'other': 0},
}

# Flat points granted when the free-text column has any answer.
csi_texts = {
    'operation_data_origin_other': 0,
    'other_arachne_information_origin': 0,
}

In [0]:
# Responses as a pandas frame; every row is scored against the dictionaries
# for this section.
pd_rdf = rdf.toPandas()

# Identifying columns (missing values blanked) plus the choice score, the
# free-text score and their sum.
mas_csi = (
    pd_rdf.fillna('')
    .loc[:, ['ma_name', 'ms_name', 'caseid']]
    .assign(
        csi_score=pd_rdf.apply(score_choices, axis=1, args=(csi_scores,)),
        csi_text_score=pd_rdf.apply(score_text, axis=1, args=(csi_texts,)),
    )
    .assign(total_score=lambda scored: scored['csi_score'] + scored['csi_text_score'])
)

plot_df(mas_csi)

## Total Score

In [0]:
# Give each section's `total_score` its own column name so the four frames
# can sit side by side.
mas_mdpc_short, mas_csi_short, mas_clean_short, mas_tsma_short = (
    frame.rename(columns={'total_score': section_col})
    for frame, section_col in [
        (mas_mdpc, 'score_mdpc'),
        (mas_csi, 'score_csi'),
        (mas_clean, 'score_clean'),
        (mas_tsma, 'score_tsma'),
    ]
)

# Start from the mdpc frame (it supplies the name columns) and inner-join the
# other three section scores on `caseid`.
df_merged = mas_mdpc_short[['caseid', 'ma_name', 'ms_name', 'score_mdpc']]
for other, score_col in [
    (mas_csi_short, 'score_csi'),
    (mas_clean_short, 'score_clean'),
    (mas_tsma_short, 'score_tsma'),
]:
    df_merged = df_merged.merge(other[['caseid', score_col]], on='caseid', how='inner')

# Combined score: each section contributes with equal weight.
section_weights = {
    'score_mdpc': 0.25,
    'score_csi': 0.25,
    'score_clean': 0.25,
    'score_tsma': 0.25,
}
df_merged['total_score'] = sum(
    weight * df_merged[col] for col, weight in section_weights.items()
)

# Fix the column order for display.
total_df = df_merged[['caseid', 'ma_name', 'ms_name', 'score_mdpc', 'score_csi', 'score_clean', 'score_tsma', 'total_score']]

total_df.sort_values('total_score', ascending=False)

## Legacy

In [0]:
# Scoring for this section: answer code -> points, keyed by the question name
# without its system prefix.
answer_scores = {
    'meeting_needs': {
        'fully': 2,
        'partially': 1,
        'minimally': 0,
    },
    'updates': {
        'yes_maintenance': 2,
        'yes_maintenance_and_improvements': 3,
        'no_but_adds_regulation': 1,
        'not_at_all': 0,
    },
    'api_data_transmission': {
        'yes': 2,
        'no': 0,
        'no_but_pursuing': 1,
        'does_not_know': 0,
    },
    'barriers_to_api_implementation': {
        'lack_of_technical_capacity': -1,
        'budget_constraints': -1,
        'regulatory_restrictions': -1,
        'legacy_issues': -1,
        'security_concerns': -1,
        'satisfied': -1,
        'other': -1,
    },
}

# boost score for having an answer, assuming this means more connections
text_boosts = {
    'regional_national_integration_existing': 1,
    'eu_integration_existing': 1,
    'regional_national_integration_wants': 1,
    'eu_integration_wants': 1,
}

# assume this means more impediments
text_impediments = {
    'other_api_barriers': -1,
}

In [0]:
# --- Score and count answers for the section ---
mas = rdf.toPandas().fillna('').loc[:,['ma_name','ms_name','caseid']]
dicts = [answer_scores, text_boosts, text_impediments]
all_questions = qdf['column_name'].tolist()

# Only the system-specific questions are scored here.
system_questions = [
    q for q in all_questions
    if any(marker in q.lower() for marker in ('mis_', 'e_cohesion_'))
]
mas['system_score & total_questions'] = mas.apply(score, axis=1, args=(system_questions, dicts))

# --- unpack answers and counts to get final score
score_pairs = mas['system_score & total_questions']
mas['total_system_score'] = score_pairs.map(lambda pair: pair[0])
mas['total_questions'] = score_pairs.map(lambda pair: pair[1])
mas['system_score'] = mas['total_system_score'].div(mas['total_questions'])

In [0]:
import pandas as pd

# Rank the rows from highest to lowest system_score
mas_sorted = mas.sort_values(by='system_score', ascending=False)

# Fraction of rows to keep, and the row count it corresponds to
# (int() truncates, so fewer than 10 rows gives 0)
percentile = 0.10  # top 10%
top_n = int(len(mas_sorted) * percentile)

# The first top_n rows of the ranked frame
top_df = mas_sorted.iloc[:top_n]

top_df

In [0]:
import matplotlib.pyplot as plt

# Rows with any missing value are dropped; ascending order puts the highest
# score at the top of the horizontal bar chart.
mas_sorted = mas.dropna().sort_values(by='system_score', ascending=True)

fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(mas_sorted['ma_name'], mas_sorted['system_score'])
ax.set_xlabel('System Score')
ax.set_ylabel('Managing Authority (ma_name)')
ax.set_title('System Score by Managing Authority')
fig.tight_layout()
plt.show()


In [0]:
import matplotlib.pyplot as plt

# Drop missing scores once so the histogram and the percentile markers are
# computed from the same values. np.percentile returns NaN when its input
# contains NaN, which previously made every marker and label NaN.
system_scores = mas['system_score'].dropna()

plt.figure(figsize=(8, 5))
plt.hist(system_scores, bins=20, edgecolor='black')

# One dashed vertical line per percentile, labelled with its value.
for p in [25, 50, 75, 90]:
    val = np.percentile(system_scores, p)
    plt.axvline(val, color='r', linestyle='--', label=f'{p}th percentile: {val:.1f}')

plt.xlabel('System Score')
plt.ylabel('Count')
plt.title('System Score Histogram with Percentiles')
plt.legend()
plt.tight_layout()
plt.show()