# Parse UMD-CTIS dataset
Note: The UMD Global CTIS API is currently inactive. This code still calls it because it was written before the data were migrated. The data can instead be downloaded manually from https://www.icpsr.umich.edu/web/ICPSR/studies/39206/versions/V3.

In [None]:
import pandas as pd
import numpy as np
import datetime as DT
import requests
import json 
import warnings
import csv

# disable warnings
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Indicator names, grouped by family; the final list keeps the order
# general -> received_news_* -> trust_covid_info_*
general_indicators = [
    "mask",
    "worried_catch_covid",
    "belief_masking_effective",
]
received_news_indicators = [
    "received_news_local_health",
    "received_news_experts",
    "received_news_who",
    "received_news_govt_health",
    "received_news_politicians",
    "received_news_journalists",
    "received_news_friends",
]
trust_indicators = [
    "trust_covid_info_local_health",
    "trust_covid_info_experts",
    "trust_covid_info_who",
    "trust_covid_info_govt_health",
    "trust_covid_info_politicians",
    "trust_covid_info_journalists",
    "trust_covid_info_friends",
]
indicators = general_indicators + received_news_indicators + trust_indicators

# Maps each indicator name to the list of countries that have more than
# 300 rows for that indicator
indicator_country_map = {}

# Download the daily series of every indicator, cache it as a CSV, and record
# which countries have enough rows
for indicator in indicators:
    url = f"https://covidmap.umd.edu/apiv2/resources?indicator={indicator}&type=daily&country=all&region=all&daterange=20210521-20220625"

    # NOTE(review): TLS certificate verification is disabled (verify=False).
    # The timeout keeps the cell from hanging forever if the API is
    # unreachable; raise_for_status() turns an HTTP error into an explicit
    # exception instead of a confusing JSON/KeyError further down.
    http_response = requests.get(url, verify=False, timeout=60)
    http_response.raise_for_status()
    response = http_response.json()

    df = pd.DataFrame.from_dict(response['data'])
    # Unparseable dates become NaT instead of raising
    df["survey_date"] = pd.to_datetime(df['survey_date'], errors='coerce')
    df.to_csv(f'Data/all_{indicator}.csv', index=False)

    country_counts = df.groupby('country').size()     # count lines per country
    # Filter countries with more than 300 lines
    countries_over_300 = country_counts[country_counts > 300].index.tolist()
    indicator_country_map[indicator] = countries_over_300

# Report, per indicator, how many countries passed the >300-line filter
print("\n===> Countries with more than 300 lines for each indicator:")
for name in indicator_country_map:
    kept = indicator_country_map[name]
    print(f"{name}: {len(kept)} paesi -> {kept}")

# Keep only the countries that appear in every indicator's list
country_sets = [set(kept) for kept in indicator_country_map.values()]
common_countries = set.intersection(*country_sets)
print("===> Final list of countries with >300 lines:")
print(sorted(common_countries))



In [None]:
# Countries to drop ("da eliminare") because at least one indicator has a
# hole longer than 7 days between consecutive survey dates
da_eliminare = []

# For every indicator: download the series, keep the common countries, write
# a per-indicator CSV, and flag countries with long gaps
for indicator in indicators:

    url = f"https://covidmap.umd.edu/apiv2/resources?indicator={indicator}&type=daily&country=all&region=all&daterange=20210521-20220625"
    # NOTE(review): TLS certificate verification is disabled (verify=False).
    # The timeout avoids hanging forever on an unreachable API, and
    # raise_for_status() makes an HTTP error explicit.
    http_response = requests.get(url, verify=False, timeout=60)
    http_response.raise_for_status()
    response = http_response.json()
    df = pd.DataFrame.from_dict(response['data'])
    # Unparseable dates become NaT instead of raising
    df["survey_date"] = pd.to_datetime(df['survey_date'], errors='coerce')

    # Filter common countries
    df_common = df[df['country'].isin(common_countries)]

    # Save dataframe into CSV
    # NOTE(review): only the 'country' column is written here - confirm that
    # dropping the other columns is intended.
    df_common[['country']].to_csv(f'Data/common_{indicator}.csv', index=False)

    # Verify if there are data holes longer than 7 days
    for country, country_df in df_common.groupby('country'):
        # Days elapsed between consecutive survey dates, in chronological order
        day_gaps = country_df['survey_date'].sort_values().diff().dt.days
        if (day_gaps > 7).any():
            da_eliminare.append(country)

# A country can be flagged by several indicators: keep each one once
da_eliminare = sorted(set(da_eliminare))

# Exclude the flagged countries from the common set (in place)
common_countries.difference_update(da_eliminare)

# Save included countries in CSV (sorted so the file is reproducible)
with open("csv/common_countries.csv", mode="w", newline='', encoding="utf-8") as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow(["country"])
    writer.writerows([country] for country in sorted(common_countries))

### For each country and date, save each indicator's sample size and pct
This creates the first preliminary dataset of the selection process associated with Tab 1.

In [None]:

# Load the cached per-indicator CSVs, trim them to the shared columns, and
# stack them into one long table
kept_columns = ['country', 'survey_date', 'indicator', 'sample_size', 'pct']
# Only restrict to the common countries when that name has been defined
restrict_to_common = 'common_countries' in locals()

dfs = []
for indicator in indicators:
    frame = pd.read_csv(f'Data/all_{indicator}.csv')
    frame['survey_date'] = pd.to_datetime(frame['survey_date'])
    frame = frame[kept_columns]

    if restrict_to_common:
        in_common = frame['country'].isin(common_countries)
        frame = frame[in_common]

    dfs.append(frame)

# Long-format table: one row per (country, date, indicator)
all_data = pd.concat(dfs)

# Reshape to wide format: one row per (country, survey_date), one column per
# (value, indicator) pair
wide_df = pd.pivot_table(
    all_data,
    values=['sample_size', 'pct'],
    index=['country', 'survey_date'],
    columns='indicator',
    aggfunc='first',  # keep the first value when duplicates exist
)

# Collapse the two-level column index into "<value>_<indicator>" names
wide_df.columns = [f"{measure}_{name}" for measure, name in wide_df.columns]

# Turn the (country, survey_date) index back into regular columns
wide_df.reset_index(inplace=True)

# Desired layout: the two key columns, then sample size and pct side by side
# for each indicator, in the order the indicators are listed
wanted_columns = ['country', 'survey_date'] + [
    column
    for indicator in indicators
    for column in (f'sample_size_{indicator}', f'pct_{indicator}')
]

# Drop any wanted column the wide table does not actually contain
available = set(wide_df.columns)
cols_order = [column for column in wanted_columns if column in available]
wide_df = wide_df.loc[:, cols_order]

# Write the final table to disk
wide_df.to_csv('csv/FULL_DATA1.csv', index=False)