In [7]:
# One-time setup (uncomment and run if sodapy is not installed): %pip install sodapy


In [1]:
from sodapy import Socrata
import pandas as pd


In [2]:
# Connection settings for the CDC open-data portal (a Socrata-hosted site).
# `app_token` is optional: leave it as None, or put your own application
# token here if you have one.
domain = "data.cdc.gov"
app_token = None

# Socrata API client used by the cells below
client = Socrata(domain, app_token)




In [3]:
# Retrieve the catalog metadata for all datasets on the configured domain.
# Later cells iterate over this result and treat each entry as a dict with a
# nested 'resource' section (id, name, description, ...).
datasets_metadata = client.datasets()


In [4]:
# Pick the first catalog entry whose resource id is 'dttw-5yxu';
# dataset_structure is None when no entry matches.
dataset_structure = next(
    filter(
        lambda entry: entry.get('resource', {}).get('id') == 'dttw-5yxu',
        datasets_metadata,
    ),
    None,
)
dataset_structure

{'resource': {'name': 'Behavioral Risk Factor Surveillance System (BRFSS) Prevalence Data (2011 to present)',
  'id': 'dttw-5yxu',
  'resource_name': None,
  'parent_fxf': [],
  'description': '2011 to present. BRFSS combined land line and cell phone prevalence data. BRFSS is a continuous, state-based surveillance system that collects information about modifiable risk factors for chronic diseases and other leading causes of death. Data will be updated annually as it becomes available. Detailed information on sampling methodology and quality assurance can be found on the BRFSS website (http://www.cdc.gov/brfss). Methodology: http://www.cdc.gov/brfss/factsheets/pdf/DBS_BRFSS_survey.pdf Glossary: https://data.cdc.gov/Behavioral-Risk-Factors/Behavioral-Risk-Factor-Surveillance-System-BRFSS-H/iuq5-y9ct',
  'attribution': 'Centers for disease control and prevention',
  'attribution_link': 'http://www.cdc.gov/brfss',
  'contact_email': 'PublicInquiriesDPH@cdc.gov',
  'type': 'dataset',
  'upd

In [5]:


# Flatten the catalog metadata in datasets_metadata (one nested dict per
# dataset) into a table: one row per dataset, nested keys become dotted
# column names such as 'resource.id'.
# `or []` keeps this cell working when the catalog is empty or None:
# json_normalize then yields an empty frame instead of leaving dataset_df
# undefined for the statements below.
dataset_df = pd.json_normalize(datasets_metadata or [])

if dataset_df.empty:
    print("No dataset structure found.")


def _count_columns(names):
    """Return the number of column names listed for a dataset (0 if not a list)."""
    # A dataset without a column list shows up as NaN (a float) after
    # normalisation, and len() on a float raises TypeError.
    return len(names) if isinstance(names, (list, tuple)) else 0


# Number of columns each dataset exposes, derived from its list of column names
if 'resource.columns_name' in dataset_df.columns:
    dataset_df['column_count'] = dataset_df['resource.columns_name'].apply(_count_columns)
else:
    # Nothing to count (e.g. empty catalog); still create the column so that
    # code selecting 'column_count' does not fail on a missing key.
    dataset_df['column_count'] = 0

dataset_df


Unnamed: 0,permalink,link,resource.name,resource.id,resource.resource_name,resource.parent_fxf,resource.description,resource.attribution,resource.attribution_link,resource.contact_email,...,metadata.access_points.application/html,metadata.access_points.HTML,metadata.access_points..exe,metadata.access_points..exe .zip,metadata.access_points.html,metadata.access_points..ZIP,metadata.access_points..zip,metadata.access_points..txt,metadata.access_points.map-server,column_count
0,https://data.cdc.gov/d/9bhg-hcku,https://data.cdc.gov/NCHS/Provisional-COVID-19...,Provisional COVID-19 Deaths by Sex and Age,9bhg-hcku,,[],"Effective September 27, 2023, this dataset wil...",NCHS/DVS,https://www.cdc.gov/nchs/nvss/vsrr/covid_weekl...,cdcinfo@cdc.gov,...,,,,,,,,,,16
1,https://data.cdc.gov/d/nr4s-juj3,https://data.cdc.gov/NCHS/Provisional-COVID-19...,Provisional COVID-19 Deaths: Focus on Ages 0-1...,nr4s-juj3,,[],"Effective June 28, 2023, this dataset will no ...",NCHS/DVS,https://www.cdc.gov/nchs/nvss/vsrr/covid19/ind...,cdcinfo@cdc.gov,...,,,,,,,,,,8
2,https://data.cdc.gov/d/vbim-akqf,https://data.cdc.gov/Case-Surveillance/COVID-1...,COVID-19 Case Surveillance Public Use Data,vbim-akqf,,[],<b>Note:</b>\nReporting of new COVID-19 Case S...,"CDC Data, Analytics and Visualization Task Force",https://www.cdc.gov/coronavirus/2019-ncov/case...,cdcinfo@cdc.gov,...,,,,,,,,,,12
3,https://data.cdc.gov/d/hk9y-quqm,https://data.cdc.gov/NCHS/Conditions-Contribut...,"Conditions Contributing to COVID-19 Deaths, by...",hk9y-quqm,,[],"Effective September 27, 2023, this dataset wil...",NCHS/DVS,https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/,cdcinfo@cdc.gov,...,,,,,,,,,,14
4,https://data.cdc.gov/d/w9zu-fywh,https://data.cdc.gov/Vaccinations/COVID-19-Vac...,COVID-19 Vaccine Distribution Allocations by J...,w9zu-fywh,,[],New weekly allocations of doses are posted eve...,HHS ASPA,,media@hhs.gov,...,,,,,,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,https://data.cdc.gov/d/8xdf-byx4,https://data.cdc.gov/National-Center-for-HIV-V...,Tuberculosis Epidemiologic Studies Consortium ...,8xdf-byx4,,[],This study used a mixed-methods approach to id...,,,tbesc3@cdc.gov,...,,,,,,,,,,0
1334,https://data.cdc.gov/d/3ix4-m2h6,https://data.cdc.gov/National-Institute-for-Oc...,The ETS domain-containing hematopoietic transc...,3ix4-m2h6,,[],Fibrogenic multi-walled carbon nanotubes (MWCN...,,,sa-cin-webteam@cdc.gov,...,,,,,,,,,,0
1335,https://data.cdc.gov/d/36v4-xsse,https://data.cdc.gov/NCHS/National-Post-acute-...,National Post-acute and Long-term Care Study A...,36v4-xsse,,[],The main goals of the National Post-acute and ...,NCHS/DHCS,,cdcinfo@cdc.gov,...,,,,,https://www.cdc.gov/rdc/application-process/su...,,,,,0
1336,https://data.cdc.gov/d/eu9y-rrqg,https://data.cdc.gov/NCHS/National-Post-acute-...,National Post-acute and Long-term Care Study R...,eu9y-rrqg,,[],The main goals of the National Post-acute and ...,NCHS/DHCS,,cdcinfo@cdc.gov,...,,,,,https://www.cdc.gov/rdc/application-process/su...,,,,,0


In [6]:
from datetime import datetime, timezone

# The catalog timestamps are UTC (they end in 'Z'), so take "this year" in UTC
# as well; using the local clock could pick a different year around New Year.
current_year = datetime.now(timezone.utc).year
updated_at = pd.to_datetime(dataset_df['resource.data_updated_at'], utc=True)

# Columns kept in the summary table, in display order
summary_columns = [
    'resource.data_updated_at',
    'permalink',
    'resource.id',
    'resource.name',
    'resource.columns_name',
    'column_count',
    'resource.description',
    'resource.download_count',
    'resource.page_views.page_views_last_week',
]

dataset_filtered = (
    dataset_df
    # Keep only datasets whose data was updated this year
    # (rows with a missing timestamp compare False and are dropped)
    .loc[updated_at.dt.year == current_year]
    # Sort by download count in descending order (most downloaded first)
    .sort_values(by='resource.download_count', ascending=False)
    .loc[:, summary_columns]
)
dataset_filtered

Unnamed: 0,resource.data_updated_at,permalink,resource.id,resource.name,resource.columns_name,column_count,resource.description,resource.download_count,resource.page_views.page_views_last_week
9,2025-03-27T14:57:56.000Z,https://data.cdc.gov/d/r8kw-7aab,r8kw-7aab,Provisional COVID-19 Death Counts by Week Endi...,"[Data as of, Start Date, End Date, Group, Year...",17,"Effective September 27, 2023, this dataset wil...",631298,688
42,2025-01-30T23:14:15.000Z,https://data.cdc.gov/d/hn4x-zwk7,hn4x-zwk7,"Nutrition, Physical Activity, and Obesity - Be...","[StratificationCategory1, StratificationID1, S...",37,"This dataset includes data on adult's diet, ph...",119428,468
51,2025-03-13T13:53:37.000Z,https://data.cdc.gov/d/xkb8-kh2a,xkb8-kh2a,VSRR Provisional Drug Overdose Death Counts,"[State Name, Data Value, Indicator, State, Foo...",12,This data presents provisional counts for drug...,117228,229
8,2025-03-21T01:53:57.000Z,https://data.cdc.gov/d/kh8y-3es6,kh8y-3es6,HHS Provider Relief Fund,"[Provider Name, Payment, City, State]",4,HHS is providing support to healthcare provide...,101499,360
71,2025-02-14T17:19:02.000Z,https://data.cdc.gov/d/hfr9-rurv,hfr9-rurv,Alzheimer's Disease and Healthy Aging Data,"[StratificationCategoryID2, StratificationCate...",33,2015-2022. This data set contains data from BR...,85663,347
...,...,...,...,...,...,...,...,...,...
1331,2025-02-18T17:17:45.000Z,https://data.cdc.gov/d/3g9b-sxea,3g9b-sxea,Evaluation of A Passive Back-Support Exoskelet...,[],0,The objective of this study was to evaluate th...,10,1
1334,2025-02-18T20:10:05.000Z,https://data.cdc.gov/d/3ix4-m2h6,3ix4-m2h6,The ETS domain-containing hematopoietic transc...,[],0,Fibrogenic multi-walled carbon nanotubes (MWCN...,9,1
1330,2025-02-18T18:17:20.000Z,https://data.cdc.gov/d/9bkd-2ag5,9bkd-2ag5,State and Local Area Integrated Telephone Surv...,[],0,The State and Local Area Integrated Telephone ...,0,5
1335,2025-02-20T20:25:40.000Z,https://data.cdc.gov/d/36v4-xsse,36v4-xsse,National Post-acute and Long-term Care Study A...,[],0,The main goals of the National Post-acute and ...,0,9


In [16]:
# Socrata identifier of the dataset to pull (the short ID only, not the full URL)
dataset_id = '55yu-xksw'

# Request the dataset's rows; `limit` caps how many are fetched, raise it as needed
results = client.get(dataset_id, limit=1000)

# Build a pandas DataFrame from the returned records
dataset = pd.DataFrame.from_records(results)

# Preview the first few rows
dataset.head()

Unnamed: 0,year,locationabbr,locationdesc,geographiclevel,datasource,class,topic,data_value,data_value_unit,data_value_type,...,stratification2,topicid,locationid,y_lat,x_lon,georeference,:@computed_region_skr5_azej,:@computed_region_hjsp_umg2,data_value_footnote_symbol,data_value_footnote
0,2020,AK,Denali,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,348.8,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,White,T2,2068,63.67881971,-149.9608012,"{'type': 'Point', 'coordinates': [-149.9608012...",1671,51,,
1,2020,CA,California,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,230.1,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,More than one race,T2,6,37.2414,-119.601,"{'type': 'Point', 'coordinates': [-119.601, 37...",1196,8,,
2,2020,CO,Park County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,135.9,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,White,T2,8093,39.11561621,-105.7086982,"{'type': 'Point', 'coordinates': [-105.7086982...",1062,9,,
3,2020,FL,Walton County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,126.5,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Asian,T2,12131,30.65596581,-86.15745736,"{'type': 'Point', 'coordinates': [-86.15745736...",1093,30,,
4,2020,GA,Whitfield County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,155.1,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Hispanic,T2,13313,34.80385386,-84.96211184,"{'type': 'Point', 'coordinates': [-84.96211184...",1620,31,,


In [17]:
# Look up the catalog entry for the dataset downloaded above (matched on its
# resource id); dataset_structure is None when the catalog has no such entry.
dataset_structure = next(
    (
        entry
        for entry in datasets_metadata
        if entry.get('resource', {}).get('id') == dataset_id
    ),
    None,
)
dataset_structure

{'resource': {'name': 'Heart Disease Mortality Data Among US Adults (35+) by State/Territory and County – 2019-2021',
  'id': '55yu-xksw',
  'resource_name': None,
  'parent_fxf': [],
  'description': '2019 to 2021, 3-year average. Rates are age-standardized. County rates are spatially smoothed. The data can be viewed by sex and race/ethnicity. Data source: National Vital Statistics System. Additional data, maps, and methodology can be viewed on the Interactive Atlas of Heart Disease and Stroke https://www.cdc.gov/heart-disease-stroke-atlas/about/index.html',
  'attribution': 'CDC Division for Heart Disease and Stroke Prevention, Interactive Atlas of Heart Disease and Stroke',
  'attribution_link': 'https://www.cdc.gov/heart-disease-stroke-atlas/about/index.html',
  'contact_email': 'dhdsprequests@cdc.gov',
  'type': 'dataset',
  'updatedAt': '2025-02-04T19:31:00.000Z',
  'createdAt': '2024-02-16T22:01:02.000Z',
  'metadata_updated_at': '2025-02-04T19:31:00.000Z',
  'data_updated_at': 