# Web Scraping the Centers for Medicare & Medicaid Services Website to Group MS-DRG Codes

###### CMS Website
###### https://www.cms.gov/ICD10Manual/version33-fullcode-cms/fullcode_cms/P0002.html

In [None]:
# import dependencies
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [None]:
# Index page on the CMS site that links out to every individual MS-DRG page.
index_url = (
    "https://www.cms.gov/ICD10Manual/"
    "version33-fullcode-cms/fullcode_cms/P0002.html"
)

In [None]:
# Scrape the index page: for every MS-DRG link collect its absolute url,
# its MS-DRG number and its description into three parallel lists
# (MSDRG_urls, MSDRG_nums, MSDRG_descr).

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the index.
response = requests.get(index_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
results = soup.find_all('a', class_="compl")

url_begin = "https://www.cms.gov/ICD10Manual/version33-fullcode-cms/fullcode_cms/"

# BeautifulSoup has already decoded "&nbsp;" entities into "\xa0" characters by
# the time .text is read, so the number/description separator in the link text
# is three non-breaking spaces, not the literal entity string.
separator = "\xa0\xa0\xa0"

MSDRG_urls = []
MSDRG_nums = []
MSDRG_descr = []
for result in results:

    MSDRG_text = result.text

    # MS-DRG number: the second whitespace-separated token of the link text
    text_split = MSDRG_text.split()

    # MS-DRG description: the piece that follows the first separator
    text_list = MSDRG_text.split(separator)

    # Validate before appending anything so the three lists always stay the
    # same length, and fail with the offending text instead of a bare IndexError.
    if len(text_split) < 2 or len(text_list) < 2:
        raise ValueError(
            "Unexpected MS-DRG link text on index page: {!r}".format(MSDRG_text)
        )

    MSDRG_urls.append(url_begin + result['href'])
    MSDRG_nums.append(text_split[1])
    MSDRG_descr.append(text_list[1])

In [None]:
# Scrape each individual MS-DRG page and collect, in the same order as
# MSDRG_urls, the text of its first two <td class="heading"> cells:
# the diagnosis category (DX_cats) and the general description (DX_gen_descr).
DX_cats = []
DX_gen_descr = []

# A single Session is reused for all pages so the connection to the host can be
# kept alive instead of being re-established for every MS-DRG url.
with requests.Session() as session:
    for url in MSDRG_urls:
        # The timeout prevents one stalled page from blocking the whole run;
        # raise_for_status() avoids parsing an HTTP error page as MS-DRG data.
        response = session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        results = soup.find_all('td', class_="heading")

        # Both headings are required; name the failing url rather than letting
        # an anonymous IndexError surface from results[1].
        if len(results) < 2:
            raise ValueError(
                "Expected at least 2 'heading' cells at {}, found {}".format(
                    url, len(results)
                )
            )

        DX_cats.append(results[0].text)
        DX_gen_descr.append(results[1].text)

In [None]:
# Gather the scraped lists into a column-name -> values mapping.
# Keyword order fixes the column order of the resulting table.
data = dict(
    MSDRG_num=MSDRG_nums,
    MSDRG_cat=DX_cats,
    MSDRG_gen_descr=DX_gen_descr,
    MSDRG_descr=MSDRG_descr,
)

In [None]:
# Build the pandas data frame; each key of the data dictionary becomes a column.
CMS_MSDRG = pd.DataFrame(data)

In [None]:
# Write the data frame out as a csv file, leaving out the row index column.
CMS_MSDRG.to_csv(
    path_or_buf="../Datasets/CMS_MSDRG.csv",
    index=False,
)