In [4]:
from bs4 import BeautifulSoup
import datapackage
import datetime as dt
import numpy as np
import pandas as pd
import random 
import requests
import string

In [5]:
# columns
# ID - To be generated in MySQL
# Name
# State
# Type
# In Network

In [6]:
# create DataFrame
providers = pd.DataFrame()

In [7]:
# 1. types 
types = ['Behavioral Health', 'Cardiology', 'Dental', 'Dermatology', 'Durable Medical Equipment',\
         'Hospital', 'Internal Medicine', 'Neurology', 'OBGYN', 'Oncology', 'Orthodontics',\
         'Pediatrics', 'Physical Therapy', 'Primary Care', 'Urgent Care', 'Vision']


# capitalize each item in types list
provider_types = [i.upper() for i in types]

In [39]:
# 2. state/type colmns (company operates in the entire US, but primarily in the NE)
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

# create weight function, code adapted from (https://www.python-course.eu/weighted_choice_and_sample.php)
def weighted_choice(objects, weights):
    """ returns a random element from the sequence of 'objects', 
        the likelihood of the objects is weighted according 
        to the sequence of 'weights', i.e. percentages."""

    weights = np.array(weights, dtype=np.float64)
    sum_of_weights = weights.sum()
    # standardization:
    np.multiply(weights, 1 / sum_of_weights, weights)
    weights = weights.cumsum()
    x = random.random()
    for i in range(len(weights)):
        if x < weights[i]:
            return objects[i]
        
# state weights, where each number represents the weight attached to each state in the data set. Higher weights
# mean that more providers will be from that state. NE states were given higher weights
state_weights = [3, 3, 6, 4, 20, 10, 7, 5, 10, 4, 
          1, 1, 10, 5, 3, 2, 8, 3, 8, 15, 
          30, 5, 5, 5, 5, 5, 2, 2, 6, 30, 
          2, 50, 20, 1, 20, 1, 1, 40, 6, 6, 
          1, 4, 10, 1, 6, 25, 5, 5, 4, 1]


In [42]:
# create state_type list, where the first operation ensures that each state has one of each type of provider,
# while the second fills out the last of the 3000 with random states based on calculated weights

# create a state_type list 
state_type = [f"{i} {j}" for i in states for j in types] 
state_type += [f"{weighted_choice(states, state_weights)} {random.choice(types)}" for i in range(3000-(len(state_type)))]

In [43]:
# create states list to be added to dataframe
# states = [i.split()[0] for i in state_type]

In [54]:
# create a state_type list for one and two-word states to be combined and added to dataframe
states2 = [i.split()[0] + ' ' + i.split()[1] for i in state_type if i.split()[0] + ' ' + i.split()[1] in states]
states1 = [i.split()[0] for i in state_type if i.split()[0] + ' ' + i.split()[1] not in states]
states_list = states2 + states1

In [69]:
# create a provider_type list for one and two-word states to be combined and added to dataframe
provider_types2 = [i.split()[2:] for i in state_type if i.split()[0] + ' ' + i.split()[1] in states2]
provider_types1 = [i.split()[1:] for i in state_type if i.split()[0] + ' ' + i.split()[1] not in states2]
provider_types = provider_types2 + provider_types1

In [70]:
# join strings in provider_types list 
provider_types = [' '.join(i) for i in provider_types]

In [72]:
# 3. Names (taken from US city names combined with various types of medicine)

# we will create in-network provider names using US city names attached to a provider type.
# city data pulled from https://datahub.io/core/country-list/r/0.html

# US city names
cities = pd.read_csv('world-cities_csv.csv')
us_cities = cities.query('country=="United States"')
us_cities = list(us_cities['name'])
# use a set to prevent duplicate entries, then convert to list
us_cities = {i.upper().split()[0] for i in us_cities if "-" not in i}
us_cities = list(us_cities)

# combine both city names to provider_types list to get provider names
names = [f"{random.choice(us_cities)} {provider_types[i].upper()}" for i in range(3000)]

In [73]:
# insert lists into dataframe
providers["Provider_Name"] = names
providers["State"] = states_list
providers["Type"] = provider_types

In [79]:
# sort dataframe and reset index
providers = providers.sort_values(by=['State', "Type"])

In [90]:
# 4. in-network

# 90% of providers are in network, 10% out of network
network = [weighted_choice(["In-network", "Out-of-network"], [0.9,0.1]) for i in range(3000)]

In [91]:
# insert network list into dataframe
providers['Network_Status'] = network

In [93]:
providers

Unnamed: 0,Provider_Name,State,Type,Network_Status
825,OPELIKA BEHAVIORAL HEALTH,Alabama,Behavioral Health,Out-of-network
2602,DALTON BEHAVIORAL HEALTH,Alabama,Behavioral Health,In-network
826,SPARKS CARDIOLOGY,Alabama,Cardiology,In-network
827,PORT DENTAL,Alabama,Dental,In-network
2431,UNION DENTAL,Alabama,Dental,Out-of-network
828,ERIE DERMATOLOGY,Alabama,Dermatology,In-network
829,MASSAPEQUA DURABLE MEDICAL EQUIPMENT,Alabama,Durable Medical Equipment,In-network
830,MARYLAND HOSPITAL,Alabama,Hospital,In-network
831,CANTONMENT INTERNAL MEDICINE,Alabama,Internal Medicine,In-network
832,MIDWEST NEUROLOGY,Alabama,Neurology,In-network


In [94]:
# SQL create table and insert
from sqlalchemy import create_engine
import pymysql

engine = create_engine('mysql+pymysql://root:funny20!@#@localhost')

In [95]:
# use the health_company database
engine.execute('USE health_company;')

<sqlalchemy.engine.result.ResultProxy at 0x1194c94e0>

In [98]:
# create providers table (CHANGE ORDER OF STATE ADN TYPE)
engine.execute('CREATE TABLE providers (\
	Provider_ID INT NOT NULL AUTO_INCREMENT,\
	Provider_Name VARCHAR(50),\
    State VARCHAR(20),\
    Type VARCHAR(50),\
    Network_Status VARCHAR(50),\
    PRIMARY KEY(Provider_ID)\
    ) AUTO_INCREMENT = 200000;')

<sqlalchemy.engine.result.ResultProxy at 0x11a880710>

In [99]:
# show tables to verify table creation
engine.execute('SHOW TABLES').fetchall()

[('drugs',),
 ('members',),
 ('pharmacies',),
 ('procedure_claims',),
 ('procedures',),
 ('providers',),
 ('rx_claims',),
 ('states',)]

In [100]:
# describe pharmacy table to verify columns
engine.execute('DESC providers;').fetchall()

[('Provider_ID', 'int', 'NO', 'PRI', None, 'auto_increment'),
 ('Provider_Name', 'varchar(50)', 'YES', '', None, ''),
 ('State', 'varchar(20)', 'YES', '', None, ''),
 ('Type', 'varchar(50)', 'YES', '', None, ''),
 ('Network_Status', 'varchar(50)', 'YES', '', None, '')]

In [101]:
# write to providers table
providers.to_sql('providers', con=engine, if_exists='append', index=False)

In [102]:
# select providers table
engine.execute('SELECT * FROM providers;').fetchall()

[(200000, 'OPELIKA BEHAVIORAL HEALTH', 'Alabama', 'Behavioral Health', 'Out-of-network'),
 (200001, 'DALTON BEHAVIORAL HEALTH', 'Alabama', 'Behavioral Health', 'In-network'),
 (200002, 'SPARKS CARDIOLOGY', 'Alabama', 'Cardiology', 'In-network'),
 (200003, 'PORT DENTAL', 'Alabama', 'Dental', 'In-network'),
 (200004, 'UNION DENTAL', 'Alabama', 'Dental', 'Out-of-network'),
 (200005, 'ERIE DERMATOLOGY', 'Alabama', 'Dermatology', 'In-network'),
 (200006, 'MASSAPEQUA DURABLE MEDICAL EQUIPMENT', 'Alabama', 'Durable Medical Equipment', 'In-network'),
 (200007, 'MARYLAND HOSPITAL', 'Alabama', 'Hospital', 'In-network'),
 (200008, 'CANTONMENT INTERNAL MEDICINE', 'Alabama', 'Internal Medicine', 'In-network'),
 (200009, 'MIDWEST NEUROLOGY', 'Alabama', 'Neurology', 'In-network'),
 (200010, 'KIRYAS NEUROLOGY', 'Alabama', 'Neurology', 'In-network'),
 (200011, 'CHESTERFIELD NEUROLOGY', 'Alabama', 'Neurology', 'In-network'),
 (200012, 'APPLETON NEUROLOGY', 'Alabama', 'Neurology', 'In-network'),
 (200013

In [97]:
# show tables to verify table creation
engine.execute('DROP TABLE providers')

InternalError: (pymysql.err.InternalError) (1051, "Unknown table 'health_company.providers'")
[SQL: DROP TABLE providers]
(Background on this error at: http://sqlalche.me/e/2j85)

In [103]:
engine.dispose()