In [1]:
import datetime as dt
import io
import os
import random
import string

import datapackage
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Planned columns for the pharmacies table:
# ID - To be generated in MySQL
# Name
# State
# In Network

In [3]:
# create DataFrame
# starts empty; the Pharmacy_Name, State and Network_Status columns are attached
# further down, once each of the three lists has been built
pharmacies = pd.DataFrame()

In [4]:
# 1. name (pharmacy names will consist of common US surnames and the word "PHARMACY")

# scrape webpage to import 1000 most common surnames in the US as a data frame, df
url = 'https://names.mongabay.com/data/1000.html'
# bound the wait so a stalled server cannot hang the notebook
response = requests.get(url, timeout=30)
# stop here on an HTTP error instead of handing an error page to read_html
response.raise_for_status()
html = response.content
# read_html is given a file-like object: passing the raw markup directly is
# deprecated in recent pandas releases
df_list = pd.read_html(io.BytesIO(html))
df = df_list[0]

# get surname column as a list, deleting index row, adding additional name to get to 1000 names
last = list(df[0]) + ["Castaway"]
del last[0]

# add "PHARMACY" to each item in the list, capitalizing each item
last = [surname.upper() + " PHARMACY" for surname in last]

In [21]:
# 2. state

# The 50 US states by full name, in alphabetical order. The order matters:
# state_weights is matched to this list position by position.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
    "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

# create weight function, code adapted from (https://www.python-course.eu/weighted_choice_and_sample.php)
def weighted_choice(objects, weights):
    """ returns a random element from the sequence of 'objects',
        the likelihood of the objects is weighted according
        to the sequence of 'weights', i.e. percentages.

        objects -- indexable sequence of candidates
        weights -- relative weights, one per object; they do not have to
                   sum to 1 because they are normalized here

        Consumes exactly one draw from the global 'random' generator.

        Raises ValueError when 'objects' and 'weights' differ in length or
        when the weights do not add up to a positive, finite number."""

    weights = np.array(weights, dtype=np.float64)
    if len(objects) != len(weights):
        raise ValueError("objects and weights must have the same length")

    sum_of_weights = weights.sum()
    if not (np.isfinite(sum_of_weights) and sum_of_weights > 0):
        raise ValueError("weights must add up to a positive, finite number")

    # standardization, then running totals: entry i is the upper bound of
    # the slice of [0, 1) that belongs to objects[i]
    cumulative = (weights * (1 / sum_of_weights)).cumsum()

    x = random.random()
    for i, upper_bound in enumerate(cumulative):
        if x < upper_bound:
            return objects[i]

    # Rounding can leave the last running total a hair below 1, so a draw at
    # the very top of the range matches no slice. Return the last object that
    # has a positive weight instead of falling through and returning None.
    last_positive = int(np.flatnonzero(weights > 0)[-1])
    return objects[last_positive]
        
# state weights: one relative weight per entry of `states`, in the same order.
# A higher weight means more pharmacies will be drawn from that state.
# NE states were given higher weights.
state_weights = [
    3, 3, 6, 4, 20,      # Alabama .. California
    10, 7, 5, 10, 4,     # Colorado .. Georgia
    1, 1, 10, 5, 3,      # Hawaii .. Iowa
    2, 8, 3, 8, 15,      # Kansas .. Maryland
    30, 5, 5, 5, 5,      # Massachusetts .. Missouri
    5, 2, 2, 6, 30,      # Montana .. New Jersey
    2, 50, 20, 1, 20,    # New Mexico .. Ohio
    1, 1, 40, 6, 6,      # Oklahoma .. South Carolina
    1, 4, 10, 1, 6,      # South Dakota .. Vermont
    25, 5, 5, 4, 1,      # Virginia .. Wyoming
]

# build the State value for each of the 1000 pharmacies:
#   - the first 100 entries are every state listed twice; these rows fall in the
#     in-network part of the frame, so each state gets two in-network pharmacies
#   - the remaining 900 entries are drawn at random according to state_weights
pharm_state = states + states
pharm_state.extend(weighted_choice(states, state_weights) for _ in range(1000 - 100))


In [44]:
# 3. in-network

# network status per pharmacy: the first 900 rows are in-network,
# the last 100 rows are out-of-network
network = ["In-network"] * 900 + ["Out-of-network"] * 100

In [45]:
# attach the three generated lists to the dataframe, one column each
# (the dict keeps insertion order, so the columns are added in this order)
new_columns = {
    'Pharmacy_Name': last,
    "State": pharm_state,
    'Network_Status': network,
}
for col_name, col_values in new_columns.items():
    pharmacies[col_name] = col_values

In [21]:
# SQL create table and insert
from sqlalchemy import create_engine
import pymysql  # the driver named by the 'mysql+pymysql' URL scheme below

# Connection URL: read from the HEALTH_COMPANY_DB_URL environment variable when
# it is set, so real credentials do not have to be typed into (and saved with)
# the notebook. Without it, the placeholder below is used and must be edited by hand.
db_url = os.environ.get('HEALTH_COMPANY_DB_URL', 'mysql+pymysql://USER:PASSWORD@HOST')
engine = create_engine(db_url)

In [2]:
# use the health_company database
# NOTE(review): the engine URL names no database, so the statements that follow
# depend on this USE having been issued on the connection they run on — confirm
# that the same pooled connection is reused, or put the database in the engine URL
engine.execute('USE health_company;')

In [1]:
# create pharmacies table
# - Pharmacy_ID is generated by MySQL, numbering from 300000
# - State holds full state names (the longest written by this notebook,
#   "North Carolina" / "South Carolina", is 14 characters), so it must be wider
#   than a 2-letter code column
engine.execute('CREATE TABLE pharmacies (\
    Pharmacy_ID INT NOT NULL AUTO_INCREMENT,\
    Pharmacy_Name VARCHAR(50),\
    State VARCHAR(20),\
    Network_Status VARCHAR(50),\
    PRIMARY KEY(Pharmacy_ID)\
    ) AUTO_INCREMENT = 300000;')

In [52]:
# append the generated rows to the existing pharmacies table; the dataframe
# index is not written, leaving Pharmacy_ID to MySQL's AUTO_INCREMENT
pharmacies.to_sql(
    name='pharmacies',
    con=engine,
    if_exists='append',
    index=False,
)

In [26]:
# close connection
# (last step: run only after the rows have been written to the pharmacies table)
engine.dispose()