In [2]:
# columns
# ID (to be created in SQL)
# member_id
# provider_id
# Code
# Type
# State
# network_status
# cost
# payout
# date_of_service

In [4]:
import datetime as dt
import numpy as np
import pandas as pd
import random 

In [5]:
# start from an empty frame; the merge steps further down rebind this name
procedure_claims = pd.DataFrame(data=None)

In [6]:
# 1. member_id and state

# SQL connection
from sqlalchemy import create_engine
import pymysql

# establish connection
# The URL is taken from the HEALTH_COMPANY_DB_URL environment variable when it
# is set, so real credentials never have to be typed into (and saved with) the
# notebook; otherwise the original placeholder URL is used unchanged.
import os

engine = create_engine(
    os.environ.get('HEALTH_COMPANY_DB_URL', 'mysql+pymysql://USER:PASSWORD@HOST')
)

In [1]:
# use health_company database
# Engine.execute() was removed in SQLAlchemy 2.0, so the statement is run on an
# explicitly checked-out connection, which works on both 1.x and 2.x.
# NOTE(review): USE only affects the connection it runs on; later queries rely
# on the pool handing that same connection back. Putting the database name in
# the engine URL would be more robust -- confirm before changing.
from sqlalchemy import text

with engine.connect() as conn:
    conn.execute(text("USE health_company;"))

In [41]:
# 1. pull the Member_ID and State columns of the MySQL 'members' table
members_query = "select Member_ID, State from members;"
df_member_id = pd.read_sql_query(members_query, con=engine)

In [57]:
# 2. provider_id, Network_Status, Type

# pull the Provider_ID, State, Network_Status and Type columns of the MySQL 'providers' table
providers_query = "select Provider_ID , State, Network_Status, Type from providers;"
df_prov_id = pd.read_sql_query(providers_query, con=engine)

In [58]:
# pair every member with the providers of the same State (inner join),
# then keep a random sample of 20000 of those pairs
procedure_claims = (
    df_member_id
    .merge(df_prov_id, how='inner', on="State")
    .sample(n=20000)
)

In [18]:
# 3. code, cost

# pull the Type, Code and Cost columns of the MySQL 'procedures' table
procedures_query = 'select Type, Code, Cost from procedures;'
df_codes_cost = pd.read_sql_query(procedures_query, con=engine)

In [64]:
# attach the procedures (Code, Cost) that share each row's Type,
# then keep a random sample of 20000 of the resulting rows
procedure_claims = (
    procedure_claims
    .merge(df_codes_cost, on="Type")
    .sample(n=20000)
)

In [23]:
# 4. make sure every claim row carries a Cost; the following cells reset the
# index and delete the old index column.
# The merge on "Type" above already brings Code and Cost into procedure_claims.
# Merging the full procedures frame again on "Code" would duplicate the shared
# Type and Cost columns as Type_x/Type_y and Cost_x/Cost_y, and the later
# lookups of 'Cost' and 'Type' would fail. So Cost is only joined in when it is
# missing, and only the Code/Cost pair is used for that join.
if "Cost" not in procedure_claims.columns:
    procedure_claims = pd.merge(procedure_claims, df_codes_cost[["Code", "Cost"]], on="Code")

In [104]:
# renumber the rows from 0; the previous row labels are kept as a regular column
procedure_claims = procedure_claims.reset_index(drop=False)

In [107]:
# discard the 'index' column
del procedure_claims['index']

In [112]:
# 5. payout

# these will be weighted random fractions of the Cost column. We assume 80/20 cost sharing, meaning 0.8 will
# have the highest weight among the fractions.
def weighted_choice(objects, weights):
    """Return one random element of ``objects``, chosen according to ``weights``.

    Parameters
    ----------
    objects : sequence
        Candidates to choose from.
    weights : sequence of numbers
        Relative likelihood of each candidate, in the same order. They do not
        have to sum to 1 (they are normalized here), but none may be negative
        and at least one must be positive.

    Returns
    -------
    The selected element of ``objects``.

    Raises
    ------
    ValueError
        If the two sequences differ in length, or the weights contain a
        negative value or do not add up to a positive number.
    """
    weights = np.array(weights, dtype=np.float64)
    if weights.ndim != 1 or len(weights) != len(objects):
        raise ValueError("objects and weights must be sequences of the same length")
    total = weights.sum()
    # `not total > 0` also rejects a NaN total.
    if (weights < 0).any() or not total > 0:
        raise ValueError("weights must be non-negative and sum to a positive number")

    # standardization, then the cumulative distribution
    cumulative = (weights / total).cumsum()
    # Rounding can leave the last cumulative value a hair under 1.0; a draw just
    # below 1.0 would then match nothing and the function would fall through
    # and return None. Pin the tail of the distribution to exactly 1.0, starting
    # at the last candidate with a non-zero weight so that zero-weight
    # candidates still can never be selected.
    last_positive = np.flatnonzero(weights)[-1]
    cumulative[last_positive:] = 1.0

    x = random.random()  # uniform draw in [0.0, 1.0)
    for i, upper in enumerate(cumulative):
        if x < upper:
            return objects[i]
    # Not reachable, because x < 1.0 == cumulative[-1]; kept as a safety net.
    return objects[last_positive]

# draw one payout rate per claim row with weighted_choice, where 0.8 and a random
# two-decimal number between 0 and 1 are equally likely to be chosen. The count
# follows the dataframe instead of a hard-coded 20000 so the rates always line
# up with the rows.
rates = [weighted_choice([0.8, round(random.uniform(0,1),2)], [0.5, 0.5]) for _ in range(len(procedure_claims))]

In [113]:
# Compute the payout of every claim in one vectorized pass:
#   in-network    -> Cost multiplied by that row's rate
#   anything else -> 0
# Network_Status is read by column name rather than by position (j[3]): the
# positional form depends on the column order, and integer keys on a labelled
# Series are deprecated in recent pandas.
in_network = (procedure_claims['Network_Status'] == "In-network").to_numpy()
costs = procedure_claims['Cost'].to_numpy()
# one rate per row, paired by row position
row_rates = np.asarray(rates[:len(procedure_claims)], dtype=float)
payout = np.where(in_network, costs * row_rates, 0).tolist()

In [114]:
# attach the computed payouts as the 'Payout' column
procedure_claims = procedure_claims.assign(Payout=payout)

In [70]:
# 6. date of service

In [115]:
# fetch the Enrollment_Date column of the 'members' table from MySQL; the dates
# of service are generated from these so that they come after enrollment
enrollment_frame = pd.read_sql_query('select Enrollment_Date from members;', con=engine)
enr_list = list(enrollment_frame['Enrollment_Date'])

In [116]:
# define function to ensure March 20 2020 is not exceeded
def march_20(a, cutoff=dt.date(2020, 3, 20), max_days=100):
    """Return a random date 1..max_days days after ``a``, capped at ``cutoff``.

    Parameters
    ----------
    a : datetime.date
        Starting date (the notebook passes enrollment dates).
    cutoff : datetime.date, optional
        Latest date that may be returned. Defaults to March 20 2020.
    max_days : int, optional
        Largest number of days that may be added to ``a``. Defaults to 100.

    Returns
    -------
    datetime.date
        ``a`` plus a randomly drawn whole number of days between 1 and
        ``max_days`` (inclusive), or ``cutoff`` if that date would fall after
        it. Note that when ``a`` itself is later than ``cutoff`` the result is
        ``cutoff``, i.e. earlier than ``a``.
    """
    shifted = a + dt.timedelta(days=random.randint(1, max_days))
    # Clamp in one step instead of walking back one day at a time.
    return min(shifted, cutoff)

# build one date of service per claim row, starting from the enrollment date of
# THAT row's member. Doubling the members-table list only lines up with the
# claims when there are exactly two claims per member in table order, which the
# random sampling above does not give.
# NOTE(review): assumes Member_ID is unique in the members table -- confirm.
enrollment_by_member = pd.read_sql_query(
    'select Member_ID, Enrollment_Date from members;', con=engine
).set_index('Member_ID')['Enrollment_Date']
dos = [march_20(d) for d in procedure_claims['Member_ID'].map(enrollment_by_member)]

In [117]:
# attach the generated dates as the 'Date_Of_Service' column
procedure_claims = procedure_claims.assign(Date_Of_Service=dos)

In [122]:
# keep exactly these columns, in this order
column_order = [
    'Member_ID',
    'Provider_ID',
    'Code',
    'Type',
    'State',
    'Network_Status',
    'Cost',
    'Payout',
    'Date_Of_Service',
]
procedure_claims = procedure_claims[column_order]

In [2]:
# create table in MySQL
# Engine.execute() was removed in SQLAlchemy 2.0; the DDL is run inside
# engine.begin(), which works on 1.x and 2.x and commits on success.
# The SQL text itself is unchanged.
from sqlalchemy import text

with engine.begin() as conn:
    conn.execute(text('CREATE TABLE procedure_claims (\
                  Claim_ID INT NOT NULL AUTO_INCREMENT,\
                  Member_ID INT,\
                  Provider_ID INT,\
                  Code VARCHAR(6),\
                  Type VARCHAR(50),\
                  State VARCHAR(20),\
                  Network_Status VARCHAR(50),\
                  Cost INT,\
                  Payout DECIMAL(6,2),\
                  Date_Of_Service DATE,\
                  PRIMARY KEY(Claim_ID)\
              ) AUTO_INCREMENT = 400000;'))

In [125]:
# append the generated rows to the procedure_claims table (the dataframe index is not written)
procedure_claims.to_sql(name='procedure_claims', con=engine, index=False, if_exists='append')

In [127]:
# close connection: dispose of the engine now that the table has been written
engine.dispose()