In [1]:
# 10000 members

# Member_Name
# Age
# State
# Plan
# Gender
# Enrollment_Date

In [2]:
from bs4 import BeautifulSoup
import datapackage
import datetime as dt
import numpy as np
import pandas as pd
import random 
import requests
import string
import weighted_choice

In [3]:
# create dataframe 
members = pd.DataFrame()

In [4]:
# 1. names 
# we will create member names using city names as first names and common surnames. 
# City data pulled from https://datahub.io/core/country-list/r/0.html

# first names (to be taken from US city names)
cities = pd.read_csv('world-cities_csv.csv')
us_cities = cities.query('country=="United States"')
us_cities = list(us_cities['name'])
us_cities = [i.split()[0] for i in us_cities if "-" not in i]

# last names
# scrape webpage to import 1000 most common surnames in the US as a data frame, df
url = 'https://names.mongabay.com/data/1000.html'
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[0]
# get surname column as a list, deleting first row
last = list(df[0])
del last[0]

# create full names
names = [random.choice(us_cities).upper() + ' ' + random.choice(last) for i in range(10000)]

# enter into members
members["Member_Name"] = names

In [5]:
# 2. age
members["Age"] = [random.randint(11,100) for i in range(10000)]

In [6]:
# 3. state (company operates in the entire US, but primarily in the NE)
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

# create weight function (code adapted from https://www.python-course.eu/weighted_choice_and_sample.php)
def weighted_choice(objects, weights):
    """ returns a random element from the sequence of 'objects', 
        the likelihood of the objects is weighted according 
        to the sequence of 'weights', i.e. percentages."""

    weights = np.array(weights, dtype=np.float64)
    sum_of_weights = weights.sum()
    # standardization:
    np.multiply(weights, 1 / sum_of_weights, weights)
    weights = weights.cumsum()
    x = random.random()
    for i in range(len(weights)):
        if x < weights[i]:
            return objects[i]
        
# state weights - Where each number represents the weight attached to each state in the data set. Higher weights
# mean that more members will be from that state (NE region states were given higher weights)
state_weights = [3, 3, 6, 4, 20, 10, 7, 5, 10, 4, 
          1, 1, 10, 5, 3, 2, 8, 3, 8, 15, 
          30, 5, 5, 5, 5, 5, 2, 2, 6, 30, 
          2, 50, 20, 1, 20, 1, 1, 40, 6, 6, 
          1, 4, 10, 1, 6, 25, 5, 5, 4, 1]

# call weighted_choice function
member_state = [weighted_choice(states, state_weights)for i in range(10000)]

# insert into dataframe
members["State"] = member_state

In [7]:
# 4. plan, weighted, where 10% of members have Bronze, 30% have Silver, 40% have Gold, 20% have Platinum
plans = ['Bronze Star', 'Silver Star', 'Gold Star', 'Platinum Star']

# plan weights
plan_weights = [1000,3000,4000,2000]

# run weighted choice funtion and insert into dataframe
mbr_plans = [weighted_choice(plans, plan_weights) for i in range(10000)]
members["Plan"] = mbr_plans

In [8]:
# 5. gender
gender = []
for i in members['Member_Name'].items():
    if i[-1].split()[0][-1] in ["A", "E", "H", "I", "U", "S", "W", "Y"]:
        gender += ['F']
    else:
        gender += ['M']

# insert into dataframe
members["Gender"] = gender

In [9]:
# check gender count
members.groupby("Gender")["Member_Name"].count()

Gender
F    4297
M    5703
Name: Member_Name, dtype: int64

In [10]:
# 6. enrollment date

# create list of all possible dates from January 2010 to March 2020
start = dt.datetime.strptime("2010-01-01", "%Y-%m-%d")
end = dt.datetime.strptime("2020-03-01", "%Y-%m-%d")
date_array = (start + dt.timedelta(days=x) for x in range(0, (end-start).days))
 
date_list = [i.strftime("%Y-%m-%d") for i in date_array]

members['Enrollment_Date'] = [random.choice(date_list) for i in range(10000)]

In [11]:
members.head()

Unnamed: 0,Member_Name,Age,State,Plan,Gender,Enrollment_Date
0,FRAMINGHAM QUINTERO,94,Virginia,Silver Star,M,2017-09-16
1,BOWLING WONG,14,Missouri,Gold Star,M,2014-08-11
2,TULSA STEPHENS,74,California,Silver Star,F,2013-08-10
3,FOREST NELSON,48,Maryland,Silver Star,M,2017-03-01
4,SURPRISE RAMIREZ,49,New York,Platinum Star,F,2015-01-12


In [12]:
# SQL connection
from sqlalchemy import create_engine
import pymysql

# estabish connection
engine = create_engine('mysql+pymysql://USER:PASSWORD@HOST')

In [13]:
# show databases
engine.execute("SHOW DATABASES").fetchall()

[('country_data',),
 ('first_schema',),
 ('health_company',),
 ('information_schema',),
 ('mysql',),
 ('performance_schema',),
 ('python_mysql',),
 ('sys',)]

In [14]:
# Use database health_company
engine.execute("USE health_company;")

<sqlalchemy.engine.result.ResultProxy at 0x11a937908>

In [18]:
# create table members (Do this in MySQL)
engine.execute('CREATE TABLE members (\
	Member_ID INT NOT NULL AUTO_INCREMENT,\
	Member_Name VARCHAR(50),\
    Age INT,\
    State VARCHAR(20),\
    Plan VARCHAR(20),\
    Gender VARCHAR(1),\
    Enrollment_Date DATE,\
    PRIMARY KEY(Member_ID)\
    ) AUTO_INCREMENT = 100000;')

<sqlalchemy.engine.result.ResultProxy at 0x11c87e470>

In [19]:
# write to members table
members.to_sql('members', con=engine, if_exists='append', index=False)

In [20]:
# select all from members
engine.execute("select * from members").fetchall()

[(100000, 'FRAMINGHAM QUINTERO', 94, 'Virginia', 'Silver Star', 'M', datetime.date(2017, 9, 16)),
 (100001, 'BOWLING WONG', 14, 'Missouri', 'Gold Star', 'M', datetime.date(2014, 8, 11)),
 (100002, 'TULSA STEPHENS', 74, 'California', 'Silver Star', 'F', datetime.date(2013, 8, 10)),
 (100003, 'FOREST NELSON', 48, 'Maryland', 'Silver Star', 'M', datetime.date(2017, 3, 1)),
 (100004, 'SURPRISE RAMIREZ', 49, 'New York', 'Platinum Star', 'F', datetime.date(2015, 1, 12)),
 (100005, 'PORT THOMPSON', 97, 'Virginia', 'Gold Star', 'M', datetime.date(2020, 1, 28)),
 (100006, 'DRACUT MONTGOMERY', 28, 'North Carolina', 'Gold Star', 'M', datetime.date(2011, 6, 14)),
 (100007, 'WEST HARRIS', 21, 'Arkansas', 'Silver Star', 'M', datetime.date(2015, 12, 22)),
 (100008, 'TRENTON MAYER', 79, 'Nevada', 'Platinum Star', 'M', datetime.date(2014, 3, 20)),
 (100009, 'BROWNWOOD CISNEROS', 87, 'Ohio', 'Bronze Star', 'M', datetime.date(2016, 3, 7)),
 (100010, 'FREDERICKSON ATKINS', 30, 'West Virginia', 'Gold Star'

In [21]:
# close connection
engine.dispose()