# Data Cleaning for clients

First let's load the dataset and explore a little bit.

In [62]:
import pandas as pd
import numpy as np

CLIENTS_FILE = './uwwi_dataset_clients_v2.csv'     # updated to clients_v2.csv

df_client = pd.read_csv(CLIENTS_FILE)
df_client.head(5)

  df_client = pd.read_csv(CLIENTS_FILE)


Unnamed: 0,Client_Id,Client_CreateStamp,Client_EditStamp,ClientOption_PreferredLanguageOfCaller,ClientSystem_PreferredLanguageOther,ClientAddressus_ClientAddressus_city,ClientAddressus_ClientAddressus_county,ClientAddressus_ClientAddressus_state,ClientAddressus_ClientAddressus_zip,ClientCustom_AgeOfPersonNeedingAssistance,ClientCustom_EthnicityOther,ClientOption_GenderOptionId,ClientOption_VeteranStatusOptionlist
0,1,2017-04-07T15:34:04.872169-05:00,2022-04-23T00:53:31.812322-05:00,[],,FOND DU LAC,FOND DU LAC,WI,54935.0,,,[],[]
1,2,2017-04-10T08:08:10.404723-05:00,2022-04-23T00:53:33.172375-05:00,[],,MADISON,DANE,WI,53705.0,,,[],[]
2,3,2017-04-10T08:21:14.98351-05:00,2022-02-28T19:01:21.95251-06:00,[],,,,,54115.0,,,[],[]
3,4,2017-04-10T08:25:16.115921-05:00,2022-04-23T00:53:33.445243-05:00,[],,MENASHA,WINNEBAGO,WI,54952.0,,,[],[]
4,5,2017-04-10T08:42:59.367602-05:00,2022-04-23T00:53:33.678873-05:00,[],,NEENAH,WINNEBAGO,WI,54956.0,,,[],[]


First let's clean up the column names. This is the Client dataset, so it is not necessary to have every variable name start with "Client" or "Client_". Some columns also have "Addressus" repeated (which presumably stands for Address United States?).

In [63]:
cols = df_client.columns

# we do not need the Client Prefix
new_col_names = [cols[i] for i in range(len(cols))]

for i in range(len(cols)):
    name = new_col_names[i]
    name = name.removeprefix('Client') # redundant
    name = name.removeprefix('Addressus') # redundant
    name = name.removeprefix("_")
    name = name.removeprefix('Client') # redundant
    name = name.removeprefix('Address') # redundant
    new_col_names[i] = name
    
print(new_col_names)
new_col_names = {cols[i]: new_col_names[i] for i in range(len(cols))}
df_client = df_client.rename(columns=new_col_names)

['Id', 'CreateStamp', 'EditStamp', 'Option_PreferredLanguageOfCaller', 'System_PreferredLanguageOther', 'us_city', 'us_county', 'us_state', 'us_zip', 'Custom_AgeOfPersonNeedingAssistance', 'Custom_EthnicityOther', 'Option_GenderOptionId', 'Option_VeteranStatusOptionlist']


Let's go through columns one at a time and clean them up/reformat them. 

First, the Client_Id column looks like it's okay

In [64]:
np.all([isinstance(x,int) for x in df_client["Id"]])

True

Next the CreateStamp and EditStamp columns are complicated time strings. These could be converted to pandas timestamp objects here but this is not a current priority. 

In [65]:
# for future - convert to Timestamp
df_client['CreateStamp'] = pd.to_datetime(df_client['CreateStamp']) #, utc=True)         # Add utc=True to create column as datetime64[ns], otherwise will still be 'object'
df_client['EditStamp'] = pd.to_datetime(df_client['EditStamp']) #, utc=True)

# column values are now 'datetime.datetime' or 'pandas._libs.tslibs.timestamps.Timestamp'
print(type(df_client['CreateStamp'][0]))
print(type(df_client['EditStamp'][0]))

# column is still 'object' if 'utc=True' not set above 
print(df_client.info())

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830450 entries, 0 to 830449
Data columns (total 13 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Id                                   830450 non-null  int64  
 1   CreateStamp                          830450 non-null  object 
 2   EditStamp                            830450 non-null  object 
 3   Option_PreferredLanguageOfCaller     830450 non-null  object 
 4   System_PreferredLanguageOther        446 non-null     object 
 5   us_city                              798744 non-null  object 
 6   us_county                            797862 non-null  object 
 7   us_state                             799709 non-null  object 
 8   us_zip                               762323 non-null  object 
 9   Custom_AgeOfPersonNeedingAssistance  93127 non-null   float64
 10  Custom_EthnicityOther   

Next the preferred language columns are lists and NaNs. These could be represented better but this is not a current priority.

In [66]:
# for future - convert to List

# removes brackets and single quotes from the values
df_client["Option_PreferredLanguageOfCaller"] = df_client["Option_PreferredLanguageOfCaller"].str.strip('[]').str.strip("''")
print(df_client["Option_PreferredLanguageOfCaller"].str.strip('[]').str.strip("''").value_counts())

# print null value counts
print()
print(df_client["System_PreferredLanguageOther"].isnull().value_counts())

                          702720
English                   116272
Spanish                     8170
Undetermined                1627
Declined                     609
Other                        580
American Sign Language       179
Hmong                         91
Chinese                       59
French                        35
Vietnamese                    31
German                        23
Russian                       21
Korean                        17
Italian                       11
Tagalog                        5
Name: Option_PreferredLanguageOfCaller, dtype: int64

True     830004
False       446
Name: System_PreferredLanguageOther, dtype: int64


In the address blocks, we have some issues of inconsistent formatting, and numerically encoded zip codes. Let's change the zip codes to strings, and reformat the City and County names

In [86]:

print(pd.unique(df_client["us_city"]))
print(pd.unique(df_client["us_county"]))
print(pd.unique(df_client["us_zip"]))

['FOND DU LAC' 'MADISON' nan ... '53092' 'Coldwater' 'Taloga']
['FOND DU LAC' 'DANE' nan ... 'Oneida ' 'Dewey' 'Branch']
['54935' '53705' '54115' ... '49036' '33903' '73667']


In [88]:
str_zips = [x if pd.isnull(x) else str(x)[0:5] for x in df_client["us_zip"]]
df_client["us_zip"] = str_zips

['54935' '53705' '54115' ... '49036' '33903' '73667']


The US cities column requires more cleaning than can be effectively done during this time. It will be left in favor of the zips.

In [85]:
# for future

# some of these are all caps such as MALVERN
# some of them are all lower such as new glarus
# some of them zip codes such as 54952
# some of them are just missing as nan
# some of them are title case such as Stevens Point
# some are abbrviations such as ATL
# some are misspelled such as "milaukee" or "miwlaukee"

'''
# could try pyspellchecker

import sys
!{sys.executable} -m pip install pyspellchecker

from spellchecker import SpellChecker
checker = SpellChecker()
print(checker)
city = 'milwauke'
print(checker.correction(city))
'''

# at least one is just an empty space
# some are complete errors, such as "Hispanic or Latino"

# the second half seems to be much better formatted
# being all consistently title case
# however it still contains some zip codes
#[print(x) for x in pd.unique(df_client["us_city"])]

<spellchecker.spellchecker.SpellChecker object at 0x000002B39D966B00>
<built-in method title of str object at 0x000002B391540BB0>


In [70]:
# much of the same issues as with city
# one is "refused"
# several of these contain whole addresses, which is a privacy issue
# these will need to be combed through with more precision than I can offer at the moment

# [print(x) for x in pd.unique(df_client["us_county"])]
print(None)

None


The states are just strings, except for one which is labeled as "Array" and should instead likely be nan

In [71]:
pd.unique(df_client["us_state"])

array(['WI', nan, 'MN', 'IA', 'AZ', 'IN', 'TX', 'IL', 'Array', 'AR', 'MO',
       'FL', 'MA', 'MI', 'CA', 'OH', 'SD', 'TN', 'SC', 'ND', 'NE', 'CO',
       'GA', 'WV', 'AL', 'OR', 'WA', 'KY', 'NV', 'VA', 'CT', 'ID', 'NC',
       'MS', 'NY', 'LA', 'NH', 'DC', 'NJ', 'PA', 'OK', 'DE', 'PR', 'UT',
       'KS', 'HI', 'VI', 'MT', 'RI', 'ME', 'MD', 'VT', 'WY', 'NM', 'AK',
       'GU'], dtype=object)

In [72]:
ii = np.where(df_client.us_state=="Array")
print(ii)
for i in ii[0]:
    print(i)
    df_client.loc[i,"us_state"] = pd.NA

(array([   729,    747,   5625,  11611,  19460,  30476,  35933,  41735,
        63311,  74152,  80456,  81294,  92380,  93030, 107551, 121273,
       129797, 136286, 138753, 147351, 148194], dtype=int64),)
729
747
5625
11611
19460
30476
35933
41735
63311
74152
80456
81294
92380
93030
107551
121273
129797
136286
138753
147351
148194


This leaves remaining Age, Ethnicity, Gender, and Veteran, which I do not have time to tackle at the moment

### 2. Quick Choropleth

We will use the Folium library to plot the zipcodes and their corresponding number of clients. For this task we will need a ZIPCODE GeoJSON file for Wisconsin. First let's calculate for each zip code the number of matching clients.


In [73]:
zip_codes = list(pd.unique(df_client.us_zip))
num_clients = [0 for x in zip_codes]
for z in df_client.us_zip:
    if not pd.isnull(z):
        num_clients[zip_codes.index(z)] += 1

In [74]:
d = {'zipcode': zip_codes, 'num_clients': num_clients}
df_num_clients = pd.DataFrame(data=d)

If we just plot the number of clients we will just get something that is a map of population, so what we really want is fraction of population served.

In [75]:
ZIPS_FILE = './wisconsin-zips.csv'

df_pop = pd.read_csv(ZIPS_FILE)
df_pop.head(5)

FileNotFoundError: [Errno 2] No such file or directory: './wisconsin-zips.csv'

In [None]:
df_pop["Zip Code"] = [str(x) for x in df_pop["Zip Code"]]
df_pop["Population"] = [int(x.replace(",","")) for x in list(df_pop["Population"])]

In [None]:
pops = [df_pop["Population"][list(df_pop["Zip Code"]).index(x)] if x in list(df_pop["Zip Code"]) and not pd.isnull(x) else pd.NA for x in zip_codes]
d = {'zipcode': zip_codes, 'num_clients': num_clients, 'population': pops, 'clients_per_capita': np.array(num_clients)/(np.array(pops)+1),'residents_per_client': np.array(pops)/np.array(num_clients)}
df_num_clients = pd.DataFrame(data=d)

In [None]:
df_num_clients.clients_per_capita

In [None]:
# Install folium library
!pip install folium

In [None]:
import folium
import pandas as pd
import json
import requests

# GeoJSON file definition
wisconsin_geojson = "https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wi_wisconsin_zip_codes_geo.min.json"


# Creating the map centered at Wisconsin state
m = folium.Map(location=[44.808444, -89.673194], 
               tiles="cartodbpositron", 
               zoom_start=6.8)

# Creating the Choropleth
m.choropleth(geo_data=json.loads(requests.get(wisconsin_geojson).text),
             data=df_num_clients[pd.notnull(df_num_clients.residents_per_client)],
             columns=['zipcode', 'residents_per_client'],
             key_on='feature.properties.ZCTA5CE10', 
             fill_color='YlOrRd', fill_opacity=1, line_opacity=0.2,
             legend_name='Residents_Per_Client')

m


In [None]:
m.save(outfile = './choropleth_underserved_areas.html' )

### 4. Final thoughts of this dataset

This map highlights in darker colors regions that have a high number of residents per client. Many of these regions overlap with regions of high need in the ADI choropleth. One conclusion is that those areas of overlap represent areas where there is a high degree of need, but where clients are not reaching out for support. 