In [179]:
import pandas as pd
import regex as re

In [180]:
# Allow up to 500 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

In [181]:
# Read the raw 2020 baseline CSV; the supplier lookup table is derived from this frame.
import_df = pd.read_csv(filepath_or_buffer="../data/raw/uw-2020-baseline-values.csv")

In [182]:
# Reduce the raw rows to one row per distinct (system ID, supplier name) pair.
# The group sizes are computed only to materialise the groups and are dropped again.
df = (
    import_df
    .groupby(["Public Water System ID", "Supplier Name"])
    .size()
    .reset_index()
    .loc[:, ["Public Water System ID", "Supplier Name"]]
)

In [183]:
# Switch to short snake_case column names for the rest of the notebook.
df = df.rename(
    columns={
        "Public Water System ID": "id",
        "Supplier Name": "supplier_name",
    }
)

Strip leading and trailing whitespace from `supplier_name`

In [184]:
# Trim whitespace from both ends of every supplier name.
df = df.assign(supplier_name=df["supplier_name"].str.strip())

And collapse any run of whitespace (including double spaces) into a single space

In [185]:
# Replace every run of one or more whitespace characters with a single space.
df['supplier_name'] = df['supplier_name'].str.replace(
    pat=r'\s+',
    repl=' ',
    regex=True,
)

Then find any asterisks...

In [186]:
# Show every supplier whose name contains a literal asterisk.
# regex=False matches "*" as plain text, which avoids the invalid "\*" escape
# sequence that the non-raw string literal previously relied on.
df[df['supplier_name'].str.contains("*", regex=False)]

Unnamed: 0,id,supplier_name
252,CA3610013,Loma Linda City of *


...and destroy them completely

In [187]:
# Remove literal asterisks from supplier names.
# regex is passed explicitly: without it the result depends on the pandas
# version (the implicit default changed from regex to literal matching), and a
# literal match on "\*" would never find a bare "*".
# The trailing strip removes the space left behind when the asterisk was the
# last token (e.g. "Loma Linda City of *").
df['supplier_name'] = (
    df['supplier_name']
    .str.replace("*", "", regex=False)
    .str.strip()
)

  df['supplier_name'] = df['supplier_name'].str.replace("\*","")


Make a column for a clean display name. In most cases, it will match `supplier_name`

In [188]:
# Seed display_name with a copy of supplier_name; later cells overwrite selected rows.
df = df.assign(display_name=df["supplier_name"])

Look for all the places with commas

In [189]:
# Inspect the suppliers whose name contains a comma.
df.loc[df["supplier_name"].str.contains(",")]

Unnamed: 0,id,supplier_name,display_name
32,CA1010018,"Kerman, City of","Kerman, City of"
33,CA1010019,"Kingsburg, City of","Kingsburg, City of"
48,CA1310006,"Imperial, City of","Imperial, City of"
59,CA1510020,"Tehachapi, City of","Tehachapi, City of"
68,CA1610004,"Corcoran, City of","Corcoran, City of"
159,CA2410005,"Los Banos, City of","Los Banos, City of"
164,CA2710008,"Greenfield, City of","Greenfield, City of"
167,CA2710011,"Soledad, City of","Soledad, City of"
170,CA2810005,"American Canyon, City of","American Canyon, City of"
228,"CA3310082,CA3310029","Perris, City of","Perris, City of"


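Ol' switcheroo: move the part after the comma in front of the part before it (e.g. "Kerman, City of" becomes "City of Kerman")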

In [190]:
# For names written as "<place>, <prefix>", build the display name as
# "<prefix> <place>" (e.g. "Kerman, City of" -> "City of Kerman").
# - The name is split once, on the first comma only (n=1), so any text after a
#   second comma is kept instead of being silently dropped.
# - Each piece is stripped, so the result has no leading space left over from
#   the ", " separator.
# Rows without a comma are not selected by the mask and stay unchanged.
df.loc[
    df["supplier_name"].str.contains(",", regex=False),
    "display_name",
] = (
    df["supplier_name"]
    .str.split(",", n=1)
    .map(lambda parts: " ".join(part.strip() for part in reversed(parts)))
)

In [191]:
# Spot-check one comma-separated name after the swap.
df.loc[df["supplier_name"].eq("Tulare, City of")]

Unnamed: 0,id,supplier_name,display_name
388,CA5410015,"Tulare, City of",City of Tulare


In [192]:
# Disabled earlier attempt, kept for reference only (nothing in this cell runs).
# Note that the mask looks for lowercase ", city of" while the replacement
# targets ", City of", so as written the two would not act on the same rows.
# df.loc[
#     df.supplier_name.str.contains(", city of"), 
#     "display_name"] = "City of " + df.supplier_name.str.replace("\, City of", "")

In [193]:
# Disabled earlier attempt for ", Town of" names, kept for reference only
# (nothing in this cell runs).
# df.loc[
#     df.supplier_name.str.contains(", Town of"), 
#     "display_name"] = "Town of " + df.supplier_name.str.replace("\, Town of", "")

Now look for any others with "City of"

In [194]:
# Inspect every supplier whose name contains "City of" (capitalised).
df.loc[df["supplier_name"].str.contains("City of")]

Unnamed: 0,id,supplier_name,display_name
3,CA0110006,Hayward City of,Hayward City of
4,CA0110008,Pleasanton City of,Pleasanton City of
6,CA0110011,Livermore City of Division of Water Resources,Livermore City of Division of Water Resources
15,CA0710001,Antioch City of,Antioch City of
18,CA0710004,Brentwood City of,Brentwood City of
19,CA0710006,Martinez City of,Martinez City of
21,CA0710008,Pittsburg City of,Pittsburg City of
23,"CA0810001,CA0810003",Crescent City City of,Crescent City City of
29,CA1010003,Clovis City of,Clovis City of
30,CA1010004,Coalinga City of,Coalinga City of


List of names that need to be rewritten

In [195]:
# Supplier names that the generic "City of" rewrite is not expected to handle well.
# In the cells visible here this list is only used to filter rows for inspection
# (via isin); it does not drive any automatic rewrite.
# TODO(review): only the Big Bear Lake entry is corrected by hand further down;
# confirm whether the other entries need manual display names too.
to_rewrite = [
    "La Habra City of Public Works",
    "Livermore City of Division of Water Resources",
    "City of Big Bear Lake, Dept of Water & Power",
    "Lodi City of Public Works Department",
    "California Water Service Company Dixon, City of",
]

In [196]:
# For every supplier whose name contains "City of", rebuild the display name as
# "City of " followed by the supplier name with ", " and "City of" removed.
# Rows that do not match keep their current display name.
df["display_name"] = df["display_name"].mask(
    df["supplier_name"].str.contains("City of"),
    "City of " + df["supplier_name"].str.replace(", ", "").str.replace("City of", ""),
)

In [197]:
# Same rewrite for the lowercase spelling "city of": matching rows get
# "City of " followed by the supplier name with ", " and "city of" removed.
df["display_name"] = df["display_name"].mask(
    df["supplier_name"].str.contains("city of"),
    "City of " + df["supplier_name"].str.replace(", ", "").str.replace("city of", ""),
)

In [198]:
# Spot-check a name that had "City of" as a suffix without a comma.
df.loc[df["supplier_name"].eq("Adelanto City of")]

Unnamed: 0,id,supplier_name,display_name
246,CA3610001,Adelanto City of,City of Adelanto


In [199]:
# Inspect suppliers whose name contains lowercase "city".
df.loc[df["supplier_name"].str.contains("city")]

Unnamed: 0,id,supplier_name,display_name
363,CA4810004,"Rio Vista, city of",City of Rio Vista


In [200]:
# Spot-check another "City of" suffix name.
df.loc[df["supplier_name"].eq("Woodland City of")]

Unnamed: 0,id,supplier_name,display_name
409,CA5710006,Woodland City of,City of Woodland


How about towns?

In [201]:
# Inspect every supplier whose name contains "Town of".
df.loc[df["supplier_name"].str.contains("Town of")]

Unnamed: 0,id,supplier_name,display_name
323,CA4110016,Hillsborough Town of,Hillsborough Town of
375,CA4910017,"Windsor, Town of",Town of Windsor


In [202]:
# For every supplier whose name contains "Town of", rebuild the display name as
# "Town of " followed by the supplier name with ", " and "Town of" removed.
# Rows that do not match keep their current display name.
df["display_name"] = df["display_name"].mask(
    df["supplier_name"].str.contains("Town of"),
    "Town of " + df["supplier_name"].str.replace(", ", "").str.replace("Town of", ""),
)

In [203]:
# Confirm the rewritten display names that contain "Town".
df.loc[df["display_name"].str.contains("Town")]

Unnamed: 0,id,supplier_name,display_name
323,CA4110016,Hillsborough Town of,Town of Hillsborough
375,CA4910017,"Windsor, Town of",Town of Windsor


Now revisit the ones that need to be renamed manually

In [204]:
# Show the current display names for the suppliers listed in to_rewrite.
df.loc[df["supplier_name"].isin(to_rewrite)]

Unnamed: 0,id,supplier_name,display_name
6,CA0110011,Livermore City of Division of Water Resources,City of Livermore Division of Water Resources
179,CA3010018,La Habra City of Public Works,City of La Habra Public Works
266,"CA3610044,CA3610022,CA3600283,CA3600395","City of Big Bear Lake, Dept of Water & Power",City of Big Bear LakeDept of Water & Power
300,CA3910004,Lodi City of Public Works Department,City of Lodi Public Works Department
361,CA4810002,"California Water Service Company Dixon, City of",City of California Water Service Company Dixon


Look for any other names that might have problematic special characters

In [205]:
# Special characters to look for in display names.
# The slash is written as a plain '/': the previous '\/' literal was a backslash
# followed by a slash, so after re.escape the pattern only matched that
# two-character sequence and never a bare "/".
BAD_CHARS = ['&', '(', ')', ';', '-', '/']

# Alternation of the escaped characters, without capture groups: str.contains
# only needs a yes/no match, and pandas warns when the pattern has match groups.
pat = '|'.join(re.escape(c) for c in BAD_CHARS)

df[df['display_name'].str.contains(pat, regex=True)]

  df[df['display_name'].str.contains(pat)]


Unnamed: 0,id,supplier_name,display_name
78,CA1910011,Golden State Water Company Bell-Bell Gardens,Golden State Water Company Bell-Bell Gardens
79,CA1910013,Bellflower-Somerset Mutual Water Company,Bellflower-Somerset Mutual Water Company
98,"CA1910052,CA1910186,CA1910139",California-American Water Company Los Angeles ...,California-American Water Company Los Angeles ...
163,"CA2710004,CA2701466,CA2701882,CA2710022",California-American Water Company Monterey Dis...,California-American Water Company Monterey Dis...
171,CA2910003,Truckee-Donner Public Utilities District,Truckee-Donner Public Utilities District
207,CA3310002,Beaumont-Cherry Valley Water District,Beaumont-Cherry Valley Water District
239,CA3410018,Rio Linda - Elverta Community Water District,Rio Linda - Elverta Community Water District
243,"CA3410031,CA3410010,CA3410017,CA3410013,CA3110...",California-American Water Company Sacramento D...,California-American Water Company Sacramento D...
266,"CA3610044,CA3610022,CA3600283,CA3600395","City of Big Bear Lake, Dept of Water & Power",City of Big Bear LakeDept of Water & Power
273,CA3610073,Hi-Desert Water District,Hi-Desert Water District


The final boss: Big Bear Lake, whose display name was mangled by the "City of" rewrite above

In [206]:
# Locate the Big Bear Lake row by its display name.
df.loc[df["display_name"].str.contains("Big Bear Lake")]

Unnamed: 0,id,supplier_name,display_name
266,"CA3610044,CA3610022,CA3600283,CA3600395","City of Big Bear Lake, Dept of Water & Power",City of Big Bear LakeDept of Water & Power


In [207]:
# Show the raw string at position 266 so exact spacing is visible (the table view hides it).
df["display_name"].iloc[266]

'City of  Big Bear LakeDept of Water & Power'

In [208]:
# Manually set the display name for Big Bear Lake.
# The row is selected by its supplier_name, which the rewrite cells never
# modify. Matching on the mangled intermediate display string (double space,
# missing comma) would silently stop matching if any earlier rewrite changed.
df.loc[
    df["supplier_name"] == "City of Big Bear Lake, Dept of Water & Power",
    "display_name",
] = "City of Big Bear Lake, Dept. of Water & Power"

In [209]:
# Confirm the manual Big Bear Lake fix landed.
df.loc[df["display_name"].eq("City of Big Bear Lake, Dept. of Water & Power")]

Unnamed: 0,id,supplier_name,display_name
266,"CA3610044,CA3610022,CA3600283,CA3600395","City of Big Bear Lake, Dept of Water & Power","City of Big Bear Lake, Dept. of Water & Power"


Final clean

In [210]:
# Trim whitespace left at either end of the display names by the prefix rewrites.
df = df.assign(display_name=df["display_name"].str.strip())

In [211]:
# Final visual check of the cleaned table before export.
# display.max_rows was raised to 500 earlier, so up to 500 rows render here.
df

Unnamed: 0,id,supplier_name,display_name
0,CA0110001,Alameda County Water District,Alameda County Water District
1,CA0110003,California Water Service Company Livermore,California Water Service Company Livermore
2,CA0110005,East Bay Municipal Utilities District,East Bay Municipal Utilities District
3,CA0110006,Hayward City of,City of Hayward
4,CA0110008,Pleasanton City of,City of Pleasanton
5,CA0110009,Dublin San Ramon Services District,Dublin San Ramon Services District
6,CA0110011,Livermore City of Division of Water Resources,City of Livermore Division of Water Resources
7,"CA0310002,CA0310019,CA0310012,CA0310021,CA0310003",Amador Water Agency,Amador Water Agency
8,"CA0410002,CA1110002",California Water Service Company Chico District,California Water Service Company Chico District
9,CA0410005,California Water Service Company Oroville,California Water Service Company Oroville


In [212]:
# Write the id / supplier_name / display_name lookup table, without the index column.
df.to_csv(
    path_or_buf="../data/metadata/urban-water-suppliers-clean-names.csv",
    index=False,
)