# Tipped Wage Report: Data Cleaning

#### Analyst: Dhruv Singh <br> Report Name: Tipped Wage Report, Part 1 <br> Report Quarter, Year: Q2 2021 <br> Date Updated: 08/18/2021

In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

### Reading in Data

In [2]:
# loading the ESSP export; the first two lines of the file are skipped
essp = pd.read_csv(
    'data/ESSP - 2021 Q2 Tip Wage Report.csv',
    skiprows=2,
)

In [3]:
# loading the manual export; only the first 22 columns (positions 0-21) are read
manual = pd.read_csv(
    'data/Manual - 2021 Q2 Tip Wage Report.csv',
    usecols=list(range(22)),
)

## Cleaning ESSP Data

In [4]:
# previewing the first five rows of the raw ESSP frame
essp.head(5)

Unnamed: 0,EMPLOYER ACCOUNT ID,YEARQUARTER,EMPLOYERNAME,BUSINESSNAME,AGENTNAME,PHONE NUM,EMAIL,MAILINGADDRESS,DCWORKADDRESS,SSN,EMPLOYEENAME,TIPPED WAGE HOURS,EMPLOYER PAID HOURLY RATE,WAGES AMT,TIPPED WAGES AMOUNT,AVGTOTALHOURLYRATE,AVGHOURLYTIPSRECEIVED,MINWAGEMET,COMPLIANCEBASERATE,FILING DATE
0,10603,2021-2,P & P CORPORATION,P & P CORPORATION,PAYCE INC.,4432799000.0,martin@lachaumieredc.com,"1220B E JOPPA ROAD #324,TOWSON,MD 21286","2813 M ST NW ,WASHI...",bvOpVg7uC2QpWOIdmncShA==,FELIPE VELASQUEZ,272,$5.00,"$1,360.00","$19,002.00",$74.86,$69.86,Yes,Yes,7/31/2021
1,10603,2021-2,P & P CORPORATION,P & P CORPORATION,PAYCE INC.,4432799000.0,martin@lachaumieredc.com,"1220B E JOPPA ROAD #324,TOWSON,MD 21286","2813 M ST NW ,WASHI...",f4jFCsunNbBCJRXxgCdWAw==,PETER KRIDLER,255,$5.00,"$1,275.00","$18,022.00",$75.67,$70.67,Yes,Yes,7/31/2021
2,10603,2021-2,P & P CORPORATION,P & P CORPORATION,PAYCE INC.,4432799000.0,martin@lachaumieredc.com,"1220B E JOPPA ROAD #324,TOWSON,MD 21286","2813 M ST NW ,WASHI...",GsEq+iFu17oUjjiL9U5ZtQ==,ABDELLATIF AMMOUR,66,$5.00,$330.00,"$2,954.00",$49.76,$44.76,Yes,Yes,7/31/2021
3,10603,2021-2,P & P CORPORATION,P & P CORPORATION,PAYCE INC.,4432799000.0,martin@lachaumieredc.com,"1220B E JOPPA ROAD #324,TOWSON,MD 21286","2813 M ST NW ,WASHI...",ock2FoIzG5Q63rBjvvbsGQ==,RENE GUEVARA,335,$5.00,"$1,677.00","$20,859.00",$67.27,$62.27,Yes,Yes,7/31/2021
4,10603,2021-2,P & P CORPORATION,P & P CORPORATION,PAYCE INC.,4432799000.0,martin@lachaumieredc.com,"1220B E JOPPA ROAD #324,TOWSON,MD 21286","2813 M ST NW ,WASHI...",RoAoUtRP/F8cg+MQYfq5dw==,ELMER FLORES,387,$5.00,"$1,935.00","$11,552.00",$34.85,$29.85,Yes,Yes,7/31/2021


In [5]:
# subsetting to the columns used in the report
# (the multi-word labels are written with two spaces between words, as selected here)
essp_keep_cols = [
    "EMPLOYERNAME",
    "TIPPED  WAGE  HOURS",
    "EMPLOYER  PAID  HOURLY  RATE",
    "WAGES  AMT",
    "TIPPED  WAGES  AMOUNT",
    "AVGTOTALHOURLYRATE",
    "AVGHOURLYTIPSRECEIVED",
]
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning
essp = essp[essp_keep_cols].copy()

In [6]:
# renaming columns to the short snake_case names shared with the manual data
essp_column_names = {
    "EMPLOYERNAME": "emp_name",
    "TIPPED  WAGE  HOURS": "hours_worked",
    "EMPLOYER  PAID  HOURLY  RATE": "hourly_rate_emp",
    "WAGES  AMT": "total_wage",
    "TIPPED  WAGES  AMOUNT": "total_tips",
    "AVGTOTALHOURLYRATE": "hourly_rate_avg",  # calculated column
    "AVGHOURLYTIPSRECEIVED": "tips_avg",
}
essp = essp.rename(columns=essp_column_names)

In [7]:
# tagging every ESSP row with its origin
essp = essp.assign(source='ESSP')

In [8]:
# previewing the subsetted and renamed ESSP frame
essp.head(5)

Unnamed: 0,emp_name,hours_worked,hourly_rate_emp,total_wage,total_tips,hourly_rate_avg,tips_avg,source
0,P & P CORPORATION,272,$5.00,"$1,360.00","$19,002.00",$74.86,$69.86,ESSP
1,P & P CORPORATION,255,$5.00,"$1,275.00","$18,022.00",$75.67,$70.67,ESSP
2,P & P CORPORATION,66,$5.00,$330.00,"$2,954.00",$49.76,$44.76,ESSP
3,P & P CORPORATION,335,$5.00,"$1,677.00","$20,859.00",$67.27,$62.27,ESSP
4,P & P CORPORATION,387,$5.00,"$1,935.00","$11,552.00",$34.85,$29.85,ESSP


In [9]:
# extracting numeric text from the currency-formatted string columns
#
# One pattern is used for every column: optional sign, then either digits with
# any number of thousands separators and an optional decimal part, or a bare
# fraction such as ".50". The previous per-column patterns
#   * had no comma support on the rate columns ("$1,005.33" was read as 5.33),
#   * allowed only a single comma on the totals ("$1,234,567.89" lost its
#     leading group), and
#   * matched nothing for unsigned whole numbers such as "$5".
# Values with no digits at all (e.g. "$-") still produce NaN.
amount_pattern = r"([-+]?(?:\d[\d,]*(?:\.\d+)?|\.\d+))"

# rate columns: extract, strip thousands separators, convert to float
for rate_col in ["hourly_rate_emp", "hourly_rate_avg", "tips_avg"]:  # hourly_rate_avg is a calculated column
    essp[rate_col] = (
        essp[rate_col]
        .str.extract(amount_pattern, expand=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    )

# total columns: extract only; they remain strings (commas included) here
# because the following cell strips the commas and converts them to float
for total_col in ["total_wage", "total_tips"]:
    essp[total_col] = essp[total_col].str.extract(amount_pattern, expand=False)

In [10]:
# stripping thousands separators from the total columns and casting them to float
for money_col in ("total_wage", "total_tips"):
    essp[money_col] = essp[money_col].str.replace(',', '').astype('float')

## Cleaning Manual Data

In [11]:
# previewing the first five rows of the raw manual frame
manual.head(5)

Unnamed: 0,EAN,Year-Quarter,Employer Name,Business Name,Agent Name,Contact Phone #,Contact E-Mail,Street Address,"City, State, Zip","Employee Name (2,975 Employee records)",...,Total Employer Paid Wages During Quarter,Total Tips Received During Quarter,Actual Employer Paid Hourly Rate,Avg. Hourly Tips Received,Avg. Total Hourly Rate (Employer Paid And Tips),Minimum Wage Met? (224 Potential Violations),Tipped Base Rate In Compliance?,Wages Due,Wages Received,Wages Owed
0,A.W,2021-Q2,"Hank's on the Wharf, LLC","Hank's on the Wharf, LLC",JEFF STRINE,202-271-4206,jstrine@hanksdc.com,701 Wharf Street SW,"Washington, DC 20024",Vilma Alfaro,...,"$5,608.90","$9,048.88",$11.00,$17.75,$28.75,YES,YES,"$6,373.75","$14,657.78","$(8,284.03)"
1,A.W,2021-Q2,"Hank's on the Wharf, LLC","Hank's on the Wharf, LLC",JEFF STRINE,202-271-4206,jstrine@hanksdc.com,701 Wharf Street SW,"Washington, DC 20024",Vilma Alfaro,...,$284.71,$0.00,$18.50,$-,$18.50,YES,YES,$192.38,$284.71,$(92.34)
2,A.W,2021-Q2,"Hank's on the Wharf, LLC","Hank's on the Wharf, LLC",JEFF STRINE,202-271-4206,jstrine@hanksdc.com,701 Wharf Street SW,"Washington, DC 20024",Estefany Alvarez,...,"$3,087.70","$5,915.20",$7.50,$14.37,$21.87,YES,YES,"$5,146.13","$9,002.90","$(3,856.78)"
3,A.W,2021-Q2,"Hank's on the Wharf, LLC","Hank's on the Wharf, LLC",JEFF STRINE,202-271-4206,jstrine@hanksdc.com,701 Wharf Street SW,"Washington, DC 20024",Estefany Alvarez,...,$30.75,"$2,030.17",$15.00,$990.33,"$1,005.33",YES,YES,$25.63,"$2,060.92","$(2,035.30)"
4,A.W,2021-Q2,"Hank's on the Wharf, LLC","Hank's on the Wharf, LLC",JEFF STRINE,202-271-4206,jstrine@hanksdc.com,701 Wharf Street SW,"Washington, DC 20024",Kokouvi Apetoh,...,"$2,079.24","$4,982.62",$7.50,$17.97,$25.47,YES,YES,"$3,465.38","$7,061.86","$(3,596.49)"


### Subsetting, Renaming, and Parsing Manual Data

In [12]:
# subsetting to the columns used in the report
# (several labels carry leading/trailing spaces, reproduced exactly here)
manual_keep_cols = [
    "Employer Name ",
    "Total Hours Worked During Quarter",
    " Total Employer Paid Wages During Quarter ",
    " Total Tips Received During Quarter ",
    " Actual Employer Paid Hourly Rate ",
    " Avg. Hourly Tips Received ",
    " Avg. Total Hourly Rate (Employer Paid And Tips) ",
]
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning
manual = manual[manual_keep_cols].copy()

In [13]:
# renaming columns to the short snake_case names shared with the ESSP data
manual_column_names = {
    "Employer Name ": "emp_name",
    "Total Hours Worked During Quarter": "hours_worked",
    " Total Employer Paid Wages During Quarter ": "total_wage",
    " Total Tips Received During Quarter ": "total_tips",
    " Actual Employer Paid Hourly Rate ": "hourly_rate_emp",
    " Avg. Hourly Tips Received ": "tips_avg",
    " Avg. Total Hourly Rate (Employer Paid And Tips) ": "hourly_rate_avg",
}
manual = manual.rename(columns=manual_column_names)

In [14]:
# tagging every manual row with its origin
manual = manual.assign(source='Manual')

In [15]:
# extracting numeric text from the currency-formatted string columns
#
# One pattern is used for every column: optional sign, then either digits with
# any number of thousands separators and an optional decimal part, or a bare
# fraction such as ".50". The previous per-column patterns
#   * had no comma support on the rate columns ("$1,005.33" was read as 5.33),
#   * allowed only a single comma on the totals ("$1,234,567.89" lost its
#     leading group), and
#   * matched nothing for unsigned whole numbers such as "$5".
# Values with no digits at all (e.g. "$-") still produce NaN.
amount_pattern = r"([-+]?(?:\d[\d,]*(?:\.\d+)?|\.\d+))"

# rate columns: extract, strip thousands separators, convert to float
for rate_col in ["hourly_rate_emp", "hourly_rate_avg", "tips_avg"]:  # hourly_rate_avg is a calculated column
    manual[rate_col] = (
        manual[rate_col]
        .str.extract(amount_pattern, expand=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    )

# total columns: extract only; they remain strings (commas included) here
# because the following cell strips the commas and converts them to float
for total_col in ["total_wage", "total_tips"]:
    manual[total_col] = manual[total_col].str.extract(amount_pattern, expand=False)

In [16]:
# stripping thousands separators from the total columns and casting them to float
for money_col in ("total_wage", "total_tips"):
    manual[money_col] = manual[money_col].str.replace(',', '').astype('float')

## Combining Datasets: ESSP, Manual

In [17]:
# stacking ESSP rows on top of the manual rows
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# gives the same result (original row labels kept, columns aligned by name)
combined = pd.concat([essp, manual])

In [18]:
# previewing the first five rows of the combined frame
combined.head(5)

Unnamed: 0,emp_name,hours_worked,hourly_rate_emp,total_wage,total_tips,hourly_rate_avg,tips_avg,source
0,P & P CORPORATION,272.0,5.0,1360.0,19002.0,74.86,69.86,ESSP
1,P & P CORPORATION,255.0,5.0,1275.0,18022.0,75.67,70.67,ESSP
2,P & P CORPORATION,66.0,5.0,330.0,2954.0,49.76,44.76,ESSP
3,P & P CORPORATION,335.0,5.0,1677.0,20859.0,67.27,62.27,ESSP
4,P & P CORPORATION,387.0,5.0,1935.0,11552.0,34.85,29.85,ESSP


In [19]:
# qc step: negative hour values are replaced by their absolute value
combined['hours_worked'] = combined['hours_worked'].abs()

In [20]:
# removing rows whose employer name is missing
combined = combined.dropna(subset=["emp_name"])

In [21]:
# every remaining missing value becomes 0
combined = combined.fillna(value=0)

In [22]:
# using the employer name as the row index
combined = combined.set_index(keys='emp_name')

In [23]:
# dropping rows in which every numeric measure is 0
measure_cols = ['hours_worked', 'hourly_rate_emp', 'total_wage', 'total_tips', 'hourly_rate_avg', 'tips_avg']
is_all_zero = combined[measure_cols].eq(0).all(axis=1)
combined = combined.loc[~is_all_zero]

#### Saving Data

In [24]:
# writing the cleaned data to disk; the emp_name index is written as the first column
combined.to_csv(path_or_buf='1_cleaned_data.csv')