In [1]:
import pandas as pd

def clean_salary_data(raw_df, min_wage=10000, max_wage=1000000):
    """Return a cleaned copy of the salary data, printing a short report along the way.

    Steps, in order:
      1. Drop rows whose 'PAID_WAGE_SUBMITTED' or 'WORK_STATE' is missing.
      2. Convert 'PAID_WAGE_SUBMITTED' to numeric and, when those columns are
         present, 'CASE_RECEIVED_DATE' / 'DECISION_DATE' to datetime.
         Values that cannot be parsed become NaN / NaT.
      3. Keep only rows with min_wage <= PAID_WAGE_SUBMITTED <= max_wage.
      4. Keep the first remaining row for each 'CASE_NUMBER'.
      5. Upper-case and strip whitespace from 'WORK_STATE'.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain 'CASE_NUMBER', 'PAID_WAGE_SUBMITTED' and 'WORK_STATE'.
        It is not modified.
    min_wage, max_wage : int or float
        Inclusive bounds applied to 'PAID_WAGE_SUBMITTED'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, still carrying their original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Step 1: Check for missing values
    print("\nMissing Values Before Cleaning:")
    print(raw_df.isnull().sum())

    # Handling missing data
    print("\nRemoving rows where 'Paid Wage Submitted' or 'Work State' is missing...")
    # .copy() so the column assignments below never touch the caller's frame.
    cleaned = raw_df.dropna(subset=['PAID_WAGE_SUBMITTED', 'WORK_STATE']).copy()

    # Step 2: Standardize data formats
    print("\nStandardizing data formats...")
    # No "column exists" guard here: the dropna above already requires the column.
    wage = pd.to_numeric(cleaned['PAID_WAGE_SUBMITTED'], errors='coerce')
    # Missing wages were dropped above, so every NaN left is a value that failed
    # to parse. Report it instead of letting the range filter discard it silently.
    unparsed_count = int(wage.isna().sum())
    if unparsed_count:
        print(f"{unparsed_count} non-numeric 'Paid Wage Submitted' value(s) will be removed by the wage filter.")
    cleaned['PAID_WAGE_SUBMITTED'] = wage

    # The date columns are optional; convert whichever ones are present.
    for date_col in ('CASE_RECEIVED_DATE', 'DECISION_DATE'):
        if date_col in cleaned.columns:
            cleaned[date_col] = pd.to_datetime(cleaned[date_col], errors='coerce')

    # Step 3: Check for outliers in 'Paid Wage Submitted'
    print("\nDescriptive Statistics for 'Paid Wage Submitted':")
    print(cleaned['PAID_WAGE_SUBMITTED'].describe())

    # Filter out unreasonable wage data
    # NOTE(review): the sheet also has a PAID_WAGE_SUBMITTED_UNIT column. If the
    # submitted wages are not all annual amounts, these fixed bounds will discard
    # valid non-annual rows -- confirm the units before relying on this filter.
    print("\nFiltering out outliers...")
    # between() is inclusive on both ends and False for NaN.
    cleaned = cleaned[cleaned['PAID_WAGE_SUBMITTED'].between(min_wage, max_wage)]

    # Step 4: Check for duplicate entries based on 'Case Number'
    print("\nNumber of duplicate entries based on 'Case Number':")
    is_duplicate = cleaned.duplicated(subset='CASE_NUMBER')
    print(is_duplicate.sum())

    # Drop duplicates if any (the first occurrence of each case number is kept)
    cleaned = cleaned[~is_duplicate].copy()

    # Step 5: Standardize text fields like 'Work State'
    print("\nStandardizing 'Work State' column...")
    cleaned['WORK_STATE'] = cleaned['WORK_STATE'].str.upper().str.strip()

    # Final review of the cleaned data
    print("\nMissing Values After Cleaning:")
    print(cleaned.isnull().sum())

    return cleaned


# Load the Excel file
# NOTE: both paths below are absolute paths on one machine; adjust them before
# running this notebook anywhere else.
file_path = '/Users/lesliee/Desktop/DS311 Group Project/DataDrivers/WIP Users/Leslie_WIP/salary_data_states.xlsx'
sheet_name = 'Sheet1'  # Replace with your actual sheet name if different

# Save the cleaned data to a specified folder path
output_path = '/Users/lesliee/Desktop/DS311 Group Project/DataDrivers/WIP Users/Leslie_WIP/cleaned_salary_data_states.csv'

# __name__ is "__main__" when this cell runs in a notebook kernel or as a script,
# so the load / clean / save below still executes there. The guard only keeps the
# file I/O from firing if this code is imported as a module.
if __name__ == "__main__":
    # Read the sheet into a DataFrame
    raw_df = pd.read_excel(file_path, sheet_name=sheet_name)

    # `df` is the cleaned frame, as before.
    df = clean_salary_data(raw_df)

    df.to_csv(output_path, index=False)

    print(f"\nData cleaning complete. Cleaned data saved to: {output_path}")



Missing Values Before Cleaning:
CASE_NUMBER                            0
CASE_STATUS                            0
CASE_RECEIVED_DATE                     0
DECISION_DATE                          0
EMPLOYER_NAME                          0
PREVAILING_WAGE_SUBMITTED              0
PREVAILING_WAGE_SUBMITTED_UNIT         0
PAID_WAGE_SUBMITTED                    0
PAID_WAGE_SUBMITTED_UNIT               0
JOB_TITLE                              0
WORK_CITY                              3
EDUCATION_LEVEL_REQUIRED          156185
COLLEGE_MAJOR_REQUIRED            156227
EXPERIENCE_REQUIRED_Y_N           156185
EXPERIENCE_REQUIRED_NUM_MONTHS    162313
COUNTRY_OF_CITIZENSHIP            156185
PREVAILING_WAGE_SOC_CODE               0
PREVAILING_WAGE_SOC_TITLE              0
WORK_STATE                             0
WORK_STATE_ABBREVIATION                0
WORK_POSTAL_CODE                  113604
FULL_TIME_POSITION_Y_N             11093
VISA_CLASS                             0
PREVAILING_WAGE_PER_YEAR