In [14]:
import pandas as pd
import numpy as np


In [15]:
# Load dataset with appropriate encoding
file_path = "2020-personal-property.csv"
df = pd.read_csv(file_path, encoding='latin1')



In [16]:
# Drop unnecessary unnamed columns
df = df.loc[:, ~df.columns.str.contains('Unnamed')]

In [17]:
# Trim whitespace from column names, replace spaces with underscores, and convert to upper case
df.columns = df.columns.str.strip().str.replace(' ', '_').str.upper().str.replace('/', '_OR_')
df

Unnamed: 0,AGENCY_CODE,STATE_AGENCY,PROPERTY_TYPE,PROPERTY_CLASSIFICATION,PROPERTY_NAME,DATE_ACQUIRED_OR_INSTALLED,VALUE_OR_COST,IT_OR_NON-IT
0,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"DATA TERMINAL, TI (S CINT. CTR) MDL 700AS",8/15/1977,"$20,600.00",IT
1,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"COMPUTER, DELL GX620",8/17/2005,"$1,435.04",IT
2,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"COMPUTER, DELL DIMENSION 5150C",5/25/2006,"$1,337.87",IT
3,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,DELL INTEL PENTIUM 4 PROCESSOR 630 W/HT,5/17/2007,$936.16,IT
4,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"VOSTRP 700, INTEL CORE2 DUO",8/27/2007,"$1,138.88",IT
...,...,...,...,...,...,...,...,...
149368,981,OKLAHOMA MUNICIPAL POWER AUTHORITY,PERSONAL PROPERTY,,"ZERO TURN MOWER W/ VAC - 61"" DECK",,"$11,344.75",
149369,981,OKLAHOMA MUNICIPAL POWER AUTHORITY,PERSONAL PROPERTY,,ROTARY BOOM MOWER ATTACHMENT,,"$43,544.76",
149370,981,OKLAHOMA MUNICIPAL POWER AUTHORITY,PERSONAL PROPERTY,,F3990 FRONT DECK MOWER W/ CAB,4/15/2019,"$28,331.72",
149371,981,OKLAHOMA MUNICIPAL POWER AUTHORITY,PERSONAL PROPERTY,,SKID MOUNTED PRESSURE WASHER,4/11/2019,"$6,389.05",


In [18]:
# Handle missing values: Drop rows where essential columns are missing
essential_cols = ["AGENCY_CODE", "STATE_AGENCY", "PROPERTY_NAME", "VALUE_OR_COST", "DATE_ACQUIRED_OR_INSTALLED"]
df.dropna(subset=essential_cols, inplace=True)

In [19]:
# Clean 'VALUE_OR_COST' column by removing extra spaces and converting to numeric
df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace('[\$,]', '', regex=True)
df['VALUE_OR_COST'] = pd.to_numeric(df['VALUE_OR_COST'], errors='coerce')

  df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace('[\$,]', '', regex=True)


In [20]:
# Standardize 'DATE_ACQUIRED_OR_INSTALLED' format
df['DATE_ACQUIRED_OR_INSTALLED'] = pd.to_datetime(df['DATE_ACQUIRED_OR_INSTALLED'], errors='coerce')

In [21]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [22]:
# Drop 'YEAR' column if it exists
df.drop(columns=['YEAR', 'COUNT'], errors='ignore', inplace=True)

In [23]:
# Normalize text columns
df['STATE_AGENCY'] = df['STATE_AGENCY'].str.upper().str.strip()
df['PROPERTY_NAME'] = df['PROPERTY_NAME'].str.title().str.strip()

In [24]:
# Validate 'IT_OR_NON-IT' column
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].str.upper().replace({'IT': 'IT', 'NON-IT': 'NON-IT'})
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].apply(lambda x: x if x in ['IT', 'NON-IT'] else np.nan)

In [25]:
# Save the cleaned dataset
df.to_csv("2020_cleaned_personal_property_list.csv", index=False)


In [26]:
# Display cleaned dataset
df.head()


Unnamed: 0,AGENCY_CODE,STATE_AGENCY,PROPERTY_TYPE,PROPERTY_CLASSIFICATION,PROPERTY_NAME,DATE_ACQUIRED_OR_INSTALLED,VALUE_OR_COST,IT_OR_NON-IT
0,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Data Terminal, Ti (S Cint. Ctr) Mdl 700As",1977-08-15,20600.0,IT
1,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Computer, Dell Gx620",2005-08-17,1435.04,IT
2,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Computer, Dell Dimension 5150C",2006-05-25,1337.87,IT
3,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,Dell Intel Pentium 4 Processor 630 W/Ht,2007-05-17,936.16,IT
4,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Vostrp 700, Intel Core2 Duo",2007-08-27,1138.88,IT
