In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the 2019 personal property list; the CSV is decoded as Latin-1.
file_path = "2019-personal-property-list.csv"
df = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")


In [3]:
# Keep only the columns whose label does not contain the text 'Unnamed'.
is_unnamed = df.columns.str.contains('Unnamed')
df = df.loc[:, ~is_unnamed]

In [4]:
# Normalise column labels: strip surrounding whitespace first, then swap
# every remaining space for an underscore.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels.str.replace(' ', '_')

In [5]:
# Discard, in place, every row that has a missing value in any of the
# columns listed as essential.
essential_cols = [
    "AGENCY_CODE",
    "STATE_AGENCY",
    "PROPERTY_NAME",
    "VALUE_OR_COST",
    "DATE_ACQUIRED_OR_INSTALLED",
]
df.dropna(axis=0, how="any", subset=essential_cols, inplace=True)

In [6]:
# Clean 'VALUE_OR_COST': cast to text, trim surrounding whitespace, remove
# dollar signs and commas, then convert the result to a numeric dtype.
#
# The pattern is written as a raw string so that `\$` reaches the regex
# engine unchanged; in a normal string literal `\$` is an invalid escape
# sequence and Python emits a warning for it.
df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace(r'[\$,]', '', regex=True)

# errors='coerce' turns any value that still cannot be parsed into NaN.
# NOTE(review): those NaN values are not removed here, because the dropna on
# the essential columns runs before this conversion - confirm that is intended.
df['VALUE_OR_COST'] = pd.to_numeric(df['VALUE_OR_COST'], errors='coerce')

  df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace('[\$,]', '', regex=True)


In [7]:
# Parse 'DATE_ACQUIRED_OR_INSTALLED' into datetimes; entries that cannot be
# parsed are coerced to NaT instead of raising.
date_column = 'DATE_ACQUIRED_OR_INSTALLED'
df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

In [8]:
# De-duplicate in place: of each group of fully identical rows, only the
# first occurrence is kept.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [9]:
# Remove the 'YEAR' and 'COUNT' columns in place; either one is skipped
# silently when it is not present (errors='ignore').
df.drop(labels=['YEAR', 'COUNT'], axis='columns', errors='ignore', inplace=True)

In [10]:
# Normalise the text columns: agency names are upper-cased, property names
# are title-cased, and both are then trimmed of surrounding whitespace.
agency_upper = df['STATE_AGENCY'].str.upper()
df['STATE_AGENCY'] = agency_upper.str.strip()

property_titled = df['PROPERTY_NAME'].str.title()
df['PROPERTY_NAME'] = property_titled.str.strip()

In [11]:
# Validate 'IT_OR_NON-IT': trim and upper-case each value, then keep only the
# two accepted labels ('IT' and 'NON-IT'); everything else becomes NaN.
#
# Whitespace is stripped before the check, as is done for the other text
# columns, so that padded values such as 'IT ' are not discarded as invalid.
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].str.strip().str.upper()

# Vectorised membership test: `where` keeps entries for which the condition
# is True and fills the rest (including already-missing values) with NaN.
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].where(df['IT_OR_NON-IT'].isin(['IT', 'NON-IT']))

In [12]:
# Write the cleaned frame to a new CSV file, leaving out the row index.
df.to_csv(path_or_buf="2019_cleaned_personal_property_list.csv", index=False)


In [13]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)


Unnamed: 0,AGENCY_CODE,STATE_AGENCY,PROPERTY_TYPE,PROPERTY_CLASSIFICATION,PROPERTY_NAME,DATE_ACQUIRED_OR_INSTALLED,VALUE_OR_COST,IT_OR_NON-IT
0,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Data Terminal, Ti (S Cint. Ctr) Mdl 700As",1977-08-15,20600.0,IT
1,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Computer, Dell Gx620",2005-08-17,1435.04,IT
2,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,"Computer, Dell Dimension 5150C",2006-05-25,1337.87,IT
3,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,Dell Intel Pentium 4 Processor 630 W/Ht,2007-05-17,936.16,IT
4,10,OSU-STILLWATER,PERSONAL PROPERTY,OWNED,Dell Dimension 5150 Pentium D Processor,2006-07-28,1049.84,IT
