In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load dataset with appropriate encoding
file_path = "2024-personal-property-inventory.csv"
df = pd.read_csv(file_path, encoding='latin1')
df



  df = pd.read_csv(file_path, encoding='latin1')


Unnamed: 0,AGENCY CODE,STATE AGENCY,PROPERTY TYPE,PROPERTY CLASSIFICATION,PROPERTY NAME,DATE ACQUIRED OR INSTALLED,VALUE OR COST,IT/NON-IT
0,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,RECEPTIONIST DESK,9 21 2010,2695.74,NON-IT
1,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,MAIL ROOM AREA,9 15 2010,4688,NON-IT
2,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,"DESK:DESKS AND TABLES, WOOD",1 31 2019,2822.84,NON-IT
3,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,PRINTER,3 24 2006,520,IT
4,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,DOCUMENT SCANNER,2 28 2011,6475,IT
...,...,...,...,...,...,...,...,...
235885,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,WIDEBAND CONTINUOUS-WAVE CHARACTERIZATION\nPLA...,5 06 2021,759912.3,NON-IT
235886,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"BUS, 2023 MCI D45CRTLE B9115 LP28632",1 02 2024,785962.0,NON-IT
235887,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"ART, UNTITLED SECONDARIES W/ PRIMARIES, JASPER...",3 28 2014,1200000.0,NON-IT
235888,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"BOILER CONTROLS,",1 10 2006,1994678.0,NON-IT


In [3]:
# Drop unnecessary unnamed columns
df = df.loc[:, ~df.columns.str.contains('Unnamed')]

In [4]:
# Trim whitespace from column names, replace spaces with underscores, and convert to upper case
df.columns = df.columns.str.strip().str.replace(' ', '_').str.upper().str.replace('/', '_OR_')
df

Unnamed: 0,AGENCY_CODE,STATE_AGENCY,PROPERTY_TYPE,PROPERTY_CLASSIFICATION,PROPERTY_NAME,DATE_ACQUIRED_OR_INSTALLED,VALUE_OR_COST,IT_OR_NON-IT
0,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,RECEPTIONIST DESK,9 21 2010,2695.74,NON-IT
1,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,MAIL ROOM AREA,9 15 2010,4688,NON-IT
2,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,"DESK:DESKS AND TABLES, WOOD",1 31 2019,2822.84,NON-IT
3,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,PRINTER,3 24 2006,520,IT
4,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,DOCUMENT SCANNER,2 28 2011,6475,IT
...,...,...,...,...,...,...,...,...
235885,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,WIDEBAND CONTINUOUS-WAVE CHARACTERIZATION\nPLA...,5 06 2021,759912.3,NON-IT
235886,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"BUS, 2023 MCI D45CRTLE B9115 LP28632",1 02 2024,785962.0,NON-IT
235887,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"ART, UNTITLED SECONDARIES W/ PRIMARIES, JASPER...",3 28 2014,1200000.0,NON-IT
235888,,OKLAHOMA STATE UNIVERSITY,PERSONAL PROPERTY,OWNED,"BOILER CONTROLS,",1 10 2006,1994678.0,NON-IT


In [5]:
# Fix inconsistency in 'PROPERTY_TYPE' column
df['PROPERTY_TYPE'] = df['PROPERTY_TYPE'].str.upper().str.strip()

In [6]:
# Rename the first column properly if it contains encoding issues
df.rename(columns={df.columns[0]: "AGENCY_CODE"}, inplace=True)

In [7]:
# Handle missing values: Drop rows where essential columns are missing
essential_cols = ["AGENCY_CODE", "STATE_AGENCY", "PROPERTY_NAME", "VALUE_OR_COST", "DATE_ACQUIRED_OR_INSTALLED"]
df.dropna(subset=essential_cols, inplace=True)

In [8]:
# Clean 'VALUE_OR_COST' column by removing extra spaces and converting to numeric
df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace('[\$,]', '', regex=True)
df['VALUE_OR_COST'] = pd.to_numeric(df['VALUE_OR_COST'], errors='coerce')

  df['VALUE_OR_COST'] = df['VALUE_OR_COST'].astype(str).str.strip().replace('[\$,]', '', regex=True)


In [9]:
# Standardize 'DATE_ACQUIRED_OR_INSTALLED' format
df['DATE_ACQUIRED_OR_INSTALLED'] = pd.to_datetime(df['DATE_ACQUIRED_OR_INSTALLED'], errors='coerce')

In [10]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [11]:
# Drop 'YEAR' column if it exists
df.drop(columns=['YEAR', 'COUNT'], errors='ignore', inplace=True)

In [12]:
# Normalize text columns
df['STATE_AGENCY'] = df['STATE_AGENCY'].str.upper().str.strip()
df['PROPERTY_NAME'] = df['PROPERTY_NAME'].str.title().str.strip()

In [13]:
# Validate 'IT_OR_NON-IT' column
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].str.upper().replace({'IT': 'IT', 'NON-IT': 'NON-IT'})
df['IT_OR_NON-IT'] = df['IT_OR_NON-IT'].apply(lambda x: x if x in ['IT', 'NON-IT'] else np.nan)

In [14]:
# Save the cleaned dataset
df.to_csv("2024_cleaned_personal_property_list.csv", index=False)


In [15]:
# Display cleaned dataset
df.head()


Unnamed: 0,AGENCY_CODE,STATE_AGENCY,PROPERTY_TYPE,PROPERTY_CLASSIFICATION,PROPERTY_NAME,DATE_ACQUIRED_OR_INSTALLED,VALUE_OR_COST,IT_OR_NON-IT
0,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,Receptionist Desk,2010-09-21,2695.74,NON-IT
1,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,Mail Room Area,2010-09-15,4688.0,NON-IT
2,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,"Desk:Desks And Tables, Wood",2019-01-31,2822.84,NON-IT
3,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,Printer,2006-03-24,520.0,IT
4,20.0,OKLAHOMA ACCOUNTANCY BOARD,PERSONAL PROPERTY,,Document Scanner,2011-02-28,6475.0,IT
