In [144]:
import pandas as pd
import matplotlib.pyplot as plt

In [145]:
data = pd.read_csv("messy_expense_dataset.csv")

In [146]:
df = pd.DataFrame(data)

In [147]:
df.head()

Unnamed: 0,Date,Category,Amount,Payment_Mode
0,2025/02/19,Shopping,41201.81,Cash
1,Jan 07 2024,Shopping,43039.96,cash
2,Jun 03 2024,,21515.17,Upi
3,Jun 03 2024,Health,21515.17,Upi
4,31-03-24,Dining,23419.0,


In [148]:
df.shape

(1582, 4)

In [149]:
columns = list(df.columns)
print(columns)

['Date', 'Category', 'Amount', 'Payment_Mode']


In [150]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1582 entries, 0 to 1581
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          1582 non-null   object 
 1   Category      1430 non-null   object 
 2   Amount        1582 non-null   float64
 3   Payment_Mode  1435 non-null   object 
dtypes: float64(1), object(3)
memory usage: 49.6+ KB


In [151]:
# df[list(df.columns[df.isna().any()])].isna().sum()
df.isna().sum()

Date              0
Category        152
Amount            0
Payment_Mode    147
dtype: int64

In [152]:
df.duplicated().any()

np.True_

In [153]:
df.duplicated().sum()

np.int64(73)

In [154]:
# get the duplicated data
df[df.duplicated()]

Unnamed: 0,Date,Category,Amount,Payment_Mode
47,19-04-24,Subscriptions,23215.93,Cash
54,2024-01-19,Groceries,29135.09,Cash
69,13-01-24,Entertainment,35184.52,Credit Card
79,May 20 2024,Dining,31004.91,Debit Card
120,2024/05/07,Entertainment,33846.80,upi
...,...,...,...,...
1486,2025-02-20,,32400.50,cash
1491,14-01-25,Health,2383.66,Credit Card
1513,Jul 04 2024,entertaiment,30479.32,cash
1520,2024-01-03,Utilities,33226.19,Cash


In [155]:
df.drop_duplicates(inplace=True)

In [156]:
df.duplicated().any()

np.False_

In [157]:
df.isna().sum()

Date              0
Category        144
Amount            0
Payment_Mode    140
dtype: int64

Even after the removal of duplicated data we still have null values 


In [158]:
dates = df["Date"]

In [159]:
print(dates.head())
print(dates.tail())

0     2025/02/19
1    Jan 07 2024
2    Jun 03 2024
3    Jun 03 2024
4       31-03-24
Name: Date, dtype: object
1576       31-01-25
1577    Jan 22 2024
1578     2024-12-15
1579     2024-06-13
1581     2024/08/09
Name: Date, dtype: object


In [160]:
# Convert the Date column into proper datetime format.

# Any rows that fail conversion → remove them.

# Sort data by Date.

In [161]:
# optimized form of code: 
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
df = df.dropna(subset=["Date"])

  df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")


In [162]:
df.sort_values(by="Date", ascending=True, inplace=True)

In [163]:
print(df.head(3))
print(df.tail(3))

           Date       Category    Amount   Payment_Mode
382  2024-01-02  Entertainment   9496.55  Bank Transfer
1131 2024-01-03           Rent  44615.69            UPI
403  2024-01-04  Entertainment  40773.26     Debit Card
           Date      Category    Amount Payment_Mode
1051 2025-03-24     Transport  33402.19          upi
729  2025-03-25     Groceries  65379.79   Debit Card
23   2025-03-26  entertaiment      0.00  Credit Card


In [164]:
# Convert everything to lowercase.

# Strip spaces.

# Replace spelling mistakes with correct names.

# Remove rows where Category is missing or blank.

In [165]:
df["Category"] = df["Category"].astype(str).str.lower()

# df = df.drop()

In [166]:
df["Category"] = df["Category"].str.strip()

In [167]:
df.head()

Unnamed: 0,Date,Category,Amount,Payment_Mode
382,2024-01-02,entertainment,9496.55,Bank Transfer
1131,2024-01-03,rent,44615.69,UPI
403,2024-01-04,entertainment,40773.26,Debit Card
1350,2024-01-05,entertaiment,26641.36,UPI
339,2024-01-06,utilities,31115.46,UPI


In [168]:
# for spelling corrections: 
df["Category"].unique()

array(['entertainment', 'rent', 'entertaiment', 'utilities', 'transport',
       'dining', 'health', 'shopping', 'groceries', 'utilties', 'nan',
       'fuel', 'groccery', 'subscriptions'], dtype=object)

In [169]:
# build a incorrect:correct dict set: 
spellings = {"entertaiment":"entertainment",
             "utilties":"utilities",
             "nan":None,
             "groccery":"groceries"}

In [170]:
df["Category"] = df["Category"].replace(spellings)

In [171]:
df["Category"].isna().sum()

np.int64(30)

In [172]:
df = df.dropna(subset=["Category"]).reset_index(drop=True)

In [173]:
df["Category"].isna().sum()

np.int64(0)

array(['entertainment', 'rent', 'utilities', 'transport', 'dining',
       'health', 'shopping', 'groceries', 'fuel', 'subscriptions'],
      dtype=object)