<a href="https://colab.research.google.com/github/divyani95/Transaction_details_Project/blob/main/Transaction_details_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Location of the source Excel file (.xls export) to clean
file_path = '/mnt/data/Transactions_Details_30Jan2025_102227.xls'

# First pass: read with header=None so that no row is consumed as column
# labels, then print the first 16 rows (indices 0-15) to locate the row that
# holds the real column headers.
df_raw = pd.read_excel(file_path, header=None)
print("Preview of Raw Data:\n", df_raw.head(16))

# Second pass: reload the same file, using the row identified in the preview
# as the column labels. read_excel counts header rows from 0, so 14 refers to
# the 15th row of the sheet.
header_row = 14  # Adjust if necessary
df = pd.read_excel(file_path, header=header_row)

# Print actual column names for verification
print("Column Names:", df.columns.tolist())

# Ensure correct column name references (case-insensitive matching)
def get_column_name(possible_names, df_columns):
    """Return the first column label that matches one of the candidate names.

    Matching ignores letter case and leading/trailing whitespace, since
    spreadsheet headers frequently carry stray spaces. Columns are scanned in
    their own order, so the first matching column wins regardless of the order
    of ``possible_names``.

    Args:
        possible_names: Iterable of acceptable header spellings.
        df_columns: Iterable of column labels (e.g. ``df.columns``). Labels
            that are not strings (ints, NaN, timestamps, ...) are skipped
            instead of raising.

    Returns:
        The matching label exactly as it appears in ``df_columns`` (so it can
        be used to index the DataFrame), or ``None`` if nothing matches.
    """
    # Normalise the candidates once rather than once per column.
    wanted = {
        candidate.strip().lower()
        for candidate in possible_names
        if isinstance(candidate, str)
    }
    for name in df_columns:
        # Non-string labels have no .lower(); they can never match a name.
        if isinstance(name, str) and name.strip().lower() in wanted:
            return name
    return None  # If no match found

# Identify actual column names (case-insensitive lookup).
# Each variable is None when the sheet has no such column.
transaction_date_col = get_column_name(["Transaction Date"], df.columns)
debit_amount_col = get_column_name(["Debit Amount"], df.columns)
credit_amount_col = get_column_name(["Credit Amount"], df.columns)
running_balance_col = get_column_name(["Running Balance"], df.columns)
description_col = get_column_name(["Transaction Description"], df.columns)

# Drop unnecessary columns (columns that were not found are skipped)
columns_to_drop = [credit_amount_col, running_balance_col]
df.drop(columns=[col for col in columns_to_drop if col], inplace=True, errors="ignore")

# Filter out rows where 'Debit Amount' is 0.00.
# .copy() detaches the filtered result from the frame it was sliced from, so
# the column assignments below write to a real DataFrame instead of a slice
# (which would trigger SettingWithCopyWarning).
if debit_amount_col:
    df = df[df[debit_amount_col] != 0.00].copy()

# Split 'Transaction Date' into 'Date' and 'Time'
if transaction_date_col:
    # Parse each value once and derive both parts from the parsed timestamp.
    # Splitting the text on " " with expand=True yields a data-dependent
    # number of columns, so assigning it to exactly two columns fails for
    # date-only values or values containing extra spaces.
    timestamps = pd.to_datetime(
        df[transaction_date_col].astype(str), errors="coerce"
    )
    df["Date"] = timestamps.dt.date
    df["Time"] = timestamps.dt.time
    df.drop(columns=[transaction_date_col], inplace=True)
else:
    print("Warning: 'Transaction Date' column not found.")
    # Without a combined column there is nothing to split; normalise
    # pre-existing 'Date' / 'Time' columns only if the sheet provides them,
    # rather than failing on a missing key.
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce").dt.date
    if "Time" in df.columns:
        df["Time"] = pd.to_datetime(df["Time"], errors="coerce").dt.time

# Keep only transactions with 'NEFT' or 'RTGS' in 'Transaction Description'
# (case-insensitive; rows with a missing description are dropped via na=False)
if description_col:
    df = df[df[description_col].str.contains('NEFT|RTGS', case=False, na=False)]
else:
    print("Warning: 'Transaction Description' column not found.")

# Save cleaned dataset
output_path = "/mnt/data/Transactions_Details_Cleaned.xlsx"
df.to_excel(output_path, index=False)

# Print final dataset shape and preview
print("Updated Shape of Dataset:", df.shape)
print(df.head())

# Output file location
print(f"Cleaned dataset saved at: {output_path}")
Now I want to remove 00:00:00 from the Transaction Date column.