In [8]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): no category or module filter is given, so this also hides
# deprecation notices — consider narrowing it.
warnings.filterwarnings("ignore")



In [9]:
# Load the raw transactions file into a DataFrame
def read_csv(path='user_transactions.csv'):
    """Read a transactions CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to 'user_transactions.csv',
        resolved against the current working directory, so existing
        zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with its default
        options.

    Raises
    ------
    FileNotFoundError
        If no file exists at ``path``.
    """
    return pd.read_csv(path)


In [10]:
# Preview the first five rows of the raw data
read_csv().head()

Unnamed: 0,id_,t_date,t_type,t_amt,has_credit_card,account_type
0,312,20-01-2020,deposit,485,Yes,Savings
1,376,03-01-2020,deposit,706,No,Current
2,188,13-01-2020,deposit,601,No,Savings
3,138,11-01-2020,deposit,520,No,Salary
4,373,18-01-2020,deposit,596,No,Salary


In [11]:
# Count the missing values in each column of the raw data
def check_null_values():
    """Return the number of null entries found in every column.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is that column's null count.
    """
    # do not edit the predefined function name
    missing_mask = read_csv().isna()  # True wherever a cell is null
    return missing_mask.sum()

In [12]:
# Show the per-column null counts for the raw data
check_null_values()

id_                0
t_date             0
t_type             0
t_amt              0
has_credit_card    0
account_type       0
dtype: int64

In [13]:
# Count how many rows of the raw data repeat an earlier row
def check_duplicates():
    """Return the number of duplicated rows in the raw data.

    A row counts as a duplicate when all of its values match a row that
    appears earlier in the file; the first occurrence is not counted.

    Returns
    -------
    numpy integer
        Total number of duplicated rows.
    """
    # do not edit the predefined function name
    raw = read_csv()
    repeated_mask = raw.duplicated()  # True for every repeat after the first
    return repeated_mask.sum()

In [14]:
# Show how many rows of the raw data are duplicates
check_duplicates()

79

In [15]:
# Remove repeated rows from the raw data
def drop_duplicates():
    """Return the raw data with duplicated rows removed.

    The first occurrence of each row is kept. Index labels are left as they
    were in the raw data (the index is not reset).

    Returns
    -------
    pandas.DataFrame
        The de-duplicated data.
    """
    # do not edit the predefined function name
    return read_csv().drop_duplicates()

In [16]:
# Display the data after duplicate rows have been removed
drop_duplicates()

Unnamed: 0,id_,t_date,t_type,t_amt,has_credit_card,account_type
0,312,20-01-2020,deposit,485,Yes,Savings
1,376,03-01-2020,deposit,706,No,Current
2,188,13-01-2020,deposit,601,No,Savings
3,138,11-01-2020,deposit,520,No,Salary
4,373,18-01-2020,deposit,596,No,Salary
...,...,...,...,...,...,...
5863,155,10-01-2020,deposit,712,Yes,Savings
5864,398,01-01-2020,deposit,196,No,Current
5865,255,14-01-2020,deposit,563,Yes,Savings
5866,185,29-01-2020,deposit,626,Yes,Savings


In [17]:
def data_cleaning():
    """
    Build the cleaned transactions table and save it to disk.

    Starting from the de-duplicated data, this removes the
    "has_credit_card" and "account_type" columns, gives the remaining
    columns descriptive names, and writes the result to
    'user_transaction_cleaned.csv' (without the index) in the current
    working directory.

    Returns:
    DataFrame: The cleaned DataFrame after dropping and renaming columns.
    """
    # Old column name -> new column name
    column_renames = {
        "id_": "consumer_id",
        "t_date": "transaction_date",
        "t_type": "transaction_type",
        "t_amt": "transaction_amount",
    }

    cleaned = (
        drop_duplicates()  # raw data with repeated rows removed
        .drop(columns=["has_credit_card", "account_type"])
        .rename(columns=column_renames)
    )

    # Persist the cleaned table; the index is left out of the file
    cleaned.to_csv('user_transaction_cleaned.csv', index=False)
    return cleaned

In [18]:
# Run the cleaning step; this also writes 'user_transaction_cleaned.csv'
data_cleaning()

Unnamed: 0,consumer_id,transaction_date,transaction_type,transaction_amount
0,312,20-01-2020,deposit,485
1,376,03-01-2020,deposit,706
2,188,13-01-2020,deposit,601
3,138,11-01-2020,deposit,520
4,373,18-01-2020,deposit,596
...,...,...,...,...
5863,155,10-01-2020,deposit,712
5864,398,01-01-2020,deposit,196
5865,255,14-01-2020,deposit,563
5866,185,29-01-2020,deposit,626
