In [49]:
import pandas as pd
import numpy as np

In [50]:
# Read data from datasource
def read_csv_file(file_path):
    """Load a CSV file into a DataFrame, reporting the outcome on stdout.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data on success. On any failure a message is printed and
        ``None`` is returned, so callers must check for ``None`` before use.
    """
    try:
        frame = pd.read_csv(file_path)
        print("File read successfully!")
        return frame
    except FileNotFoundError:
        failure = f"Error: The file '{file_path}' was not found."
    except pd.errors.EmptyDataError:
        failure = "Error: The file is empty."
    except pd.errors.ParserError:
        failure = "Error: The file contains parsing errors."
    except Exception as err:
        # Catch-all so a bad file never raises out of this helper.
        failure = f"An unexpected error occurred: {err}"

    # Only reached when loading failed: report why and signal it with None.
    print(failure)
    return None

# Load the labelled user-information dataset from a path relative to the notebook.
file_path = '../data/HomeTask_ [1] User Information Labelled.csv'
labelled_user_info_df = read_csv_file(file_path)

# read_csv_file returns None when loading fails, so only report on success.
if labelled_user_info_df is not None:
    print(labelled_user_info_df.shape)  # (rows, columns) of the loaded data


File read successfully!
(500, 16)


In [51]:
# Drop rows of 'labelled_user_info_df' that are exact duplicates of an earlier row
# (all columns equal), then report the remaining (rows, columns).
# NOTE: this rebinds the same name; re-running the cell is harmless because
# de-duplicating already de-duplicated data changes nothing.
labelled_user_info_df = labelled_user_info_df.drop_duplicates()
print(labelled_user_info_df.shape)

(455, 16)


In [52]:
# Preview the first rows of the de-duplicated data.
labelled_user_info_df.head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,document_issuing_country,locale,screening_state,is_adverse_media_minor,is_pep,user_current_state
0,00404b94-5c1b-47b0-9486-27ddda96ba52,5d711cff-d9eb-409d-9664-cf0a6819ce3a,NC,False,68,DAILY_PAYMENT_SERVICES,HEALTH,COMPLETED,F,1978-02-03,FR,fr_FR,COMPLETED,False,False,ACTIVE
1,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,FR,fr_FR,COMPLETED,False,False,SUSPENDED
2,013604f7-775a-43ca-9cfd-6fa811a89487,89f1bf0a-3436-4ae5-9e13-ecbf915bda81,FR,True,33,CRYPTO_SERVICES,OTHER_SERVICES,COMPLETED,M,1995-11-17,FR,fr_FR,COMPLETED,False,False,ACTIVE
3,02c62912-a1d4-41cf-a8cd-681f8e97afc9,9f99203c-eb2a-4b23-92fe-6a87ea3c5124,FR,False,68,INTERNATIONAL_TRANSFERS,UNEMPLOYED,COMPLETED,F,1999-07-16,FR,fr_FR,COMPLETED,False,False,ACTIVE
4,03ac32c8-723e-474b-b537-4a82ddefe32e,174769fb-4eda-403d-9956-4d428eacba13,FR,False,33,INTERNATIONAL_TRANSFERS,UNEMPLOYED,COMPLETED,M,1980-07-29,FR,fr_FR,COMPLETED,False,False,SUSPENDED


In [53]:
# Count distinct user_id and session_id values; a count below the row count
# means some ids still appear on more than one row.
labelled_user_info_df.user_id.nunique(), labelled_user_info_df.session_id.nunique()

(432, 432)

In [54]:
# Return data which has more than one same value for some specific features
def get_repeated_data(data, column):
    """Return every row whose value in ``column`` occurs more than once.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the column checked for repeated values.

    Returns
    -------
    pandas.DataFrame or None
        All rows sharing a repeated value (every occurrence, not just the
        later ones). ``None`` is returned, after printing a message, when no
        value is repeated, when ``column`` is missing, or on any other error,
        so callers must check for ``None`` before using the result.
    """
    try:
        # keep=False marks every member of a repeated group, including the first.
        repeated_mask = data.duplicated(subset=[column], keep=False)
        matches = data[repeated_mask]

        if not matches.empty:
            print("Duplicate data found:")
            return matches

        print("No duplicate data found.")
    except KeyError:
        print(f"Error: The column '{column}' does not exist in the dataset.")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")

    return None

In [63]:
# Remove Missing data records
def remove_nan_rows(df, id_column):
    """Drop every row that has a missing value in any column other than the id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    id_column : str
        Column excluded from the missing-value check.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only rows that are fully populated outside
        ``id_column``.

    Notes
    -----
    The drop is unconditional: a row with a missing value is removed whether
    or not another complete row exists for the same id, so an id whose only
    rows are incomplete disappears from the result.
    """
    # Every column except the identifier must be populated for a row to survive.
    required_columns = [name for name in df.columns if name != id_column]
    return df.dropna(subset=required_columns)

# Build the cleaned frame under a new name so the de-duplicated input stays available,
# then report its (rows, columns).
cleaned_user_info_df = remove_nan_rows(labelled_user_info_df, 'user_id')
print(cleaned_user_info_df.shape)

(432, 16)


In [56]:
# cleaned_user_info_df.head()

In [67]:
# Spot-check a single user_id in the cleaned data to see which of its rows survived.
cleaned_user_info_df[cleaned_user_info_df.user_id == '29b4ae86-4605-42f2-bb23-eff7bc1f13cc'].head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,document_issuing_country,locale,screening_state,is_adverse_media_minor,is_pep,user_current_state
71,29b4ae86-4605-42f2-bb23-eff7bc1f13cc,56657a8d-d82f-4215-9ed7-72380c5cf1a6,FR,True,33,CRYPTO_SERVICES,RETIRED,COMPLETED,M,1956-08-28,FR,fr_FR,COMPLETED,False,False,ACTIVE


In [65]:
# (rows, columns) of the cleaned data.
cleaned_user_info_df.shape

(432, 16)

In [59]:
# Get rows whose 'user_id' occurs more than once.
# get_repeated_data returns None when nothing is repeated (or on error), so guard
# before accessing the column.
duplicate_rows = get_repeated_data(labelled_user_info_df, 'user_id')
if duplicate_rows is not None:
    print(duplicate_rows.user_id)

Duplicate user_ids found:
28     0f12e279-bfe4-4642-92d5-cf4407fc58dd
30     0f12e279-bfe4-4642-92d5-cf4407fc58dd
34     12fb705c-c068-4033-a1d6-81c9f937f00c
35     12fb705c-c068-4033-a1d6-81c9f937f00c
69     29b4ae86-4605-42f2-bb23-eff7bc1f13cc
71     29b4ae86-4605-42f2-bb23-eff7bc1f13cc
78     2e3e3b6d-720e-4af3-bc92-b34f7c2d772c
79     2e3e3b6d-720e-4af3-bc92-b34f7c2d772c
87     2f89a26f-09a5-42cd-b169-ec1c9b423f4a
88     2f89a26f-09a5-42cd-b169-ec1c9b423f4a
92     32ef57dc-b3e0-45d3-accf-2bee7a67bdf6
93     32ef57dc-b3e0-45d3-accf-2bee7a67bdf6
95     3321b3ed-3999-4127-9017-45507250dd28
96     3321b3ed-3999-4127-9017-45507250dd28
128    46852363-02ce-4b3b-8dee-4ca3a701f375
130    46852363-02ce-4b3b-8dee-4ca3a701f375
133    47ac57f8-5213-40fa-9ca5-c462682d1c67
134    47ac57f8-5213-40fa-9ca5-c462682d1c67
161    56e131f1-f869-40b7-99c4-8987de23e89c
162    56e131f1-f869-40b7-99c4-8987de23e89c
196    6b2185cd-9067-4364-ac41-9d83d56630ab
197    6b2185cd-9067-4364-ac41-9d83d56630ab
215   

In [62]:
# Manual inspection notes: for each duplicated user_id, the column whose value
# differs between its rows.
#   29b4ae86-4605-42f2-bb23-eff7bc1f13cc -> kyc_state
#   7105c83f-f17a-4438-bad0-3930654cc6c7 -> locale
#   0f12e279-bfe4-4642-92d5-cf4407fc58dd -> has_biometrics
#   12fb705c-c068-4033-a1d6-81c9f937f00c -> reason_to_use_app
#   2e3e3b6d-720e-4af3-bc92-b34f7c2d772c -> user_current_state  ***** (flagged in the original notes)
#   2f89a26f-09a5-42cd-b169-ec1c9b423f4a -> reason_to_use_app
#     NOTE(review): the rows displayed for this id also differ in occupation - confirm.
# Show the rows for one of these user_ids side by side.
labelled_user_info_df[labelled_user_info_df.user_id == '2f89a26f-09a5-42cd-b169-ec1c9b423f4a'].head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,document_issuing_country,locale,screening_state,is_adverse_media_minor,is_pep,user_current_state
87,2f89a26f-09a5-42cd-b169-ec1c9b423f4a,df641a09-decf-439c-b5ec-dc80e329f34d,FR,True,33,,,COMPLETED,M,2002-11-04,FR,fr_FR,TO_SOFT_REVIEW,False,False,ACTIVE
88,2f89a26f-09a5-42cd-b169-ec1c9b423f4a,df641a09-decf-439c-b5ec-dc80e329f34d,FR,True,33,CRYPTO_SERVICES,HEALTH,COMPLETED,M,2002-11-04,FR,fr_FR,TO_SOFT_REVIEW,False,False,ACTIVE
