In [77]:
import pandas as pd
import numpy as np
import phonenumbers
import pycountry_convert as pc
import configparser

from phonenumbers.phonenumberutil import country_code_for_region
from datetime import datetime, date

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, classification_report

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.naive_bayes import GaussianNB


from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

np.random.seed(1)


In [78]:
# Read data from datasource
def read_csv_file(file_path):
    try:
        # Attempt to read the CSV file
        data = pd.read_csv(file_path)
        print("File read successfully!")
        return data
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: The file contains parsing errors.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Call read data func
file_path = '../data/HomeTask_ [1] User Information Labelled.csv'
labelled_user_info_df = read_csv_file(file_path)

if labelled_user_info_df is not None:
    print(labelled_user_info_df.shape)  # Display dataframe shape if data was successfully loaded


File read successfully!
(500, 16)


In [79]:
# Drop duplicate rows from 'labelled_user_info_df'
labelled_user_info_df = labelled_user_info_df.drop_duplicates()
print(labelled_user_info_df.shape)

(455, 16)


In [80]:
# Remove Missing data records
def remove_nan_rows(df, id_column):
    # Step 1: Remove rows with NaN values, but only if there are other rows with valid data for the same 'user_id'
    df_cleaned = df.dropna(subset=df.columns.difference([id_column]))
    
    # Step 2: Return the cleaned DataFrame
    return df_cleaned

# Call remove_nan_rows
cleaned_user_info_df = remove_nan_rows(labelled_user_info_df, 'user_id')
print(cleaned_user_info_df.shape)

(432, 16)


In [81]:
# Call read data func
file_path = '../data/HomeTask_ [2] Transaction Data Labelled.csv'
labelled_transaction_df = read_csv_file(file_path)

if labelled_transaction_df is not None:
    print(labelled_transaction_df.shape)  # Display shape of dataframe if data was successfully loaded

File read successfully!
(5945, 11)


In [82]:
# Merge both datasets on 'user_id'
df = pd.merge(cleaned_user_info_df, labelled_transaction_df, on='user_id')
df.shape

(6622, 26)

In [83]:
df.columns

Index(['user_id', 'session_id', 'country_of_residence', 'has_biometrics',
       'phone_number_country_code', 'reason_to_use_app', 'occupation',
       'kyc_state', 'gender', 'date_of_birth', 'document_issuing_country',
       'locale', 'screening_state', 'is_adverse_media_minor', 'is_pep',
       'user_current_state', 'id', 'reference_transaction_id', 'state', 'type',
       'category_id', 'amount', 'currency', 'direction', 'completed_at',
       'created_at'],
      dtype='object')

In [84]:
nan_control_cols = ['user_id', 'session_id', 'id', 'reference_transaction_id']
# Check for NaN values in the specified columns
def check_nan_in_columns(df, columns):
    # Iterate over the specified columns
    for col in columns:
        nan_count = df[col].isna().sum()  # Count NaN values in the column
        if nan_count > 0:
            print(f"Column '{col}' has {nan_count} NaN values.")
        else:
            print(f"Column '{col}' has no NaN values.")

# Example usage
# Assuming labelled_transaction_df is your DataFrame
check_nan_in_columns(df, nan_control_cols)


Column 'user_id' has no NaN values.
Column 'session_id' has no NaN values.
Column 'id' has no NaN values.
Column 'reference_transaction_id' has no NaN values.


In [85]:

# Method for reducing the memory usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [86]:
df = reduce_mem_usage(df)


Mem. usage decreased to  1.10 Mb (7.0% reduction)


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6622 entries, 0 to 6621
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   user_id                    6622 non-null   object 
 1   session_id                 6622 non-null   object 
 2   country_of_residence       6622 non-null   object 
 3   has_biometrics             6622 non-null   bool   
 4   phone_number_country_code  6622 non-null   int8   
 5   reason_to_use_app          6622 non-null   object 
 6   occupation                 6622 non-null   object 
 7   kyc_state                  6622 non-null   object 
 8   gender                     6622 non-null   object 
 9   date_of_birth              6622 non-null   object 
 10  document_issuing_country   6622 non-null   object 
 11  locale                     6622 non-null   object 
 12  screening_state            6622 non-null   object 
 13  is_adverse_media_minor     6622 non-null   bool 

In [88]:
df.iloc[:,14:].head()

Unnamed: 0,is_pep,user_current_state,id,reference_transaction_id,state,type,category_id,amount,currency,direction,completed_at,created_at
0,False,SUSPENDED,be097278-6ce9-4afe-84cc-96e24ed59e44,62fe1b69-52a9-4ae6-b13e-2d7932fc8d12,PENDING,PROCESSING_FEE,b818b29c-cd13-4352-863d-f930de135c65,14.992188,EUR,OUT,,2024-08-04 15:26:52.418093 UTC
1,False,SUSPENDED,d0bb6764-6302-4444-94e9-0024302c2f90,e9ab418d-b2a7-46fe-8693-7d6c9110d1bb,COMPLETED,BANK_TRANSFER,e464e1e2-40ca-416d-a541-ba0865e252f9,5.0,EUR,IN,2024-08-06 12:32:23.350767 UTC,2024-08-06 12:32:14.305982 UTC
2,False,SUSPENDED,ab000ca3-28b3-4610-8253-2b4f32d12da3,910b847b-c944-4cc6-8857-ba75d6befa46,DECLINED,CARDS,06bee557-499a-4d75-bc66-155ea0489a19,2.5,EUR,OUT,2024-08-20 11:05:34.973154 UTC,2024-08-20 11:05:34.973154 UTC
3,False,SUSPENDED,dcc483c5-227e-4585-a874-6a0552ad771f,67859c37-b0e4-44b4-98b9-d53d66c97c42,COMPLETED,MANUAL,b818b29c-cd13-4352-863d-f930de135c65,25.015625,EUR,OUT,2024-08-20 08:22:31.556082 UTC,2024-08-20 08:22:31.556082 UTC
4,False,SUSPENDED,0aa0343d-4db8-4c93-b0ba-bc01e54befd8,03baa814-7bc3-4fdd-9f7f-580a5a76bf71,DECLINED,CARDS,8c6dc9fc-af02-4754-a312-25d7d05c091b,11.726562,EUR,OUT,2024-08-13 13:07:10.712891 UTC,2024-08-13 13:07:10.712891 UTC


In [89]:
df.currency.value_counts()

currency
EUR    6622
Name: count, dtype: int64

In [90]:
drop_cols = ['user_id', 'session_id', 'is_adverse_media_minor', 'is_pep', 'kyc_state', 'id',
             'reference_transaction_id', 'currency', 'completed_at', 'created_at', 'date_of_birth', 'continent_of_residence']
numeric_cols = ['amount', 'total_amount', 'average_amount', 'date_of_birth_days_since', 'process_time']
categoric_cols = ['country_of_residence', 'has_biometrics', 'phone_number_country_code', 'reason_to_use_app', 'occupation',
                  'kyc_state', 'gender', 'document_issuing_country', 'locale', 'screening_state', 'state', 'type', 'category_id', 'direction']
date_cols = ['date_of_birth', 'completed_at', 'created_at']

In [91]:
# Feature engineering - you can add new features like transaction frequency or total amount per user
df['transaction_count'] = df.groupby('user_id')['id'].transform('count')
df['total_amount'] = df.groupby('user_id')['amount'].transform('sum')
df['average_amount'] = df.groupby('user_id')['amount'].transform('mean')

In [92]:
# Method for filling the null values
def fill_null_values(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column].fillna('unknown', inplace=True)
        elif df[column].dtype in ['int64', 'float64']:
            df[column].fillna(0, inplace=True)
            df[column][np.isnan(df[column])] = 0
    return df

In [93]:
df = fill_null_values(df)

In [94]:
# Function to calculate the difference in days from 'date_of_birth' to today
def calculate_day_difference(df, column_name):
    # Get today's date
    today = pd.Timestamp.today().normalize()  # Normalize to remove time
    
    try:
        # Convert the column to datetime format first
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
        
        # Calculate the difference in days
        df[column_name + '_days_since'] = (today - df[column_name]).dt.days
        
    except (TypeError, ValueError) as e:
        print(f"Error: {e}. Unable to convert column '{column_name}' to datetime.")

    return df

In [95]:
df = calculate_day_difference(df, 'date_of_birth')
df.head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,...,category_id,amount,currency,direction,completed_at,created_at,transaction_count,total_amount,average_amount,date_of_birth_days_since
0,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,b818b29c-cd13-4352-863d-f930de135c65,14.992188,EUR,OUT,unknown,2024-08-04 15:26:52.418093 UTC,11,139.226562,12.65696,8243
1,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,e464e1e2-40ca-416d-a541-ba0865e252f9,5.0,EUR,IN,2024-08-06 12:32:23.350767 UTC,2024-08-06 12:32:14.305982 UTC,11,139.226562,12.65696,8243
2,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,06bee557-499a-4d75-bc66-155ea0489a19,2.5,EUR,OUT,2024-08-20 11:05:34.973154 UTC,2024-08-20 11:05:34.973154 UTC,11,139.226562,12.65696,8243
3,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,b818b29c-cd13-4352-863d-f930de135c65,25.015625,EUR,OUT,2024-08-20 08:22:31.556082 UTC,2024-08-20 08:22:31.556082 UTC,11,139.226562,12.65696,8243
4,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,8c6dc9fc-af02-4754-a312-25d7d05c091b,11.726562,EUR,OUT,2024-08-13 13:07:10.712891 UTC,2024-08-13 13:07:10.712891 UTC,11,139.226562,12.65696,8243


In [96]:
# Fucntion to calculate process time
def calculate_process_time(df, start_column, end_column):
    # Replace NaN values with 0 for both start and end times
    df[start_column] = pd.to_datetime(df[start_column], errors='coerce')  # Ensure it's a datetime
    df[end_column] = pd.to_datetime(df[end_column], errors='coerce')      # Ensure it's a datetime
    
    # Calculate the process time, set to 0 where either start or end is NaN
    df['process_time'] = (df[end_column] - df[start_column]).dt.total_seconds()
    
    # Where start or end time is NaN, set process_time to 0
    df['process_time'] = df['process_time'].fillna(0)
    
    return df

# Usage
df = calculate_process_time(df, 'created_at', 'completed_at')

# Display the DataFrame with the calculated process times
df.head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,...,amount,currency,direction,completed_at,created_at,transaction_count,total_amount,average_amount,date_of_birth_days_since,process_time
0,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,14.992188,EUR,OUT,NaT,2024-08-04 15:26:52.418093+00:00,11,139.226562,12.65696,8243,0.0
1,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,5.0,EUR,IN,2024-08-06 12:32:23.350767+00:00,2024-08-06 12:32:14.305982+00:00,11,139.226562,12.65696,8243,9.044785
2,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,2.5,EUR,OUT,2024-08-20 11:05:34.973154+00:00,2024-08-20 11:05:34.973154+00:00,11,139.226562,12.65696,8243,0.0
3,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,25.015625,EUR,OUT,2024-08-20 08:22:31.556082+00:00,2024-08-20 08:22:31.556082+00:00,11,139.226562,12.65696,8243,0.0
4,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,11.726562,EUR,OUT,2024-08-13 13:07:10.712891+00:00,2024-08-13 13:07:10.712891+00:00,11,139.226562,12.65696,8243,0.0


In [97]:
scale_cols = ['amount','date_of_birth_days_since', 'process_time', 'transaction_count']


# Function to apply Min-Max scaling to specific columns of a DataFrame
def normalize_data(dataframe, columns_to_scale):
    """
    This function applies Min-Max scaling to the specified columns of a DataFrame.

    Parameters:
    dataframe (pd.DataFrame): The input DataFrame with the data.
    columns_to_scale (list): A list of column names to be scaled.

    Returns:
    pd.DataFrame: The DataFrame with scaled columns.
    """

    # Initialize MinMaxScaler
    scaler = MinMaxScaler()

    # Apply the scaler only to the specified columns
    dataframe[columns_to_scale] = scaler.fit_transform(dataframe[columns_to_scale])

    return dataframe

df = normalize_data(df, scale_cols)

# Display the first few rows of the scaled dataframe
df.head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,...,amount,currency,direction,completed_at,created_at,transaction_count,total_amount,average_amount,date_of_birth_days_since,process_time
0,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,0.000375,EUR,OUT,NaT,2024-08-04 15:26:52.418093+00:00,0.011641,139.226562,12.65696,0.067916,2.764266e-09
1,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,0.000125,EUR,IN,2024-08-06 12:32:23.350767+00:00,2024-08-06 12:32:14.305982+00:00,0.011641,139.226562,12.65696,0.067916,3.238875e-06
2,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,6.3e-05,EUR,OUT,2024-08-20 11:05:34.973154+00:00,2024-08-20 11:05:34.973154+00:00,0.011641,139.226562,12.65696,0.067916,2.764266e-09
3,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,0.000625,EUR,OUT,2024-08-20 08:22:31.556082+00:00,2024-08-20 08:22:31.556082+00:00,0.011641,139.226562,12.65696,0.067916,2.764266e-09
4,005256f3-53d4-471c-8bbd-b0c5e16bf2f7,882dd5b7-e378-4947-89e0-900a79b3083f,FR,True,33,TRAVEL_ACCOUNT,SERVICE_AND_SALES,COMPLETED,M,2002-03-09,...,0.000293,EUR,OUT,2024-08-13 13:07:10.712891+00:00,2024-08-13 13:07:10.712891+00:00,0.011641,139.226562,12.65696,0.067916,2.764266e-09


In [98]:
# Ensure 'total_amount' column is not using float16 by converting it to float64
df['total_amount'] = df['total_amount'].astype('float64')

# Create bins for 'total_amount'
total_amount_bins = [df['total_amount'].min(),
               1000,
               5000,
               df['total_amount'].mean(),
               df['total_amount'].mean() + df['total_amount'].std(),
               100000,
               df['total_amount'].max()]

# Split dataset acording to the bins
bins_df = pd.cut(df['total_amount'], bins=total_amount_bins)

# Create groups with bins
data_set_size = df.groupby(bins_df).size().reset_index(name='count')

print("Bins and sizes:")
print(data_set_size)

Bins and sizes:
             total_amount  count
0           (0.0, 1000.0]   1377
1        (1000.0, 5000.0]   1901
2     (5000.0, 14999.593]   1567
3  (14999.593, 43802.039]   1414
4   (43802.039, 100000.0]    246
5  (100000.0, 282938.594]    105


In [99]:
df['total_amount'] = pd.cut(df['total_amount'],
                      bins=total_amount_bins,
                      include_lowest=True,
                      duplicates='drop').cat.codes

In [100]:
# Ensure 'average_amount' column is not using float16 by converting it to float64
df['average_amount'] = df['average_amount'].astype('float64')

# Create bins for 'average_amount'
average_amount_bins = [df['average_amount'].min(),
               1,
               10,
               25,
               100,
               df['average_amount'].mean(),
               df['average_amount'].mean() + df['average_amount'].std(),
               5000,
               df['average_amount'].max()]

# Split dataset acording to the bins
bins_df = pd.cut(df['average_amount'], bins=average_amount_bins)

# Create groups with bins
data_set_size = df.groupby(bins_df).size().reset_index(name='count')

print("Bins and sizes:")
print(data_set_size)

Bins and sizes:
        average_amount  count
0           (0.0, 1.0]     23
1          (1.0, 10.0]    579
2         (10.0, 25.0]   1004
3        (25.0, 100.0]   3005
4     (100.0, 298.623]   1297
5  (298.623, 1409.668]    421
6   (1409.668, 5000.0]    192
7  (5000.0, 14301.143]     89


In [101]:
df['average_amount'] = pd.cut(df['average_amount'],
                      bins=average_amount_bins,
                      include_lowest=True,
                      duplicates='drop').cat.codes

NameError: name 'amount_bins' is not defined

In [None]:
df.iloc[:,15:].head()

In [None]:
location_cols = ['country_of_residence', 'phone_number_country_code', 'document_issuing_country', 'locale']
df[location_cols].head(10)

In [None]:
df['locale'] = df['locale'].str.split('_').str[1]
df['locale'] = df['locale'].replace('001', 'Global')


In [None]:
# Manual mapping of phone country codes to country abbreviations
country_code_to_abbr = {
    33: 'FR',  # France
    44: 'GB',  # United Kingdom
    1: 'US',   # United States/Canada
    39: 'IT',  # Italy
    34: 'ES',  # Spain
    68: 'Unknown',  # Undefined or not standard
    26: 'Unknown',  # Undefined or not standard
    59: 'Unknown',  # Undefined or not standard
    689: 'PF',  # French Polynesia
    687: 'NC',  # New Caledonia
    262: 'YT',  # Mayotte
    590: 'GP',  # Guadeloupe
    262: 'RE',  # Réunion
    351: 'PT',  # Portugal
    40: 'RO',  # Romania
}

# Function to convert country codes to country abbreviations
def convert_country_code_to_abbr(country_code):
    # Return the country abbreviation from the dictionary, or 'Unknown' if not found
    return country_code_to_abbr.get(country_code, 'Unknown')

In [None]:
# Apply the conversion function to the 'phone_number_country_code' and 'country_of_residence' columns
df['phone_number_country_code'] = df['phone_number_country_code'].apply(convert_country_code_to_abbr)
df[location_cols].head(10)

In [None]:
# Function to map country to continent and then to numeric values
def country_to_continent(country_abbr):
    try:
        continent_code = pc.country_alpha2_to_continent_code(country_abbr)
        continent_name = {
            'AF': 'Africa',
            'NA': 'North America',
            'SA': 'South America',
            'AS': 'Asia',
            'EU': 'Europe',
            'OC': 'Oceania',
            'AN': 'Antarctica',
            'Global' : 'Global'
        }
        return continent_name[continent_code]
    except KeyError:
        return 'Unknown'

In [None]:
# Function to convert continent names to numeric codes
def continent_to_numeric(continent):
    continent_map = {'Africa': 1, 'Asia': 2, 'Europe': 3, 'North America': 4, 'Oceania': 5, 
                     'South America': 6, 'Antarctica': 7, 'Global': 10, 'unknown': 100}
    
    return continent_map.get(continent, 9)

In [None]:
# Main function to process the DataFrame
def process_dataframe(df):
    # Keep 'country_of_residence'
    # The 'country_of_residence' column is already in the DataFrame
    
    # Find continents from 'country_of_residence' and convert continents to numeric values
    df['continent_of_residence'] = df['country_of_residence'].apply(country_to_continent)
    df['continent_code'] = df['continent_of_residence'].apply(continent_to_numeric)
    
    # Compare 'phone_number_country_code' with 'country_of_residence' and set 1 if they match, 0 otherwise
    df['phone_number_country_code'] = df.apply(lambda row: 1 if row['phone_number_country_code'] == row['country_of_residence'] else 0, axis=1)
    
    # Step 4: Compare 'document_issuing_country' with 'country_of_residence' and set 1 if they match, 0 otherwise
    df['document_issuing_country'] = df.apply(lambda row: 1 if row['document_issuing_country'] == row['country_of_residence'] else 0, axis=1)
    
    # Step 5: Compare 'locale' with 'country_of_residence' and set 1 if they match, 0 otherwise
    df['locale'] = df.apply(lambda row: 1 if row['locale'] == row['country_of_residence'] else 0, axis=1)
    
    return df

In [None]:
# Process the DataFrame
df = process_dataframe(df)
df[['country_of_residence',	'phone_number_country_code',	'document_issuing_country', 'locale', 'continent_code'	]].head(10)

In [None]:
# Function to get dummies for specified categorical columns
def one_hot_encode(df, categoric_cols):
    """
    This function takes a DataFrame and a list of categorical columns,
    and returns the DataFrame with the specified columns one-hot encoded.
    """
    # Apply pd.get_dummies() to the specified categorical columns
    df_encoded = pd.get_dummies(df, columns=categoric_cols, drop_first=False)  # drop_first=False keeps all dummy columns
    
    return df_encoded


In [None]:
# Call one got encoding fucntion with df and dedicated features
df_2 = one_hot_encode(df, categoric_cols)

In [None]:
df_2.columns

In [None]:
df.shape, df_2.shape

In [None]:
df_2.iloc[:,:20].head()

In [None]:
df_3 = df_2.drop(columns=drop_cols, errors='ignore')  # errors='ignore' ensures no error if column not found
df_3.shape, df_2.shape

In [None]:
df_3.columns

In [None]:
df_3 = reduce_mem_usage(df_3)

In [None]:
# Method for ordering the data
def move_target_to_end(df, target_column):
    """
    Move the target column to the end of the DataFrame.

    Parameters:
    - df: pandas DataFrame
    - target_column: str, the name of the target column

    Returns:
    - df: pandas DataFrame, updated DataFrame
    """

    # Ensure the target column is in the DataFrame
    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in the DataFrame.")
        return df

    # Reorder columns to move the target column to the end
    new_order = [col for col in df.columns if col != target_column] + [target_column]
    df = df[new_order]

    return df
df_3 = move_target_to_end(df_3, 'user_current_state')
df_3.info()

In [None]:
df_3.columns

In [None]:
%%time
# Calculating correlation
#corr_df = corr_df[cols_4_corr].copy()
df_corr = df_3[df_3.columns.to_list()[:9] + df.columns.to_list()[10:] + [df.columns.to_list()[9]]]

corr = df_corr.corr()
cr = corr.copy()
top_corr_columns = []
#Determine best correlate columns over 0.1
top_corr_columns = cr.loc[:, 'response'][:-1]
best_accurate_columns = top_corr_columns[abs(top_corr_columns) > 0.1].sort_values(ascending=False)


len(best_accurate_columns)
     

In [46]:
# Method for correlation control
def drop_highly_correlated_columns(df, threshold):
    # Create correlation matrix
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > float(threshold))]

    # Drop features
    df.drop(to_drop, axis=1, inplace=True)
    return df  # Return None if no high correlation is found

In [47]:
X = df_3.drop(columns=['user_current_state'])  # features (independent variables)
y = df_3['user_current_state'].apply(lambda x: 1 if x == 'SUSPENDED' else 0)  # SUSPENDED=1, ACTIVE=0
# X = drop_highly_correlated_columns(X, threshold)

In [48]:
X.columns

Index(['amount', 'transaction_count', 'total_amount', 'average_amount',
       'date_of_birth_days_since', 'process_time', 'continent_code',
       'country_of_residence_FR', 'country_of_residence_GP',
       'country_of_residence_NC', 'country_of_residence_PF',
       'country_of_residence_RE', 'country_of_residence_YT',
       'has_biometrics_False', 'has_biometrics_True',
       'phone_number_country_code_0', 'phone_number_country_code_1',
       'reason_to_use_app_CRYPTO_SERVICES',
       'reason_to_use_app_DAILY_PAYMENT_SERVICES',
       'reason_to_use_app_INTERNATIONAL_TRANSFERS',
       'reason_to_use_app_NON_CUSTODIAL_CRYPTO_WALLET',
       'reason_to_use_app_TRAVEL_ACCOUNT', 'occupation_ADMINISTRATION',
       'occupation_AGRICULTURE_FISHERY_FORESTRY',
       'occupation_CRAFT_AND_RELATED_TRADES', 'occupation_EDUCATION',
       'occupation_FOOD_PREP_AND_CLEANING', 'occupation_HEALTH',
       'occupation_IT_AND_COMMUNICATION',
       'occupation_LEGAL_SOCIAL_AND_CULTURAL_ACTIVI

In [49]:
X = drop_highly_correlated_columns(X, 0.7)
X.shape

(6622, 67)

In [50]:

# Function to write columns to config.ini
def write_columns_to_config(X1, config_file='../config.ini'):
    # Initialize the configparser
    config = configparser.ConfigParser()

    # Create a list of column names from X1.columns
    corr_cols = ','.join(X1.columns)  # Join column names as a comma-separated string

    # Add [params] section if it doesn't exist
    config['params'] = {}
    config['params']['corr_cols'] = corr_cols

    # Write the configuration to a file
    with open(config_file, 'w') as configfile:
        config.write(configfile)

# Usage of write function
write_columns_to_config(X)
corr_cols = X.columns


In [51]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Model Training**

## RandomForestClassifier


In [52]:
# Defining hyperparameter values for the Random Forest Classifier

n_estimators = [100, 250, 500]            # Number of trees in the forest (using 700 trees)
max_depth = [50, 40, 30, 20]             # Maximum depth of each tree (testing depths of 10 and 8)
min_samples_split = [10, 5, 3, 2]     # Minimum number of samples required to split an internal node (testing values 10 and 5)
min_samples_leaf = [5, 4, 3, 2]       # Minimum number of samples required to be a leaf node (testing values 5 and 3)

# Creating a dictionary that stores these hyperparameter options
hyper_random = {
    "n_estimators": n_estimators,              # Number of trees to be used
    "max_depth": max_depth,                    # Maximum tree depths to be used
    "min_samples_split": min_samples_split,    # Minimum samples required for splitting a node
    "min_samples_leaf": min_samples_leaf       # Minimum samples required for a leaf node
}


In [53]:
%%time
# Perform Grid Search to find the best hyperparameters for Random Forest Classifier

clf_rf_tuned = GridSearchCV(RandomForestClassifier(),   # The model being tuned is a Random Forest Classifier
                            hyper_random,               # The hyperparameter grid defined earlier
                            cv=5,                       # 5-fold cross-validation
                            verbose=1,                  # Verbosity level (1 = prints progress)
                            n_jobs=-1)                  # Use all available CPU cores to parallelize the work

# Fit the model on the training data
clf_rf_tuned.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
CPU times: user 2.11 s, sys: 611 ms, total: 2.72 s
Wall time: 1min 13s


In [54]:

best_params_random = clf_rf_tuned.best_params_
print(best_params_random)

{'max_depth': 40, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [55]:
# Parameter settings
CV_clf_rf = RandomForestClassifier(max_depth=best_params_random["max_depth"],
                                   min_samples_leaf=best_params_random["min_samples_leaf"],
                                   min_samples_split=best_params_random["min_samples_split"],
                                   n_estimators= best_params_random["n_estimators"])

In [56]:
# Fit the model (CV_clf_rf) on the training data
CV_clf_rf.fit(X_train, y_train)

# Predict the probabilities for the test data (specifically the probability of class 1)
y_test_predict_random = CV_clf_rf.predict_proba(X_test)[:, 1]

# Predict the class labels for the test data
yhat_random = CV_clf_rf.predict(X_test)

# Compute the calibration curve (fraction of positives vs. mean predicted value)
fraction_of_positives, mean_predicted_value = calibration_curve(y_test,
                                                                y_test_predict_random,
                                                                n_bins=10)

In [57]:
# Print the classification report of the model's predictions on the test data
print(classification_report(y_test, yhat_random))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       943
           1       0.91      0.90      0.90       382

    accuracy                           0.94      1325
   macro avg       0.93      0.93      0.93      1325
weighted avg       0.94      0.94      0.94      1325



In [58]:
result_df = X_test.copy()
result_df['user_current_state'] = y_test
result_df['rf_pred'] = yhat_random
result_df.head()

Unnamed: 0,amount,transaction_count,total_amount,date_of_birth_days_since,process_time,continent_code,country_of_residence_FR,country_of_residence_GP,country_of_residence_NC,country_of_residence_PF,...,category_id_b2ad0633-7bc7-40b9-8128-c483bf6f0c5b,category_id_b818b29c-cd13-4352-863d-f930de135c65,category_id_d4eb5429-7628-42ec-a838-2af8bd189eaa,category_id_d4f711bf-fb5d-45a4-aa30-a2c953b25352,category_id_e2b1df1c-c8d9-47f5-9937-2c40bf0bc514,category_id_f8aaa857-9086-463e-b531-9aaca3a93745,category_id_fc2a7dc9-3fab-47f5-b5f0-54641865d557,direction_IN,user_current_state,rf_pred
96,0.011253,0.156006,2,0.162842,5.960464e-08,3,True,False,False,False,...,False,False,False,False,False,False,False,True,0,0
994,0.0,0.010475,0,0.507324,0.0,5,False,False,True,False,...,True,False,False,False,False,False,False,False,0,0
1400,1.5e-05,0.22937,0,0.002718,1.370907e-05,3,True,False,False,False,...,False,False,False,False,False,False,False,True,1,1
865,5e-05,0.100098,1,0.421875,0.0,1,False,False,False,False,...,False,True,False,False,False,False,False,False,0,0
6097,0.000375,0.123413,2,0.105103,3.576279e-07,3,True,False,False,False,...,False,True,False,False,False,False,False,False,0,0


In [59]:
# Print the shape of the result_df dataset
print("Dataset shape = ", result_df.shape[0])

# For Random Forest predictions
print("For Random Forest :")
# Count of True Negatives: instances where actual label is 0 and predicted as 0 by Random Forest
print("Total negative case = ", result_df[result_df.user_current_state == 0].shape[0])
print("Correct : ",result_df[(result_df.user_current_state == 0) & (result_df.rf_pred == 0)].shape[0])

# Count of False Positives: instances where actual label is 0 but predicted as 1 by Random Forest
print("False : ",result_df[(result_df.user_current_state == 0) & (result_df.rf_pred == 1)].shape[0])

print("Total positive case = ", result_df[result_df.user_current_state == 1].shape[0])

# Count of True Positives: instances where actual label is 1 and predicted as 1 by Random Forest
print("Correct : ",result_df[(result_df.user_current_state == 1) & (result_df.rf_pred == 1)].shape[0])

# Count of False Negatives: instances where actual label is 1 but predicted as 0 by Random Forest
print("False : ",result_df[(result_df.user_current_state == 1) & (result_df.rf_pred == 0)].shape[0])


Dataset shape =  1325
For Random Forest :
Total negative case =  943
Correct :  909
False :  34
Total positive case =  382
Correct :  343
False :  39


## GaussianNaiveBias


In [60]:
%%time

#Uncalibrated Gaussian Naive Bayes classifier
clf_nb = GaussianNB()  # Initialize a Gaussian Naive Bayes classifier
clf_nb.fit(X_train, y_train)  # Train the model on the training data

y_test_predict_nb = clf_nb.predict_proba(X_test)[:, 1]  # Get probability predictions for the test set
yhat_nb = clf_nb.predict(X_test)  # Get class predictions for the test set

fraction_of_positives_nb, mean_predicted_value_nb = calibration_curve(y_test, y_test_predict_nb, n_bins=10)



# Calibrated  with isotonic NB
clf_sigmoid_nb = CalibratedClassifierCV(clf_nb, cv=10, method='isotonic')  # Initialize isotonic calibration
clf_sigmoid_nb.fit(X_train, y_train)  # Calibrate the classifier using 10-fold cross-validation

y_test_predict_nb_calib = clf_sigmoid_nb.predict_proba(X_test)[:, 1]  # Get calibrated probability predictions
yhat_calibrated_nb = clf_sigmoid_nb.predict(X_test)  # Get calibrated class predictions

fraction_of_positives_nb_calib, mean_predicted_value_nb_calib = calibration_curve(y_test, y_test_predict_nb_calib, n_bins=10)

CPU times: user 234 ms, sys: 7.33 ms, total: 241 ms
Wall time: 242 ms


In [61]:
print(classification_report(y_test, yhat_nb))


              precision    recall  f1-score   support

           0       0.99      0.40      0.57       943
           1       0.40      0.99      0.57       382

    accuracy                           0.57      1325
   macro avg       0.70      0.70      0.57      1325
weighted avg       0.82      0.57      0.57      1325



In [62]:
print(classification_report(y_test, yhat_calibrated_nb))


              precision    recall  f1-score   support

           0       0.71      1.00      0.83       943
           1       0.00      0.00      0.00       382

    accuracy                           0.71      1325
   macro avg       0.36      0.50      0.42      1325
weighted avg       0.51      0.71      0.59      1325



In [63]:

result_df['Naive_Bias_Pred'] = yhat_nb
result_df['Isotonic_Calibrated_Naive_Bias_Pred'] = yhat_calibrated_nb

result_df.head()

Unnamed: 0,amount,transaction_count,total_amount,date_of_birth_days_since,process_time,continent_code,country_of_residence_FR,country_of_residence_GP,country_of_residence_NC,country_of_residence_PF,...,category_id_d4eb5429-7628-42ec-a838-2af8bd189eaa,category_id_d4f711bf-fb5d-45a4-aa30-a2c953b25352,category_id_e2b1df1c-c8d9-47f5-9937-2c40bf0bc514,category_id_f8aaa857-9086-463e-b531-9aaca3a93745,category_id_fc2a7dc9-3fab-47f5-b5f0-54641865d557,direction_IN,user_current_state,rf_pred,Naive_Bias_Pred,Isotonic_Calibrated_Naive_Bias_Pred
96,0.011253,0.156006,2,0.162842,5.960464e-08,3,True,False,False,False,...,False,False,False,False,False,True,0,0,1,0
994,0.0,0.010475,0,0.507324,0.0,5,False,False,True,False,...,False,False,False,False,False,False,0,0,0,0
1400,1.5e-05,0.22937,0,0.002718,1.370907e-05,3,True,False,False,False,...,False,False,False,False,False,True,1,1,1,0
865,5e-05,0.100098,1,0.421875,0.0,1,False,False,False,False,...,False,False,False,False,False,False,0,0,0,0
6097,0.000375,0.123413,2,0.105103,3.576279e-07,3,True,False,False,False,...,False,False,False,False,False,False,0,0,1,0


## Apply the Model to Unlabeled Data

In [64]:
# Call read data func
file_path = '../data/HomeTask _ [3] User Information Unlabelled.csv'
unlabelled_user_info_df = read_csv_file(file_path)

if unlabelled_user_info_df is not None:
    print(unlabelled_user_info_df.shape)  # Display dataframe shape if data was successfully loaded

File read successfully!
(300, 14)


In [65]:
# Drop duplicate rows from 'labelled_user_info_df'
unlabelled_user_info_df = unlabelled_user_info_df.drop_duplicates()
print(unlabelled_user_info_df.shape)

(282, 14)


In [66]:
# Call remove_nan_rows
unlabelled_cleaned_user_info_df = remove_nan_rows(unlabelled_user_info_df, 'user_id')
print(unlabelled_cleaned_user_info_df.shape)

(248, 14)


In [67]:
# Call remove_nan_rows
unlabelled_cleaned_user_info_df = remove_nan_rows(unlabelled_user_info_df, 'user_id')
print(unlabelled_cleaned_user_info_df.shape)

(248, 14)


In [68]:
# Call read data func
file_path = '../data/HomeTask _ [4] TransactionData Unlabelled.csv'
unlabelled_transaction_df = read_csv_file(file_path)

if unlabelled_transaction_df is not None:
    print(unlabelled_transaction_df.shape)  # Display shape of dataframe if data was successfully loaded

File read successfully!
(9849, 10)


In [69]:
# Merge the date_of_birth column from labelled_user_info_df to unlabelled_cleaned_user_info_df based on 'user_id'
unlabelled_cleaned_user_info_df = unlabelled_cleaned_user_info_df.merge(
    cleaned_user_info_df[['user_id', 'date_of_birth']],  # Select only the 'user_id' and 'date_of_birth' columns
    on='user_id',  # Join on 'user_id'
    how='left'  # Perform a left join to retain all records from unlabeled_user_info_df
)

In [70]:
# Merge the category_id column from labelled_transaction_df to unlabelled_transaction_df based on 'reference_transaction_id'
unlabelled_transaction_df = unlabelled_transaction_df.merge(
    labelled_transaction_df[['reference_transaction_id', 'category_id']],  # Select only the 'reference_transaction_id' and 'category_id' columns
    on='reference_transaction_id',  # Join on 'reference_transaction_id'
    how='left'  # Perform a left join to retain all records from unlabeled_user_info_df
)

In [71]:
# Merge both datasets on 'user_id'
df_test = pd.merge(unlabelled_cleaned_user_info_df, unlabelled_transaction_df, on='user_id')
df_test.shape

(7611, 25)

In [72]:
# Function to move a column in a DataFrame
def move_column(df, col_idx_from, col_idx_to):
    # Get the column name at col_idx_from
    col_name = df.columns[col_idx_from]
    
    # Drop the column and insert it at the new position
    cols = list(df.columns)
    cols.insert(col_idx_to, cols.pop(col_idx_from))
    
    # Re-arrange the DataFrame columns
    df = df[cols]
    
    return df

In [73]:
df_test = move_column(df_test, 14, 9)


In [75]:
df_test.iloc[:,:15].head()

Unnamed: 0,user_id,session_id,country_of_residence,has_biometrics,phone_number_country_code,reason_to_use_app,occupation,kyc_state,gender,date_of_birth,document_issuing_country,locale,screening_state,is_adverse_media_minor,is_pep
0,8b9cdf59-96ce-43b8-bb72-8f5c405894f4,956f9bd1-6605-4f2c-b710-a64c31d84bec,FR,True,33,DAILY_PAYMENT_SERVICES,IT_AND_COMMUNICATION,COMPLETED,M,,FR,fr_FR,COMPLETED,False,False
1,8b9cdf59-96ce-43b8-bb72-8f5c405894f4,956f9bd1-6605-4f2c-b710-a64c31d84bec,FR,True,33,DAILY_PAYMENT_SERVICES,IT_AND_COMMUNICATION,COMPLETED,M,,FR,fr_FR,COMPLETED,False,False
2,8b9cdf59-96ce-43b8-bb72-8f5c405894f4,956f9bd1-6605-4f2c-b710-a64c31d84bec,FR,True,33,DAILY_PAYMENT_SERVICES,IT_AND_COMMUNICATION,COMPLETED,M,,FR,fr_FR,COMPLETED,False,False
3,e409d12e-2b1e-44af-a08d-510d2fb016b2,835d0560-4767-4f01-8a8e-109780286976,FR,True,33,DAILY_PAYMENT_SERVICES,SERVICE_AND_SALES,COMPLETED,M,,IT,fr_BR,TO_SOFT_REVIEW,False,False
4,e409d12e-2b1e-44af-a08d-510d2fb016b2,835d0560-4767-4f01-8a8e-109780286976,FR,True,33,DAILY_PAYMENT_SERVICES,SERVICE_AND_SALES,COMPLETED,M,,IT,fr_BR,TO_SOFT_REVIEW,False,False


In [76]:
df_test.date_of_birth.unique()

array([nan], dtype=object)

In [None]:
df_test = move_column(df_test, 25, 19)

In [None]:
# Assuming df_test is your DataFrame
check_nan_in_columns(df_test, nan_control_cols)

In [None]:
df_test = reduce_mem_usage(df_test)


In [None]:
# Feature engineering - you can add new features like transaction frequency or total amount per user
df_test['transaction_count'] = df_test.groupby('user_id')['id'].transform('count')
df_test['total_amount'] = df_test.groupby('user_id')['amount'].transform('sum')
df_test['average_amount'] = df_test.groupby('user_id')['amount'].transform('mean')

In [None]:
df_test = fill_null_values(df_test)

In [None]:
df_test = calculate_day_difference(df_test, 'date_of_birth')
df_test.head()

In [None]:
# Calculate process time
df_test = calculate_process_time(df_test, 'created_at', 'completed_at')

# Display the DataFrame with the calculated process times
df_test.head()

In [None]:
df_test = normalize_data(df_test, scale_cols)

# Display the first few rows of the scaled dataframe
df_test.head()

In [None]:
# total_amount_bins and average_amount_bins
df_test['total_amount'] = pd.cut(df_test['total_amount'],
                      bins=total_amount_bins,
                      include_lowest=True,
                      duplicates='drop').cat.codes

df_test['average_amount'] = pd.cut(df_test['average_amount'],
                      bins=average_amount_bins,
                      include_lowest=True,
                      duplicates='drop').cat.codes

In [None]:
df_test['locale'] = df_test['locale'].str.split('_').str[1]
df_test['locale'] = df_test['locale'].replace('001', 'Global')

In [None]:
# Apply the conversion function to the 'phone_number_country_code' and 'country_of_residence' columns
df_test['phone_number_country_code'] = df_test['phone_number_country_code'].apply(convert_country_code_to_abbr)
df_test[location_cols].head(10)

In [None]:
# Process the DataFrame
df_test = process_dataframe(df_test)
df_test[['country_of_residence',	'phone_number_country_code',	'document_issuing_country', 'locale', 'continent_code'	]].head(10)

In [None]:
# Call one got encoding fucntion with df and dedicated features
df_test_2 = one_hot_encode(df_test, categoric_cols)

In [None]:
df_test_2.columns

In [None]:
df_test_3 = df_test_2.drop(columns=drop_cols, errors='ignore')  # errors='ignore' ensures no error if column not found
df_test_3.shape, df_test_2.shape

In [None]:
df_test_3.columns

In [None]:
df_test_3 = reduce_mem_usage(df_test_3)

In [None]:
df_test_3

## BURDAYIM

***train ve labeled'ın kolonları tutmuyor***

In [None]:
df_test_3[corr_cols]

In [None]:
df_test_4 = df_test_3[corr_cols].copy()

In [None]:
# Merge the date_of_birth column from labelled_user_info_df to unlabelled_cleaned_user_info_df based on 'user_id'
unlabelled_cleaned_user_info_df = unlabelled_cleaned_user_info_df.merge(
    cleaned_user_info_df[['user_id', 'date_of_birth']],  # Select only the 'user_id' and 'date_of_birth' columns
    on='user_id',  # Join on 'user_id'
    how='left'  # Perform a left join to retain all records from unlabeled_user_info_df
)

In [None]:
# Merge the category_id column from labelled_transaction_df to unlabelled_transaction_df based on 'reference_transaction_id'
unlabelled_transaction_df = unlabelled_transaction_df.merge(
    labelled_transaction_df[['reference_transaction_id', 'category_id']],  # Select only the 'reference_transaction_id' and 'category_id' columns
    on='reference_transaction_id',  # Join on 'reference_transaction_id'
    how='left'  # Perform a left join to retain all records from unlabeled_user_info_df
)

In [None]:
# Load the unlabeled data
# Assuming you have loaded the unlabeled datasets into unlabeled_transaction_df and unlabeled_user_info_df

# Merge the datasets
unlabeled_df = pd.merge(unlabelled_transaction_df, unlabelled_cleaned_user_info_df, on='user_id')

# Use the same feature engineering steps as the training data
unlabeled_df['transaction_count'] = unlabeled_df.groupby('user_id')['id'].transform('count')
unlabeled_df['total_amount'] = unlabeled_df.groupby('user_id')['amount'].transform('sum')
unlabeled_df['average_amount'] = unlabeled_df.groupby('user_id')['amount'].transform('mean')

# Drop any unnecessary columns
X_unlabeled = unlabeled_df.drop(columns=['user_id'])  # Remove ID column

# Make predictions on the unlabeled data
unlabeled_df['predicted_suspended'] = clf_pipeline.predict(X_unlabeled)

# Filter out customers predicted to be SUSPENDED
suspended_customers = unlabeled_df[unlabeled_df['predicted_suspended'] == 1]

# Generate a list of customers for off-boarding
offboarding_customers = suspended_customers[['user_id', 'transaction_count', 'total_amount', 'average_amount']]

print("Customers to be off-boarded based on predicted risk:")
print(offboarding_customers)


In [None]:
offboarding_customers.head()

In [None]:
unlabeled_df.head()

In [None]:
unlabeled_df.predicted_suspended.value_counts()