In [None]:
# Import libraries
import numpy as np  # for numerical operations.

import pandas as pd  # for data manipulation and analysis.

In [None]:
# Load Train Dataset
# Passing a list to `sheet_name` makes pandas open and parse the workbook once and
# return a {sheet name: DataFrame} dict, instead of re-reading the whole file for
# every individual sheet.
_train_sheets = pd.read_excel(
    'MFD Train.xlsx',
    sheet_name=['Train_Provider', 'Train_Beneficiary_Data',
                'Train_Inpatient_Data', 'Train_Outpatient_Data'],
)
Train_Provider = _train_sheets['Train_Provider']  # Provider Train Data.
Train_Beneficiarydata = _train_sheets['Train_Beneficiary_Data']  # Beneficiary Train Data.
Train_Inpatientdata = _train_sheets['Train_Inpatient_Data']  # Inpatient Train Data.
Train_Outpatientdata = _train_sheets['Train_Outpatient_Data']  # Outpatient Train Data.

# Load Test Dataset (same single-pass approach)
_test_sheets = pd.read_excel(
    'MFD Test.xlsx',
    sheet_name=['Test_Provider', 'Test_Beneficiary_Data',
                'Test_Inpatient_Data', 'Test_Outpatient_Data'],
)
Test_Provider = _test_sheets['Test_Provider']  # Provider Test Data.
Test_Beneficiarydata = _test_sheets['Test_Beneficiary_Data']  # Beneficiary Test Data.
Test_Inpatientdata = _test_sheets['Test_Inpatient_Data']  # Inpatient Test Data.
Test_Outpatientdata = _test_sheets['Test_Outpatient_Data']  # Outpatient Test Data.

In [None]:
# Print shapes of train dataframes
# Each entry pairs the caption to print with the (rows, columns) tuple of one raw frame.
train_shape_report = [
    ('Shape of Train data :', Train_Provider.shape),
    ('Shape of Train_Beneficiarydata data :', Train_Beneficiarydata.shape),
    ('Shape of Train_Inpatientdata data :', Train_Inpatientdata.shape),
    ('Shape of Train_Outpatientdata data :', Train_Outpatientdata.shape),
]
for caption, shape in train_shape_report:
    print(caption, shape)
 

In [None]:
# Check basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after each report.
Test_Provider.info()
Test_Beneficiarydata.info()
Test_Inpatientdata.info()
Test_Outpatientdata.info()

In [None]:
# Missing values summary 
def missing_values_summary(df, name):
    """Print the columns of ``df`` that contain missing values.

    For every column with at least one null, the report shows the null count
    ('Total Missing') and that count as a percentage of the number of rows
    ('Percent Missing'). Columns without nulls are left out. Nothing is returned.

    Args:
        df: DataFrame to inspect.
        name: Label used in the report heading.
    """
    null_counts = df.isnull().sum()
    report = pd.DataFrame({
        'Total Missing': null_counts,
        'Percent Missing': (null_counts / len(df)) * 100
    })
    has_missing = report['Total Missing'] > 0

    print(f"\nMissing Values Summary for {name}:")
    print(report[has_missing])

# Test: report missing values for every raw test table.
test_frames = {
    'Test_Provider': Test_Provider,
    'Test_Beneficiarydata': Test_Beneficiarydata,
    'Test_Inpatientdata': Test_Inpatientdata,
    'Test_Outpatientdata': Test_Outpatientdata,
}
# Iterate over the labels (insertion order) and look the frame up by label.
for frame_label in test_frames:
    missing_values_summary(test_frames[frame_label], frame_label)

In [None]:
# Train: report missing values for every raw train table.
train_frames = {
    'Train_Provider': Train_Provider,
    'Train_Beneficiarydata': Train_Beneficiarydata,
    'Train_Inpatientdata': Train_Inpatientdata,
    'Train_Outpatientdata': Train_Outpatientdata,
}
# Iterate over the labels (insertion order) and look the frame up by label.
for frame_label in train_frames:
    missing_values_summary(train_frames[frame_label], frame_label)

In [None]:
# Drop columns that are mostly missing (missing share >= threshold) or that have zero variance
def drop_zero_info_cols(df, threshold=0.90, retain_cols=None):
    """Return a copy of ``df`` without its near-empty and constant columns.

    A column is dropped when either
      * its share of null values is greater than or equal to ``threshold``, or
      * it has at most one distinct non-null value (``nunique() <= 1``),
    unless the column is listed in ``retain_cols``.

    Args:
        df: Input DataFrame; it is not modified.
        threshold: Minimum null fraction (0-1) at which a column is dropped.
        retain_cols: Optional iterable of column names that must never be
            dropped. ``None`` (the default) protects nothing.

    Returns:
        A new DataFrame without the dropped columns.
    """
    # `None` default instead of a shared mutable list.
    protected = set(retain_cols) if retain_cols is not None else set()

    missing_ratio = df.isnull().mean()
    mostly_missing = set(missing_ratio[missing_ratio >= threshold].index)

    # Zero-variance columns: nunique() ignores nulls, so an all-null column counts too.
    zero_variance = {col for col in df.columns if df[col].nunique() <= 1}

    droppable = mostly_missing | zero_variance

    # Walk the columns in frame order so the printed list is deterministic.
    cols_to_drop = [col for col in df.columns if col in droppable and col not in protected]

    print(f"Dropping {len(cols_to_drop)} columns: {cols_to_drop}")

    return df.drop(columns=cols_to_drop)


retain_cols = ['OtherPhysician']

# The same threshold and protected columns are used for all four claim tables.
drop_settings = {'threshold': 0.90, 'retain_cols': retain_cols}

train_inpatient_cleaned = drop_zero_info_cols(Train_Inpatientdata, **drop_settings)
train_outpatient_cleaned = drop_zero_info_cols(Train_Outpatientdata, **drop_settings)

test_inpatient_cleaned = drop_zero_info_cols(Test_Inpatientdata, **drop_settings)
test_outpatient_cleaned = drop_zero_info_cols(Test_Outpatientdata, **drop_settings)


In [None]:
# Check categorical columns
# Object-dtype columns of the cleaned train claim tables are treated as categorical.
categorical_cols_inpatient = train_inpatient_cleaned.select_dtypes(include=['object']).columns
categorical_cols_outpatient = train_outpatient_cleaned.select_dtypes(include=['object']).columns

for caption, column_index in (
    ("Inpatient Categorical Columns:", categorical_cols_inpatient),
    ("Outpatient Categorical Columns:", categorical_cols_outpatient),
):
    print(caption, column_index)

In [None]:
# Print no of entries missing and corresponding percentages
def check_null_empty_percentage(df, cat_cols, name=''):
    """Print, per column, how many entries are null or an empty string.

    Args:
        df: DataFrame to inspect.
        cat_cols: Iterable of column names to report on. Names that are not
            columns of ``df`` are reported as absent instead of raising.
        name: Label used in the report heading.

    Returns:
        None. The report is written to stdout.
    """
    print(f"{name} - Missing/Empty Summary:")
    total = len(df)
    for col in cat_cols:
        if col not in df.columns:
            # The column list may have been derived from a different frame
            # (e.g. train columns checked against a test frame).
            print(f"{col}: column not present")
            continue
        nulls = df[col].isnull().sum()
        empties = (df[col] == '').sum()
        total_missing = nulls + empties
        # Guard the division so an empty frame reports 0% instead of dividing by zero.
        percent = (total_missing / total) * 100 if total else 0.0
        print(f"{col}: {total_missing} missing/empty ({percent:.2f}%)")
    print('-' * 50)

# One entry per report: (frame, columns to check, heading label).
# Test frames are checked against the column list taken from the matching train frame.
null_checks = [
    (train_inpatient_cleaned, categorical_cols_inpatient, 'Train Inpatient'),
    (test_inpatient_cleaned, categorical_cols_inpatient, 'Test Inpatient'),
    (train_outpatient_cleaned, categorical_cols_outpatient, 'Train Outpatient'),
    (test_outpatient_cleaned, categorical_cols_outpatient, 'Test Outpatient'),
]
for check_args in null_checks:
    check_null_empty_percentage(*check_args)


In [None]:
def impute(df):
    """Return an imputed copy of a claim or beneficiary table.

    Only columns that are present are touched:
      * 'RenalDiseaseIndicator': 'Y'/'N' mapped to 1/0, nulls filled with 0, cast to int.
      * Columns whose name contains 'ClmDiagnosisCode' or 'ClmAdmitDiagnosisCode':
        nulls filled with the column mode ('Unknown' if the column has no mode).
      * 'OperatingPhysician' / 'OtherPhysician': nulls become 'Unknown',
        empty strings become 'Missing'.
      * 'AttendingPhysician': nulls filled with the column mode ('Unknown' if none).

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the imputations applied.
    """
    # Work on a copy so the caller's frame (e.g. a raw table) is left untouched
    # and calling the function again on the same input gives the same result.
    df = df.copy()

    # Renal Disease Indicator Encoding
    if 'RenalDiseaseIndicator' in df.columns:
        df['RenalDiseaseIndicator'] = (
            df['RenalDiseaseIndicator']
            .replace({'Y': 1, 'N': 0, 0: 0, 1: 1})
            .fillna(0)
            .astype(int)
        )

    # Mode imputation for Diagnosis Codes
    diagnosis_cols = [col for col in df.columns if 'ClmDiagnosisCode' in col or 'ClmAdmitDiagnosisCode' in col]
    for col in diagnosis_cols:
        modes = df[col].mode()  # computed once; empty when the column is all-null
        df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else 'Unknown')

    # OperatingPhysician & OtherPhysician: nulls -> 'Unknown', then '' -> 'Missing'
    for col in ('OperatingPhysician', 'OtherPhysician'):
        if col in df.columns:
            df[col] = df[col].fillna('Unknown').replace('', 'Missing')

    # AttendingPhysician: fill with mode
    if 'AttendingPhysician' in df.columns:
        modes = df['AttendingPhysician'].mode()
        df['AttendingPhysician'] = df['AttendingPhysician'].fillna(
            modes.iloc[0] if not modes.empty else 'Unknown'
        )

    return df



In [None]:
# Column lists and imputation
physician_cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']

# ClmDiagnosisCode_1 .. ClmDiagnosisCode_9
diagnosis_cols_inpatient = [f'ClmDiagnosisCode_{i}' for i in range(1, 10)]

# ClmDiagnosisCode_1 .. ClmDiagnosisCode_5 plus the admit diagnosis code
diagnosis_cols_outpatient = [f'ClmDiagnosisCode_{i}' for i in range(1, 6)] + ['ClmAdmitDiagnosisCode']

# Inpatient
train_inpatient_cleaned, test_inpatient_cleaned = (
    impute(train_inpatient_cleaned),
    impute(test_inpatient_cleaned),
)

# Outpatient
train_outpatient_cleaned, test_outpatient_cleaned = (
    impute(train_outpatient_cleaned),
    impute(test_outpatient_cleaned),
)

# Beneficiary Data
train_beneficiary_cleaned, test_beneficiary_cleaned = (
    impute(Train_Beneficiarydata),
    impute(Test_Beneficiarydata),
)


In [None]:
# Quick glance at the first rows of the test provider table
# (the bare last expression is rendered as the cell output).
Test_Provider.head()

In [None]:
# Preview the first rows of the test beneficiary table.
Test_Beneficiarydata.head()

In [None]:
# Preview the first rows of the cleaned, imputed test inpatient claims.
test_inpatient_cleaned.head()

In [None]:
# Preview the first rows of the cleaned, imputed test outpatient claims.
test_outpatient_cleaned.head()

In [None]:
# Preview the first rows of the cleaned, imputed train inpatient claims.
train_inpatient_cleaned.head()

In [None]:
# Preview the first rows of the cleaned, imputed train outpatient claims.
train_outpatient_cleaned.head()

In [None]:
# shapes of Test dataframes, followed by shapes of Train dataframes
# Each entry pairs the caption to print with the frame's (rows, columns) tuple.
cleaned_shape_report = [
    ('Shape of Test_Provider:', Test_Provider.shape),
    ('Shape of Test_Beneficiarydata:', test_beneficiary_cleaned.shape),
    ('Shape of test_inpatient_cleaned:', test_inpatient_cleaned.shape),
    ('Shape of test_outpatient_cleaned:', test_outpatient_cleaned.shape),
    ('Shape of Train_Provider:', Train_Provider.shape),
    ('Shape of Train_Beneficiarydata:', train_beneficiary_cleaned.shape),
    ('Shape of train_inpatient_cleaned:', train_inpatient_cleaned.shape),
    ('Shape of train_outpatient_cleaned:', train_outpatient_cleaned.shape),
]
for caption, shape in cleaned_shape_report:
    print(caption, shape)

In [None]:
# shape and sample of the Train and Test provider tables
train_provider_sample = Train_Provider.head()
test_provider_sample = Test_Provider.head()

print("Train Shape: {}\n".format(Train_Provider.shape))
print("Train Sample:\n", train_provider_sample)

print("\nTest Shape: {}\n".format(Test_Provider.shape))
print("Test Sample:\n", test_provider_sample)


In [None]:
# Checking for duplicates
def check_provider_duplicates(df, dataset_name):
    """Print how often each 'Provider' value occurs and whether any repeats.

    If ``df`` has no 'Provider' column a one-line notice is printed and the
    function returns immediately. Otherwise the full value counts are printed,
    followed by either the providers occurring more than once or a
    "no duplicates" message.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed messages.
    """
    if 'Provider' not in df.columns:
        print(f"'{dataset_name}' dataset does not have a 'Provider' column.")
        return

    banner = '=' * 40

    # Occurrences per provider, and the subset that appears more than once
    counts = df['Provider'].value_counts()
    repeated = counts[counts > 1]

    print(f"\n{banner}")
    print(f"{dataset_name} Provider Counts:")
    print(counts)

    if repeated.empty:
        print(f"\nNo duplicates found in {dataset_name} Provider column.")
    else:
        print(f"\nDuplicates found in {dataset_name} Provider column.")
        print(f"Providers with multiple entries:")
        print(repeated)
    print(f"{banner}")

# Run the duplicate check on the Train provider table first, then on Test.
provider_tables = {"Train": Train_Provider, "Test": Test_Provider}
for dataset_label in provider_tables:
    check_provider_duplicates(provider_tables[dataset_label], dataset_label)


In [None]:
# Grouping columns by data types
import inspect

def group_columns_by_dtype(df, name=None):
    """Print the columns of ``df`` grouped by their dtype.

    Args:
        df: DataFrame to summarise.
        name: Optional label for the report heading. When omitted, the label is
            guessed by looking in the caller's local variables for the first
            name bound to this exact object; 'DataFrame' is used if none is
            found. The guess can return any alias of the frame, so pass
            ``name`` when a specific label is wanted.

    Returns:
        None. The report is written to stdout.
    """
    if name is None:
        # Best-effort lookup of a variable name in the calling frame.
        # currentframe() may return None on interpreters without frame support.
        frame = inspect.currentframe()
        caller = frame.f_back if frame is not None else None
        caller_vars = list(caller.f_locals.items()) if caller is not None else []
        matches = [var_name for var_name, var_val in caller_vars if var_val is df]
        name = matches[0] if matches else 'DataFrame'
        # Drop frame references promptly to avoid keeping reference cycles alive.
        del frame, caller, caller_vars

    print(f"\n{'='*40}")
    print(f"Data Types Summary for {name}:")
    print(f"{'='*40}\n")

    # Get unique data types in dataframe
    data_types = df.dtypes.unique()

    # Loop over each unique dtype
    for dtype in data_types:
        # Get column names for current data type
        cols = df.select_dtypes(include=[dtype]).columns.tolist()

        print(f"Data Type: {dtype}")
        print(f"Columns ({len(cols)} columns): {cols}\n")

# Dictionary of datasets: display label -> frame to summarise.
datasets = {
    "Train_Provider": Train_Provider,
    "Test_Provider": Test_Provider,
    "Train_Beneficiarydata": train_beneficiary_cleaned,
    "Test_Beneficiarydata": test_beneficiary_cleaned,
    "Train_Inpatientdata": train_inpatient_cleaned,
    "Test_Inpatientdata": test_inpatient_cleaned,
    "Train_Outpatientdata": train_outpatient_cleaned,
    "Test_Outpatientdata": test_outpatient_cleaned,
}

# Loop through datasets and print the dtype summary of each frame.
# The dict keys (`name`) are not passed on: group_columns_by_dtype() is called with
# the frame only and works out the heading label by itself.
# Note: `name` and `df` are ordinary module-level variables here, so after the loop
# `df` stays bound to the last frame in `datasets`.
for name, df in datasets.items():
    group_columns_by_dtype(df)


In [None]:
# Merging Datasets
def merge_datasets(train_inpatient_cleaned, test_inpatient_cleaned,
                   train_outpatient_cleaned, test_outpatient_cleaned,
                   train_beneficiary_cleaned, test_beneficiary_cleaned,
                   provider_data=None):
    """Build one combined claim table for Train and one for Test.

    Each claim table is left-joined to its beneficiary table on 'BeneID'; the
    inpatient and outpatient results are then stacked (inpatient rows first)
    with a fresh 0..n-1 index. When ``provider_data`` is given it is left-joined
    onto the Train table on 'Provider'; the Test table never receives it.

    Returns:
        Tuple ``(train_combined, test_combined)``.
    """
    def _with_beneficiary(claims, beneficiaries):
        # Left join: every claim row is kept, beneficiary columns are added.
        return claims.merge(beneficiaries, on='BeneID', how='left')

    train_parts = [
        _with_beneficiary(train_inpatient_cleaned, train_beneficiary_cleaned),
        _with_beneficiary(train_outpatient_cleaned, train_beneficiary_cleaned),
    ]
    test_parts = [
        _with_beneficiary(test_inpatient_cleaned, test_beneficiary_cleaned),
        _with_beneficiary(test_outpatient_cleaned, test_beneficiary_cleaned),
    ]

    # Stack inpatient on top of outpatient, separately for Train and Test
    train_combined = pd.concat(train_parts, axis=0, ignore_index=True)
    test_combined = pd.concat(test_parts, axis=0, ignore_index=True)

    if provider_data is None:
        return train_combined, test_combined

    # Provider table (only applicable for Train)
    return train_combined.merge(provider_data, on='Provider', how='left'), test_combined


In [None]:
# Call the function
Train_Combined, Test_Combined = merge_datasets(
    train_inpatient_cleaned=train_inpatient_cleaned,
    test_inpatient_cleaned=test_inpatient_cleaned,
    train_outpatient_cleaned=train_outpatient_cleaned,
    test_outpatient_cleaned=test_outpatient_cleaned,
    train_beneficiary_cleaned=train_beneficiary_cleaned,
    test_beneficiary_cleaned=test_beneficiary_cleaned,
    provider_data=Train_Provider,  # Only provider data for Train, leave Test without provider
)
# Check outputs
print("Train Combined Shape: {}".format(Train_Combined.shape))
print("Test Combined Shape: {}".format(Test_Combined.shape))

Train_Combined.head()


In [None]:
# Preview the first rows of the combined test table.
Test_Combined.head()

In [None]:
# Count rows that are exact duplicates of an earlier row in each combined table
train_duplicate_rows = Train_Combined.duplicated().sum()
test_duplicate_rows = Test_Combined.duplicated().sum()

print(f"Number of duplicate rows in Train: {train_duplicate_rows}")
print(f"Number of duplicate rows in Test: {test_duplicate_rows}")

In [None]:
# Date format
# Parse the date columns of both combined tables. The frames are named explicitly
# so this cell does not depend on a variable left over from an earlier loop.
# Values that cannot be parsed become NaT (errors='coerce').
date_cols = ['ClaimStartDt', 'ClaimEndDt', 'DOB', 'DOD']
for combined in (Train_Combined, Test_Combined):
    for col in date_cols:
        if col in combined.columns:
            combined[col] = pd.to_datetime(combined[col], errors='coerce')

In [None]:
# Preview the first rows of the combined test table again at this point in the notebook.
Test_Combined.head()

In [None]:
# Show the first 100 rows of the combined train table.
# NOTE(review): 100 rows makes for a large cell output; a smaller preview may be enough.
Train_Combined.head(100)

In [None]:
# Inspect the ID-like columns before encoding: cardinality, most frequent values,
# and how many rows carry one of the placeholder strings 'Unknown' / 'Missing'.
cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'Provider']

for col in cols:
    column_values = Train_Combined[col]
    unknown_rows = (column_values == 'Unknown').sum()
    missing_rows = (column_values == 'Missing').sum()

    print(f"\nColumn: {col}")
    print(f"Unique values count: {column_values.nunique()}")
    print(f"Top 5 most frequent values:\n{column_values.value_counts().head()}")
    print(f"Number of missing/unknown: {unknown_rows + missing_rows}")

In [None]:
# Compare the number of distinct values per ID column between Train and Test.
# The columns are listed explicitly so the cell does not depend on a loop variable
# left over from an earlier cell, and every column is reported rather than one.
for col in ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'Provider']:
    print(f"{col} Unique Values - Train: {Train_Combined[col].nunique()}, Test: {Test_Combined[col].nunique()}")

In [None]:
# Label Encoding (for ordered categories or IDs like Physicians, Providers)
from sklearn.preprocessing import LabelEncoder

label_cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'Provider']

for col in label_cols:
    # Work on string versions of the column so every value is comparable.
    train_text = Train_Combined[col].astype(str)
    test_text = Test_Combined[col].astype(str)

    # One encoder fitted on Train and Test values together, so both tables
    # share the same value -> integer mapping.
    le = LabelEncoder().fit(pd.concat([train_text, test_text]))

    # Replace the original values with their integer codes in both tables
    Train_Combined[col] = le.transform(train_text)
    Test_Combined[col] = le.transform(test_text)

print("Label Encoding completed successfully!")

In [None]:
# List every column currently in the combined train table.
print(list(Train_Combined.columns))

In [None]:
# List every column currently in the combined test table.
print(list(Test_Combined.columns))

In [None]:
# One-hot encode the low-cardinality categorical columns
categorical_cols = ['Gender', 'Race', 'RenalDiseaseIndicator', 'PotentialFraud']

# Only encode the columns each table actually has
cols_to_encode_train = [col for col in categorical_cols if col in Train_Combined.columns]
cols_to_encode_test = [col for col in categorical_cols if col in Test_Combined.columns]

# Pin Test's category levels to the ones observed in Train. With drop_first=True the
# dropped baseline is the first level *present in that table*; encoding Train and Test
# independently can therefore drop different levels, after which identically named
# dummy columns would no longer mean the same thing. pd.Categorical(...).categories
# yields Train's levels in the order get_dummies uses for Train itself.
train_levels = {
    col: pd.Categorical(Train_Combined[col]).categories
    for col in cols_to_encode_test
    if col in cols_to_encode_train
}
Test_Combined = Test_Combined.assign(**{
    col: pd.Categorical(Test_Combined[col], categories=levels)
    for col, levels in train_levels.items()
})

# For Train Data
Train_Combined = pd.get_dummies(Train_Combined, columns=cols_to_encode_train, drop_first=True)

# For Test Data (values unseen in Train are encoded as all-zero dummies)
Test_Combined = pd.get_dummies(Test_Combined, columns=cols_to_encode_test, drop_first=True)

# Align columns: Test is reshaped to Train's column set; columns Test lacks are filled with 0
Train_Combined, Test_Combined = Train_Combined.align(Test_Combined, join='left', axis=1, fill_value=0)

In [None]:
# Encoding
from sklearn.preprocessing import LabelEncoder

# Geographic codes, the diagnosis group / admit codes, nine diagnosis codes and
# two procedure codes are label-encoded next.
label_cols_remaining = (
    ['State', 'County', 'DiagnosisGroupCode', 'ClmAdmitDiagnosisCode']
    + [f'ClmDiagnosisCode_{i}' for i in range(1, 10)]
    + [f'ClmProcedureCode_{i}' for i in range(1, 3)]
)

for col in label_cols_remaining:
    # String versions of the column; nulls turn into a literal string here and
    # are encoded like any other value.
    train_text = Train_Combined[col].astype(str)
    test_text = Test_Combined[col].astype(str)

    # One encoder fitted on Train and Test values together for a shared mapping
    le = LabelEncoder().fit(pd.concat([train_text, test_text]))

    # Replace the original values with their integer codes in both tables
    Train_Combined[col] = le.transform(train_text)
    Test_Combined[col] = le.transform(test_text)

print("Remaining Label Encoding completed successfully!")

In [None]:
# Claim duration
# For each combined table: parse the claim start/end dates, store them back, and
# derive the number of whole days between them.
for claim_frame in (Train_Combined, Test_Combined):
    claim_start = pd.to_datetime(claim_frame['ClaimStartDt'])
    claim_end = pd.to_datetime(claim_frame['ClaimEndDt'])

    claim_frame['ClaimStartDt'] = claim_start
    claim_frame['ClaimEndDt'] = claim_end
    claim_frame['Claim_Duration'] = (claim_end - claim_start).dt.days

In [None]:
# Length of Stay (Inpatient specific)
# Unparseable or absent admission/discharge dates become NaT, so the derived
# length of stay is missing for those rows.
for claim_frame in (Train_Combined, Test_Combined):
    admitted = pd.to_datetime(claim_frame['AdmissionDt'], errors='coerce')
    discharged = pd.to_datetime(claim_frame['DischargeDt'], errors='coerce')

    claim_frame['AdmissionDt'] = admitted
    claim_frame['DischargeDt'] = discharged
    claim_frame['Length_of_Stay'] = (discharged - admitted).dt.days

In [None]:
# Age
# Computed as the difference between the calendar year of the claim start and the
# calendar year of birth (month and day are not taken into account).
for claim_frame in (Train_Combined, Test_Combined):
    birth_date = pd.to_datetime(claim_frame['DOB'], errors='coerce')

    claim_frame['DOB'] = birth_date
    claim_frame['Age'] = claim_frame['ClaimStartDt'].dt.year - birth_date.dt.year

In [None]:
# Deceased Flag
# 1 when a date of death is recorded AND the claim starts after it, otherwise 0.
for claim_frame in (Train_Combined, Test_Combined):
    claim_frame['DOD'] = pd.to_datetime(claim_frame['DOD'], errors='coerce')

    has_death_date = claim_frame['DOD'].notnull()
    starts_after_death = claim_frame['ClaimStartDt'] > claim_frame['DOD']
    claim_frame['Deceased_Flag'] = (has_death_date & starts_after_death).astype(int)

In [None]:
# Claim month and year, both taken from the claim start date
for claim_frame in (Train_Combined, Test_Combined):
    claim_start = claim_frame['ClaimStartDt']
    claim_frame['Claim_Month'] = claim_start.dt.month
    claim_frame['Claim_Year'] = claim_start.dt.year

In [None]:
# Number of Diagnoses & Procedures per Claim
# Counts the non-null entries per row across the code columns.
# NOTE(review): the label-encoding cell earlier in this notebook replaces these same
# columns with integer codes (after astype(str)), so no nulls appear to remain here
# and both counts would be constant for every row — confirm, and consider computing
# these counts before the codes are imputed/encoded.
diagnosis_code_cols = [f'ClmDiagnosisCode_{i}' for i in range(1, 10)]
procedure_code_cols = [f'ClmProcedureCode_{i}' for i in range(1, 3)]

Train_Combined['Num_Diagnoses'] = Train_Combined[diagnosis_code_cols].notna().sum(axis=1)
Test_Combined['Num_Diagnoses'] = Test_Combined[diagnosis_code_cols].notna().sum(axis=1)

Train_Combined['Num_Procedures'] = Train_Combined[procedure_code_cols].notna().sum(axis=1)
Test_Combined['Num_Procedures'] = Test_Combined[procedure_code_cols].notna().sum(axis=1)

In [None]:
# Chronic conditions sum
# Every Train column whose name contains 'ChronicCond_' is summed row-wise; the
# same column list is applied to Test.
chronic_cols = list(filter(lambda column: 'ChronicCond_' in column, Train_Combined.columns))

for claim_frame in (Train_Combined, Test_Combined):
    claim_frame['Chronic_Cond_Sum'] = claim_frame[chronic_cols].sum(axis=1)

In [None]:
# Average of the inpatient (IP) and outpatient (OP) annual amounts, per row
for claim_frame in (Train_Combined, Test_Combined):
    claim_frame['Avg_Deductible'] = (
        claim_frame['IPAnnualDeductibleAmt'].add(claim_frame['OPAnnualDeductibleAmt']).div(2)
    )

for claim_frame in (Train_Combined, Test_Combined):
    claim_frame['Avg_Reimbursement'] = (
        claim_frame['IPAnnualReimbursementAmt'].add(claim_frame['OPAnnualReimbursementAmt']).div(2)
    )

In [None]:
# Number of Train claims per beneficiary
# The counts come from Train only; a Test beneficiary that never appears in Train gets 0.
beneficiary_claim_count = Train_Combined['BeneID'].value_counts().to_dict()


def _train_claim_count(bene_id):
    """Return the Train claim count for ``bene_id`` (0 when unseen in Train)."""
    return beneficiary_claim_count.get(bene_id, 0)


Train_Combined['Bene_Claim_Count'] = Train_Combined['BeneID'].map(beneficiary_claim_count)
Test_Combined['Bene_Claim_Count'] = Test_Combined['BeneID'].map(_train_claim_count)

In [None]:
# Previous fraud indicator
# Per provider: the maximum of the Train target column, i.e. this feature is
# derived directly from the label. Providers not present in Train get 0 in Test.
provider_fraud = Train_Combined.groupby('Provider')['PotentialFraud_Yes'].max()


def _provider_fraud_flag(provider):
    """Return the Train-derived fraud flag for ``provider`` (0 when unseen in Train)."""
    return provider_fraud.get(provider, 0)


Train_Combined['Provider_Prev_Fraud'] = Train_Combined['Provider'].map(provider_fraud)
Test_Combined['Provider_Prev_Fraud'] = Test_Combined['Provider'].map(_provider_fraud_flag)

In [None]:
# One-hot encoding for claim month
# Same encoding options for both tables; the first month level is dropped as baseline.
month_dummy_options = {'columns': ['Claim_Month'], 'prefix': 'Month', 'drop_first': True}

Train_Combined = pd.get_dummies(Train_Combined, **month_dummy_options)
Test_Combined = pd.get_dummies(Test_Combined, **month_dummy_options)

# Reshape Test to Train's column set; columns Test lacks are filled with 0
Train_Combined, Test_Combined = Train_Combined.align(Test_Combined, join='left', axis=1, fill_value=0)

In [None]:
# Drop date / identifier / leakage columns, make sure the claim month is one-hot
# encoded exactly once, and save the result.

# Only derive Claim_Month when no Month_* dummy columns exist yet. Re-deriving it from
# ClaimStartDt after the month has already been encoded would append a second,
# duplicate set of Month_* columns to both tables.
month_already_encoded = any(str(col).startswith('Month_') for col in Train_Combined.columns)
if not month_already_encoded:
    if 'ClaimStartDt' in Train_Combined.columns:
        Train_Combined['Claim_Month'] = Train_Combined['ClaimStartDt'].dt.month
    if 'ClaimStartDt' in Test_Combined.columns:
        Test_Combined['Claim_Month'] = Test_Combined['ClaimStartDt'].dt.month

# Columns to drop (dates + identifiers + high-leakage features)
cols_to_drop = [
    'ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt',
    'DOB', 'DOD', 'ClaimID', 'BeneID',
    'Provider_Prev_Fraud'  # <--- Dropping the leakage feature
]

# Drop from Train & Test if columns exist (reassigned instead of inplace=True)
Train_Combined = Train_Combined.drop(columns=[col for col in cols_to_drop if col in Train_Combined.columns])
Test_Combined = Test_Combined.drop(columns=[col for col in cols_to_drop if col in Test_Combined.columns])

print("Date, ID, and leakage columns dropped successfully!")

# Encode 'Claim_Month' if a raw column is (still) present
if 'Claim_Month' in Train_Combined.columns:
    Train_Combined = pd.get_dummies(Train_Combined, columns=['Claim_Month'], prefix='Month', drop_first=True)
if 'Claim_Month' in Test_Combined.columns:
    Test_Combined = pd.get_dummies(Test_Combined, columns=['Claim_Month'], prefix='Month', drop_first=True)

# Align columns after encoding
Train_Combined, Test_Combined = Train_Combined.align(Test_Combined, join='left', axis=1, fill_value=0)

print("Claim_Month encoding completed!")

# Save to new CSV files
Train_Combined.to_csv('Train_Cleaned_Encoded.csv', index=False)
Test_Combined.to_csv('Test_Cleaned_Encoded.csv', index=False)

print("New cleaned & encoded files saved successfully!")


In [None]:
from sklearn.preprocessing import StandardScaler

# Load encoded cleaned data
train_df = pd.read_csv('Train_Cleaned_Encoded.csv')
test_df = pd.read_csv('Test_Cleaned_Encoded.csv')

# Separate target
y = train_df['PotentialFraud_Yes']  # Assuming binary 0/1
X_train = train_df.drop(columns='PotentialFraud_Yes')

# The saved test file can carry a placeholder 'PotentialFraud_Yes' column (added when
# Test was aligned to Train's columns). Drop it if present so X_test has the same
# feature columns as X_train; errors='ignore' makes this a no-op when it is absent.
X_test = test_df.drop(columns='PotentialFraud_Yes', errors='ignore')

In [None]:
from sklearn.preprocessing import StandardScaler

# Load cleaned & encoded datasets
train_df = pd.read_csv('Train_Cleaned_Encoded.csv')
test_df = pd.read_csv('Test_Cleaned_Encoded.csv')

# Split the target off the train features
target_col = 'PotentialFraud_Yes'
y = train_df[target_col]
X_train = train_df.drop(columns=target_col)

# Test features: the target column is removed when present, otherwise a plain copy results
X_test = test_df.drop(columns=target_col, errors='ignore')

# Normalization: statistics are learned from the train features and applied to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Normalization successful!")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Split for validation (stratified so both parts keep the class ratio)
X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Train Random Forest (as an example)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_tr, y_tr)

# Evaluate
y_pred = rf.predict(X_val)
print(classification_report(y_val, y_pred))

# ROC AUC measures how well the model *ranks* samples, so it must be computed from
# the predicted probability of the positive class. Scoring the hard 0/1 predictions
# collapses the curve to a single operating point and misreports the AUC.
y_val_proba = rf.predict_proba(X_val)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_val, y_val_proba))

In [None]:
import matplotlib.pyplot as plt

# Feature Importance
# Importances from the fitted forest, labelled with the train feature names;
# the 20 largest are drawn as a horizontal bar chart.
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top_20_features = feature_importances.nlargest(20)
top_20_features.plot.barh()
plt.title("Top 20 Important Features")
plt.show()

In [None]:
# Predict on the scaled test features
test_preds = rf.predict(X_test_scaled)

# Translate the predicted class into the 'Yes' / 'No' label used in the output file
fraud_labels = ['Yes' if pred == 1 else 'No' for pred in test_preds]

# Prepare submission (no IDs, so index-based submission)
submission = pd.DataFrame({'Index': test_df.index, 'PotentialFraud': fraud_labels})

submission.to_csv('Final_Submission_Normalized.csv', index=False)
print("Submission file saved successfully!")

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score

# The CV splitter and the metric are defined here so that this cell runs on a fresh
# kernel from top to bottom; it must not rely on names created by later cells.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = make_scorer(f1_score, pos_label=True)

# Impute missing values with the median of each column
imputer = SimpleImputer(strategy='median')

# Create a pipeline with imputer and logistic regression
# (the imputer is re-fitted inside every CV fold as part of the pipeline)
model_lr = make_pipeline(imputer, LogisticRegression(max_iter=1000))

# Perform cross-validation with the imputed data
cv_scores_lr = cross_val_score(model_lr, X_train_scaled, y, cv=skf, scoring=scoring, n_jobs=-1, verbose=1)

print(f"Logistic Regression - Cross-Validated F1 Scores: {cv_scores_lr}")
print(f"Logistic Regression - Mean F1 Score: {cv_scores_lr.mean():.4f}")

In [None]:
%pip install xgboost

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Define the XGBoost model with class-imbalance handling.
# XGBoost has no class_weight option; the equivalent knob is scale_pos_weight, which
# is set to (number of negatives) / (number of positives). A value of 1 would apply
# no re-weighting at all.
n_positive = int((y == 1).sum())
n_negative = int(len(y) - n_positive)
pos_weight = n_negative / n_positive if n_positive else 1
model_xgb = XGBClassifier(scale_pos_weight=pos_weight, random_state=42, use_label_encoder=False, eval_metric='logloss')

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric: F1 of the positive (fraud) class
scoring = make_scorer(f1_score, pos_label=True)

# Perform cross-validation
cv_scores_xgb = cross_val_score(model_xgb, X_train_scaled, y, cv=skf, scoring=scoring, n_jobs=-1)

# Print the results
print(f"XGBoost - Cross-Validated F1 Scores: {cv_scores_xgb}")
print(f"XGBoost - Mean F1 Score: {cv_scores_xgb.mean():.4f}")

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score

# LightGBM classifier; class_weight='balanced' re-weights the classes to counter imbalance
model_lgb = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

# 5-fold stratified, shuffled cross-validation with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric: F1 of the positive class
scoring = make_scorer(f1_score, pos_label=True)

# One F1 score per fold
cv_scores_lgb = cross_val_score(
    estimator=model_lgb,
    X=X_train_scaled,
    y=y,
    cv=skf,
    scoring=scoring,
    n_jobs=-1,
)

# Report the per-fold scores and their mean
print("LightGBM - Cross-Validated F1 Scores: {}".format(cv_scores_lgb))
print("LightGBM - Mean F1 Score: {:.4f}".format(cv_scores_lgb.mean()))

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the random forest's validation predictions
cm = confusion_matrix(y_val, y_pred)

# Render it as a heat map with 'False' / 'True' tick labels
disp = ConfusionMatrixDisplay(cm, display_labels=['False', 'True'])
disp.plot(cmap='Blues')
plt.title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# Precision Recall Curve
from sklearn.metrics import precision_recall_curve

# Predicted probability of the positive class on the validation split
y_scores = rf.predict_proba(X_val)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, y_scores)

# precision_recall_curve returns one more precision/recall value than thresholds,
# so the last point of each curve is left out when plotting against the thresholds.
for curve_values, curve_label in ((precisions, 'Precision'), (recalls, 'Recall')):
    plt.plot(thresholds, curve_values[:-1], label=curve_label)

plt.xlabel('Threshold')
plt.legend()
plt.title('Precision-Recall vs Threshold')
plt.show()

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Print classification report (based on the hard class predictions)
print("Classification Report:\n", classification_report(y_val, y_pred))

# Print ROC AUC score. It is computed from the predicted probability of the positive
# class: ROC AUC is a ranking metric and is misreported when fed hard 0/1 labels.
print(f"ROC AUC Score: {roc_auc_score(y_val, rf.predict_proba(X_val)[:, 1]):.4f}")

In [None]:
# Unpack the 2x2 confusion matrix row by row:
# first row = actual negatives (tn, fp), second row = actual positives (fn, tp).
(tn, fp), (fn, tp) = cm

# Share of actual negatives predicted negative / share of actual positives predicted positive
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

print("Specificity (True Negative Rate): {:.2f}".format(specificity))
print("Sensitivity (Recall / True Positive Rate): {:.2f}".format(sensitivity))

In [None]:
#ROC and AUC Curve
from sklearn.metrics import roc_curve, auc

# ROC curve points and the area under them, from the validation probabilities
fpr, tpr, thresholds = roc_curve(y_val, y_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Precision-Recall Curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision/recall pairs over all thresholds, plus the average precision summary
precision, recall, thresholds = precision_recall_curve(y_val, y_scores)
average_precision = average_precision_score(y_val, y_scores)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, lw=2, label=f'AP = {average_precision:.2f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
plt.show()