In [28]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC


In [29]:
# Read the two CSV files: the credit-card records and their labels.
df = pd.read_csv('Credit_Card.csv')
label_df = pd.read_csv('Credit_card_label.csv')

# Join records to labels on the shared Ind_ID column (default inner join).
merged_df = df.merge(label_df, on='Ind_ID')

In [30]:
def clean_data(merged_df: pd.DataFrame,
               numerical_cols: List[str] = ['Annual_income', 'Birthday_count', 'Employed_days'],
               categorical_cols: List[str] = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income',
                                              'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation'],
               occupation_col: str = 'Type_Occupation') -> pd.DataFrame:
    """Impute, encode and scale the merged frame; return a new, model-ready frame.

    Steps, in order:
      1. Fill missing values in ``numerical_cols`` with each column's median.
      2. Drop rows whose ``occupation_col`` is missing.
      3. Fill missing ``GENDER`` values with the most frequent value.
      4. One-hot encode ``categorical_cols`` (first level of each is dropped).
      5. Replace +/-inf and any remaining NaN with 0, then cast every column to int.
      6. Standardise ``numerical_cols`` with ``StandardScaler``.

    The input frame is not modified; all work happens on a copy.

    Args:
        merged_df: Frame containing at least ``numerical_cols``,
            ``categorical_cols``, ``occupation_col`` and a ``GENDER`` column.
        numerical_cols: Columns to median-impute and standardise. The default
            list is only read, never mutated.
        categorical_cols: Columns to one-hot encode. The default list is only
            read, never mutated.
        occupation_col: Column whose missing values cause the row to be dropped.

    Returns:
        The encoded frame. ``numerical_cols`` hold standardised floats; every
        other column is int.

    Raises:
        KeyError: If a referenced column is absent from ``merged_df``.

    NOTE(review): the scaler is fitted on every row passed in, i.e. before any
    train/test split performed by the caller.
    """
    # Work on a copy so the caller's frame keeps its rows and missing values.
    cleaned = merged_df.copy()

    # Impute on the frame being cleaned (not on a notebook-level global), so the
    # medians actually reach the returned data.
    for col in numerical_cols:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # .copy() detaches the filtered frame so the column assignment below does
    # not trigger chained-assignment warnings.
    cleaned = cleaned.dropna(subset=[occupation_col]).copy()

    gender_mode = cleaned['GENDER'].mode()
    if not gender_mode.empty:  # mode() is empty when every value is missing
        cleaned['GENDER'] = cleaned['GENDER'].fillna(gender_mode[0])

    encoded_df = pd.get_dummies(cleaned, columns=categorical_cols, drop_first=True)

    encoded_df = encoded_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Casts every column (including the dummy indicators) to int; any
    # fractional part is truncated, and a leftover non-numeric column raises.
    encoded_df = encoded_df.astype(int)

    scaler = StandardScaler()
    encoded_df[numerical_cols] = scaler.fit_transform(encoded_df[numerical_cols])
    return encoded_df
# Hand clean_data a copy so merged_df stays exactly as loaded, whatever the
# function does to its argument; this keeps the cell safe to re-run.
encoded_df = clean_data(merged_df.copy())
encoded_df

Unnamed: 0,Ind_ID,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Family_Members,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
8,5010864,1,2.047160,-0.909858,0.842207,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
9,5010868,1,2.047160,-0.909858,0.842207,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
10,5010869,1,2.047160,-0.909858,0.842207,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11,5018498,0,-0.878721,-1.110597,0.706227,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
12,5018501,0,-1.610191,-1.110597,0.706227,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,5118268,1,1.315690,0.867338,-0.357267,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0
1543,5028645,0,-1.610191,0.696052,0.210993,1,0,0,0,2,...,0,0,1,0,0,0,0,0,0,0
1544,5023655,0,0.218485,1.142482,0.619351,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1545,5115992,2,-0.147250,0.381638,0.087185,1,0,0,0,4,...,0,0,1,0,0,0,0,0,0,0


In [None]:
def apply_pca(encoded_df: pd.DataFrame, n_components: int = 10,
              random_state: int = 42) -> Tuple[pd.DataFrame, pd.Series]:
    """Project the feature columns of ``encoded_df`` onto principal components.

    Every column except ``label`` and ``Ind_ID`` is treated as a feature.

    Args:
        encoded_df: Fully numeric frame containing ``label`` and ``Ind_ID``
            columns alongside the features.
        n_components: Number of principal components to keep.
        random_state: Seed forwarded to ``PCA``; it only has an effect when
            scikit-learn selects a randomised solver.

    Returns:
        ``(X_pca, y)`` where ``X_pca`` is a frame with ``n_components`` columns
        (integer column labels) and ``y`` is the ``label`` column. Both share
        ``encoded_df``'s index.

    Raises:
        KeyError: If ``label`` or ``Ind_ID`` is missing from ``encoded_df``.
    """
    features = encoded_df.drop(columns=['label', 'Ind_ID'])
    pca = PCA(n_components=n_components, random_state=random_state)
    components = pca.fit_transform(features)

    # Reuse the input index: rows may have been dropped upstream, and a fresh
    # 0..n-1 index would no longer line up with the labels on index-aligned ops.
    X_pca = pd.DataFrame(components, index=encoded_df.index)
    return X_pca, encoded_df['label']

In [None]:
def train_logistic_regression_with_pca(encoded_df: pd.DataFrame, n_components: int = 10,
                                       test_size: float = 0.2, random_state: int = 42,
                                       max_iter: int = 1000) -> Tuple[float, str]:
    """Fit logistic regression on PCA-reduced features and score a hold-out split.

    Args:
        encoded_df: Frame accepted by ``apply_pca`` (numeric features plus
            ``label`` and ``Ind_ID`` columns).
        n_components: Number of principal components passed to ``apply_pca``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.
        max_iter: Iteration cap for the logistic-regression solver.

    Returns:
        ``(accuracy, report)``: the hold-out accuracy and the text
        classification report.

    NOTE(review): ``apply_pca`` fits PCA on all rows before the split, so the
    hold-out rows influence the projection. The split is also not stratified
    on the label.
    """
    X_pca, y = apply_pca(encoded_df, n_components=n_components)

    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y, test_size=test_size, random_state=random_state
    )

    log_reg = LogisticRegression(max_iter=max_iter)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0.0 for a class that is never predicted (same
    # numbers as the default) without emitting an undefined-metric warning.
    report = classification_report(y_test, y_pred, zero_division=0)

    return accuracy, report

# Evaluate with ten principal components, then print both metrics.
accuracy, report = train_logistic_regression_with_pca(encoded_df, n_components=10)
print(f"Accuracy: {accuracy}", f"Classification Report:\n{report}", sep="\n")


Accuracy: 0.8537735849056604
Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       181
           1       0.00      0.00      0.00        31

    accuracy                           0.85       212
   macro avg       0.43      0.50      0.46       212
weighted avg       0.73      0.85      0.79       212



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


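Using dimensionality reduction (PCA, here reduced to ten components) helped improve the accuracy of the model. I think the model did better than before because PCA removed redundant and highly correlated features, which reduced overfitting. This also let the model train faster and more efficiently. Note, however, that the classification report above shows the model never predicts class 1 (precision and recall of 0.00), so the 0.85 accuracy equals the share of class 0 in the test set (181 of 212).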