In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC  # Importing LinearSVC for Linear SVM
from sklearn.impute import SimpleImputer  # SimpleImputer

In [16]:
# Read the pre-filtered loan application data into a DataFrame.
DATASET_PATH = '/home/student/filtered_columns_dataset.csv'
loan_df = pd.read_csv(DATASET_PATH)

In [17]:
# loan_df.head()

# Data Cleaning

## Key Features:
TARGET: Indicator of whether the applicant defaulted on the loan (1 for default, 0 for non-default), crucial for supervised learning.<br/>
NAME_CONTRACT_TYPE: Type of loan (e.g., Cash loans or Revolving loans), important for determining loan structure.<br/>
AMT_INCOME_TOTAL: Total income of the applicant, crucial for determining their ability to repay the loan.<br/>
AMT_CREDIT: The amount of credit (loan) applied for, used in calculating loan range.<br/>
AMT_ANNUITY: Loan annuity (the regular payments), helpful for assessing repayment capacity.<br/>
AMT_GOODS_PRICE: The price of the goods for which the loan is taken, useful for understanding loan size.<br/>
CODE_GENDER: Gender of the applicant, might be relevant if gender-based loan assessment is being used.<br/>
FLAG_OWN_CAR: Indicates whether the applicant owns a car, a possible asset for creditworthiness.<br/>
FLAG_OWN_REALTY: Indicates if the applicant owns real estate, another asset for credit risk assessment.<br/>
CNT_CHILDREN: Number of children, which could influence income-to-expense ratios.<br/>
NAME_INCOME_TYPE: Type of income (e.g., Working, State servant), used to assess financial stability.<br/>
NAME_EDUCATION_TYPE: Level of education, which could impact income potential and financial literacy.<br/>
NAME_FAMILY_STATUS: Family status, used for understanding household structure and financial obligations.<br/>
DAYS_BIRTH: Age of the applicant (derived from days since birth), critical for demographic analysis.<br/>
DAYS_EMPLOYED: Duration of employment, helps in assessing job stability.<br/>
REGION_POPULATION_RELATIVE: Population of the region, useful for understanding economic background.<br/>
EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3: External risk assessment scores, typically useful in determining credit risk.<br/>
DAYS_LAST_PHONE_CHANGE: How long it has been since the applicant changed their phone, which could indicate stability or mobility.<br/>
CNT_FAM_MEMBERS: Number of family members; a larger family generally correlates with higher living costs, which can affect the disposable income available for loan repayment.<br/>
NAME_HOUSING_TYPE: Housing type, which can indicate the applicant's financial stability, long-term expenses, and asset ownership.

In [18]:
# Raw (upper-case) column names of interest.
# loan_df.columns, printed in a later cell, shows renamed lower-case columns,
# so this list cannot be used to index loan_df as loaded.
columns_to_keep = [
    # label and loan terms
    'TARGET',
    'NAME_CONTRACT_TYPE',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    # applicant profile
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    # region, contact history and household size
    'REGION_POPULATION_RELATIVE',
    'DAYS_LAST_PHONE_CHANGE',
    'CNT_FAM_MEMBERS',
    # external scores
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

In [20]:
# df_filtered = loan_df[columns_to_keep]

In [21]:
# Show the column labels actually present in the loaded frame.
print(loan_df.columns)

Index(['target', 'contract_type', 'income_total', 'credit_amount',
       'annuity_amount', 'gender', 'owns_car', 'owns_realty', 'children_count',
       'income_type', 'education_type', 'family_status', 'age_days',
       'employment_days', 'region_population_relative',
       'days_last_phone_change', 'family_members_count', 'ext_source_1',
       'ext_source_2', 'ext_source_3', 'age_years'],
      dtype='object')


In [None]:
# 1. Split income_total into three equal-frequency groups (terciles)
income_labels = ['Low', 'Medium', 'High']
loan_df['income_bin'] = pd.qcut(
    loan_df['income_total'],
    q=3,
    labels=income_labels,
)

In [None]:
# 2. Bucket age_years into (0, 30] -> Young, (30, 50] -> Middle-aged, (50, inf) -> Senior
age_edges = [0, 30, 50, float('inf')]
age_labels = ['Young', 'Middle-aged', 'Senior']
loan_df['age_bin'] = pd.cut(loan_df['age_years'], bins=age_edges, labels=age_labels)

In [None]:
# 3. Split credit_amount into four equal-frequency groups (quartiles)
credit_labels = ['Very Low', 'Low', 'Medium', 'High']
loan_df['credit_bin'] = pd.qcut(
    loan_df['credit_amount'],
    q=4,
    labels=credit_labels,
)

In [None]:
# 4. Quartile-bin annuity_amount. Rows without an annuity are left out of the
#    binning and receive NaN in annuity_bin through index alignment.
annuity_labels = ['Very Low', 'Low', 'Medium', 'High']
annuity_known = loan_df['annuity_amount'].dropna()
loan_df['annuity_bin'] = pd.qcut(annuity_known, q=4, labels=annuity_labels)

In [None]:
# 5. Bucket employment_days:
#    (-inf, -1000] -> Long-term, (-1000, -500] -> Mid-term, (-500, 0] -> Recently employed
# NOTE(review): values above 0 fall outside every bin and become NaN; confirm
# whether the data contains such values.
employment_edges = [-float('inf'), -1000, -500, 0]
employment_labels = ['Long-term', 'Mid-term', 'Recently employed']
loan_df['employment_bin'] = pd.cut(
    loan_df['employment_days'],
    bins=employment_edges,
    labels=employment_labels,
)

# Feature Engineering

In [None]:
# Columns to label-encode: the raw categorical fields plus the derived *_bin columns
raw_categoricals = [
    'contract_type', 'gender', 'owns_car', 'owns_realty',
    'income_type', 'education_type', 'family_status',
]
binned_features = ['income_bin', 'age_bin', 'credit_bin', 'annuity_bin', 'employment_bin']
categorical_columns = raw_categoricals + binned_features

In [None]:
# Fit one LabelEncoder per categorical column, overwrite the column with its
# integer codes, and keep the fitted encoder for later label <-> code lookups.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    # Cast to str first so the category-dtype bin columns can be encoded;
    # a missing value becomes the literal label 'nan' and gets its own code.
    column_as_text = loan_df[column].astype(str)
    loan_df[column] = encoder.fit_transform(column_as_text)
    label_encoders[column] = encoder

In [None]:
# Separate the label column from the predictors
y = loan_df['target']
X = loan_df.drop(columns=['target'])

In [None]:
# Columns to impute and standardize: genuinely numeric features only.
# The label-encoded categorical/bin columns are integer-typed at this point too,
# but their values are category codes. Treating them as numeric would have them
# mean-imputed and standardized, after which they no longer match the codes
# stored in label_encoders (which the rule-based cell compares against).
numeric_columns = [
    column
    for column in X.select_dtypes(include=['int64', 'float64']).columns
    if column not in categorical_columns
]

In [None]:
# Replace missing numeric values with the column mean.
# NOTE(review): the imputer is fitted on the full X, i.e. before the train/test
# split performed in a later cell, so test rows contribute to the imputed means.
imputer = SimpleImputer(strategy='mean')  # alternatives: 'median', 'most_frequent', ...
numeric_block = X[numeric_columns]
X[numeric_columns] = imputer.fit_transform(numeric_block)

# Model Training

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardize the numeric columns; the scaling statistics are learned from the
# training rows only and then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

Linear Support Vector Machine (SVM) is a supervised machine learning algorithm used primarily for classification tasks, though it can also be applied to regression problems. The goal of a linear SVM is to find the optimal hyperplane that separates the data points from different classes with the largest possible margin.

In [None]:
# Linear SVM classifier: fixed seed, iteration cap raised to 10000 to help convergence
linear_svc_model = LinearSVC(
    max_iter=10000,
    random_state=42,
)

In [None]:
# Train the model
# Fits the LinearSVC on the training features and labels. As the cell's last
# expression, the value returned by fit() is also what the notebook displays.
linear_svc_model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
# y_pred holds the classifier's predicted label for each row of X_test.
# Note: a later cell replaces these predictions with rule-based ones before
# the evaluation runs.
y_pred = linear_svc_model.predict(X_test)

In [None]:
# Custom rule-based prediction (replaces every entry of y_pred).
#
# Rule: predict 1 when the applicant is Middle-aged AND income, credit and
# annuity are each in the Medium or High bin; otherwise predict 0.
#
# NOTE(review): TARGET is documented in this notebook as 1 = default, yet this
# rule emits 1 for applicants it would approve. Confirm the intended label
# meaning before trusting the evaluation that follows.
accepted_labels = {
    'age_bin': ['Middle-aged'],
    'income_bin': ['Medium', 'High'],
    'credit_bin': ['Medium', 'High'],
    'annuity_bin': ['Medium', 'High'],
}

# Read the encoder codes from loan_df rather than from X_test: X_test's columns
# may have been standardized, in which case they no longer equal the integer
# codes produced by the LabelEncoders and the comparison would never match.
raw_codes = loan_df.loc[X_test.index, list(accepted_labels)]
assert len(raw_codes) == len(X_test), "loan_df index must be unique to align with X_test"

# Build the rule as one vectorized mask instead of a per-row Python loop;
# each label -> code lookup is done once per column, not once per row.
approve_mask = pd.Series(True, index=X_test.index)
for column, labels in accepted_labels.items():
    accepted_codes = label_encoders[column].transform(labels)
    approve_mask &= raw_codes[column].isin(accepted_codes)

y_pred = approve_mask.to_numpy().astype(int)  # 1 = rule matched, 0 = otherwise

In [None]:
# Score the predictions against the held-out labels
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report both metrics, one item per line
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    classification_rep,
    sep="\n",
)