In [1]:
import pandas as pd
from pandas import read_csv
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,classification_report

In [None]:
# Read the loans table from the working directory and preview its first rows
df = read_csv("loans.csv")
df.head(n=5)


In [None]:
# Quick structural overview of the loaded frame.
# Each summary is printed explicitly: a notebook cell only renders its last
# bare expression, so unprinted intermediate results would be thrown away.
print('The shape of the data frame is: ', df.shape)
print('Columns:\n', df.columns.tolist())
print('Data types of each column:\n', df.dtypes)
print('Number of duplicated rows: ', df.duplicated().sum())
print('Number of unique values per column:\n', df.nunique())
df.info()  # info() writes its own report to stdout
print('Summary statistics:\n', df.describe())
print('Missing values per column:\n', df.isnull().sum())
# Kept as the final expression so the notebook renders the correlation table
df.corr(numeric_only=True)

In [None]:
# Print the frequency table of each column, one column after another
columns_to_profile = (
    'client_id',
    'loan_type',
    'loan_amount',
    'repaid',
    'loan_id',
    'loan_start',
    'loan_end',
    'rate',
)
for column_name in columns_to_profile:
    print(df[column_name].value_counts())


In [None]:
# Detecting missing values
missing_values = df.isnull().sum()
print('Missing values in each column:\n', missing_values)

# 1: Drop rows with missing values
df_dropped = df.dropna()
print('Shape after dropping rows with missing values: ', df_dropped.shape)

# 2: Fill missing values with mean (for numerical columns)
df_filled = df.fillna(df.mean(numeric_only=True))
print('Number of missing values after filling with mean:\n', df_filled.isnull().sum())

# 3: Fill missing values with mode (for categorical columns)
for column in df.select_dtypes(include=['object']).columns:
    column_modes = df_filled[column].mode()
    # mode() returns an empty Series when every value in the column is missing;
    # taking its first element would raise, so such a column is left as it is.
    if column_modes.empty:
        continue
    df_filled[column] = df_filled[column].fillna(column_modes.iloc[0])

print('Number of missing values after filling with mode:\n', df_filled.isnull().sum())


In [None]:
# Remove outliers
# df_final is a second name for the imputed frame df_filled (plain assignment, no copy is made)
df_final = df_filled

def detect_outliers_iqr(data):
    """Return the index labels of rows that are IQR outliers in any numeric column.

    For every numeric column, a value is an outlier when it lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR, where Q1 and Q3 are the 25% and
    75% quantiles of that column. Non-numeric columns are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    list
        Index labels of the outlying rows, in first-seen order. Each label
        appears once, even when the row is an outlier in several columns, so
        the length of the list is the number of affected rows.
    """
    flagged_labels = []
    for column in data.select_dtypes(include = [np.number]).columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
        flagged_labels.extend(data[is_outlier].index)
    # dict.fromkeys removes repeated labels while keeping their first-seen order
    return list(dict.fromkeys(flagged_labels))

# A row flagged in several columns is kept once, so the count below is a number of rows
outliers = list(dict.fromkeys(detect_outliers_iqr(df_final)))
print('Number of outliers detected: ', len(outliers))

# Drop the rows from df_final, the imputed frame the outliers were detected on.
# Dropping them from the raw df would discard the missing-value imputation.
df_no_outliers = df_final.drop(outliers)
print('Shape after removing outliers: ', df_no_outliers.shape)

In [None]:
# Apply a log(1 + x) transform to every numeric column of the outlier-free frame
df_final_outliers = df_no_outliers
df_transformed = df_final_outliers.copy()

numeric_columns = df_transformed.select_dtypes(include=[np.number]).columns
# np.log1p computes log(1 + x), so zero entries map to 0 instead of -inf
df_transformed[numeric_columns] = df_transformed[numeric_columns].apply(np.log1p)

print('Data after log transformation:\n', df_transformed.describe())

# Frame handed on to the following steps
df_final_transformed = df_transformed

In [None]:
# Standard-scale the numeric columns on a copy of the transformed data
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — confirm this is intended.
scaler_standard = StandardScaler()
df_standard_scaled = df_final_transformed.copy()

numeric_columns = df_standard_scaled.select_dtypes(include=[np.number]).columns
scaled_values = scaler_standard.fit_transform(df_standard_scaled[numeric_columns])
df_standard_scaled[numeric_columns] = scaled_values

print('Data after standard scaling:\n', df_standard_scaled.describe())
df_final_scaled = df_standard_scaled

In [None]:
# Encoding categorical variables
# Both encodings start from their own copy of df_final_scaled, which is never
# modified here, so each variant sees the original string columns.
categorical_columns = df_final_scaled.select_dtypes(include = ['object']).columns

# Label Encoding
label_encoder = LabelEncoder()
df_label_encoded = df_final_scaled.copy()
for column in categorical_columns:
    # Write into the copy: encoding df_final_scaled in place would leave
    # df_label_encoded unencoded and strip the string columns needed below.
    df_label_encoded[column] = label_encoder.fit_transform(df_label_encoded[column])

print('Data after label encoding:\n', df_label_encoded.head())

# OneHot Encoding
onehot_encoder = OneHotEncoder(sparse_output = False, drop ='first')
df_onehot_encoded = df_final_scaled.copy()
if len(categorical_columns) > 0:  # the encoder cannot be fitted on zero columns
    onehot_encoded = onehot_encoder.fit_transform(df_onehot_encoded[categorical_columns])
    # Reuse the frame's own index: rows were dropped earlier, so a fresh
    # 0..n-1 index would misalign in the concat and introduce NaN rows.
    onehot_encoded_df = pd.DataFrame(
        onehot_encoded,
        columns = onehot_encoder.get_feature_names_out(categorical_columns),
        index = df_onehot_encoded.index,
    )

    # Concatenate OneHot encoded columns with original dataframe
    df_onehot_encoded = df_onehot_encoded.drop(categorical_columns, axis = 1)
    df_onehot_encoded = pd.concat([df_onehot_encoded, onehot_encoded_df], axis = 1)

print('Data after one-hot encoding:\n', df_onehot_encoded.head())

# The label-encoded variant is the one used by the following cells
df_final_encoded = df_label_encoded


In [10]:
# Build the loan_amount x rate product as a one-column frame
interaction_features = (
    df_final_encoded['loan_amount']
    .mul(df_final_encoded['rate'])
    .to_frame(name = 'loan_amount_rate_interaction')
)

# Append the interaction column to the encoded data (rows are aligned on the index)
df_final_with_interaction = pd.concat([df_final_encoded, interaction_features], axis = 1)




In [11]:
# Separate the target column from the features
# NOTE(review): X keeps 'loan_amount_rate_interaction', which is computed from
# 'rate' itself — this looks like target leakage; confirm the intended target.
target_column = 'rate'
y = df_final_with_interaction[target_column]
X = df_final_with_interaction.drop(columns = target_column)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 42,
)
