In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [4]:
# Location of the raw dataset CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
DATASET_PATH = '/home/student/IngajiDemo/dataset.csv'
income_df = pd.read_csv(DATASET_PATH)

In [5]:
# Columns removed from the raw dataset before modelling. The order of the
# names is kept exactly as originally listed; comments only label each run.
columns_to_delete = [
    # Geography and household classification
    'Region', 'Agricultural_Household_indicator',
    # Food expenditure
    'Bread_and_Cereals_Expenditure', 'Total_Rice_Expenditure', 'Meat_Expenditure',
    'Total_Fish_and__marine_products_Expenditure', 'Fruit_Expenditure',
    'Vegetables_Expenditure',
    # Household head
    'Household_Head_Sex',
    # Other expenditure
    'Restaurant_and_hotels_Expenditure', 'Alcoholic_Beverages_Expenditure',
    'Tobacco_Expenditure', 'Clothing,_Footwear_and_Other_Wear_Expenditure',
    'Special_Occasions_Expenditure',
    # Household head and employment
    'Household_Head_Age', 'Household_Head_Marital_Status',
    'Household_Head_Highest_Grade_Completed',
    'Household_Head_Job_or_Business_Indicator', 'Household_Head_Occupation',
    'Total_number_of_family_members_employed',
    # Dwelling
    'Type_of_Building/House', 'Type_of_Roof', 'Type_of_Walls',
    'House_Floor_Area', 'House_Age', 'Number_of_bedrooms',
    'Toilet_Facilities', 'Main_Source_of_Water_Supply',
    # Owned items (first part)
    'Number_of_Television', 'Number_of_CD/VCD/DVD',
    # Household head / household type
    'Household_Head_Class_of_Worker', 'Type_of_Household',
    # Owned items (remainder)
    'Number_of_Component/Stereo_set', 'Number_of_Refrigerator/Freezer',
    'Number_of_Washing_Machine', 'Number_of_Airconditioner',
    'Number_of_Car,_Jeep,_Van', 'Number_of_Landline/wireless_telephones',
    'Number_of_Cellular_phone', 'Number_of_Personal_Computer',
    'Number_of_Stove_with_Oven/Gas_Range', 'Number_of_Motorized_Banca',
    'Number_of_Motorcycle/Tricycle',
]

In [6]:
# Remove every column named in `columns_to_delete`, keeping the rest as-is.
# NOTE(review): `income_df` is rebound to the trimmed frame, so running this
# cell a second time will presumably fail because the columns are already gone.
income_df = income_df.drop(columns_to_delete, axis=1)

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# NOTE(review): the frame built below REPLACES `income_df`, so the CSV that was
# loaded and trimmed in the earlier cells is discarded and every later cell
# runs on this repeated demo data. Confirm that this is intended before
# relying on the reported metrics.
# NOTE(review): both imports belong in the notebook's top import cell
# (`pandas` is already imported there).

# Synthetic demo values; each list is repeated to 45,000 entries.
data = {
    'Total_Household_Income': [480332, 198235, 82785, 107589, 189322] * 9000,  # 45,000 rows
    'Category': ['High', 'Medium', 'Low'] * 15000,
    'Location': ['Urban', 'Suburban'] * 22500
}
income_df = pd.DataFrame(data)

# Every object-dtype column is treated as categorical.
categorical_columns = list(income_df.select_dtypes(include=['object']).columns)

# Fit one encoder per categorical column and store its integer codes in a new
# `<column>_encoded` column; the encoders are kept in `label_encoders`.
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    income_df[col + '_encoded'] = label_encoders[col].fit_transform(income_df[col])

# `df_encoded` holds the numeric view: the original text columns are removed,
# while `income_df` keeps both the text and the encoded columns.
df_encoded = income_df.drop(columns=categorical_columns)

# Show only the head of the frame to keep the output short.
print(f"Encoded Expense data (first few rows):\n{df_encoded.head()}")


Encoded Expense data (first few rows):
   Total_Household_Income  Category_encoded  Location_encoded
0                  480332                 0                 1
1                  198235                 2                 0
2                   82785                 1                 1
3                  107589                 0                 0
4                  189322                 2                 1


In [12]:
# Sample-count setting (not referenced by any other visible cell).
n_samples = 41000

# Seed NumPy's global random generator so random draws repeat between runs.
np.random.seed(42)

In [15]:
# Rebind `df_encoded` to a frame built from `income_df`.
# NOTE(review): this replaces the `df_encoded` produced by the encoding cell,
# so the original categorical columns are present again alongside their
# `_encoded` counterparts — confirm that this is intended.
df_encoded = pd.DataFrame(data=income_df)

In [16]:
def assign_credit_score(income):
    """Turn incomes into credit scores on a 0-100 scale.

    Each score is the income's percentile rank within ``income`` itself,
    multiplied by 100 and bounded to [0, 100]. Scores are therefore relative
    to whichever sample is passed in; equal incomes receive equal scores.

    Args:
        income: pandas Series of income values.

    Returns:
        A Series of scores carrying the same index as ``income``.
    """
    return income.rank(pct=True).mul(100).clip(lower=0, upper=100)

In [31]:
def assign_loan_range(credit_score):
    """Map a credit score to the loan range offered for that score.

    Args:
        credit_score: Numeric score, expected on the 0-100 scale.

    Returns:
        The loan-range label of the tier containing the score, or ``None``
        when the score is missing (``None`` / NaN), since no tier can be
        justified without a score.
    """
    # A missing score fails every `<` comparison below, so without this guard
    # it would fall through to the largest loan range.
    if pd.isna(credit_score):
        return None

    # (exclusive upper bound, label) pairs in ascending order.
    tiers = (
        (20, "1,000 - 5,000"),
        (40, "5,001 - 10,000"),
        (60, "10,001 - 20,000"),
        (80, "20,001 - 35,000"),
    )
    for upper_bound, label in tiers:
        if credit_score < upper_bound:
            return label

    # Scores of 80 and above land in the top tier.
    return "35,001 - 50,000"

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
train_df, test_df = train_test_split(
    df_encoded,
    random_state=42,
    test_size=0.2,
)

In [33]:
# Training target: each household's income percentile within the training split.
train_df['Credit_Score'] = train_df['Total_Household_Income'].pipe(assign_credit_score)

In [34]:
# Fit a linear regression that predicts the credit score from income alone.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(
    train_df[['Total_Household_Income']],
    train_df['Credit_Score'],
)

In [35]:
# Score the held-out rows and bound the regression output to the 0-100
# credit-score scale, since a linear model can predict outside that range.
test_df['Predicted_Credit_Score'] = np.clip(
    model.predict(test_df[['Total_Household_Income']]),
    0,
    100,
)

In [36]:
# Reference scores for evaluation.
# NOTE(review): percentiles are ranked within the test split only, whereas the
# training target was ranked within the training split, so the same income can
# map to slightly different scores in the two sets — confirm this is acceptable.
test_df['Actual_Credit_Score'] = test_df['Total_Household_Income'].pipe(assign_credit_score)

In [37]:
# Score the predictions against the reference scores with each metric in turn;
# the unpacking order matches the order of the metric functions.
mae, mse, r2 = (
    metric(test_df['Actual_Credit_Score'], test_df['Predicted_Credit_Score'])
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
# RMSE is reported in the same units as the score itself.
rmse = np.sqrt(mse)

In [38]:
# Emit the heading and the three metrics, one per line, in a single call.
print(
    "Model Evaluation Metrics:",
    f"Mean Absolute Error: {mae:.2f}",
    f"Root Mean Squared Error: {rmse:.2f}",
    f"R-squared Score: {r2:.4f}",
    sep="\n",
)

Model Evaluation Metrics:
Mean Absolute Error: 16.78
Root Mean Squared Error: 19.18
R-squared Score: 0.5586


In [40]:
# Translate both the reference and the predicted scores into loan-range labels.
test_df['Actual_Loan_Range'] = test_df['Actual_Credit_Score'].map(assign_loan_range)
test_df['Predicted_Loan_Range'] = test_df['Predicted_Credit_Score'].map(assign_loan_range)

# Accuracy is the fraction of test rows whose two labels agree.
loan_range_accuracy = test_df['Actual_Loan_Range'].eq(test_df['Predicted_Loan_Range']).mean()

In [28]:
# Report the share of test rows whose predicted loan range matches the
# reference one, formatted as a percentage.
print(
    f"\nLoan Range Prediction Accuracy: {loan_range_accuracy:.2%}"
)


Loan Range Prediction Accuracy: 24.61%


In [39]:
# # Visualize actual vs predicted credit scores
# plt.figure(figsize=(10, 6))
# plt.scatter(test_df['Actual_Credit_Score'], test_df['Predicted_Credit_Score'], alpha=0.5)
# plt.plot([0, 100], [0, 100], 'r--')  # Perfect prediction line
# plt.title('Actual vs Predicted Credit Scores')
# plt.xlabel('Actual Credit Score')
# plt.ylabel('Predicted Credit Score')
# plt.show()

In [None]:
# Show ten randomly chosen test rows with the income, both scores and both
# loan ranges side by side.
print("\nSample of Test Results:")
print(
    test_df.loc[
        :,
        [
            'Total_Household_Income',
            'Actual_Credit_Score',
            'Predicted_Credit_Score',
            'Actual_Loan_Range',
            'Predicted_Loan_Range',
        ],
    ].sample(n=10)
)