# Credit Scoring Models



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
!ls Data

BankCreditScoring      SaoPauloHousingPrices	  kaggle-data.txt
BankCreditScoring.zip  SaoPauloHousingPrices.zip


In [7]:
house_price_df = pd.read_csv("Data/SaoPauloHousingPrices.zip")

In [8]:
house_price_df.head()

Unnamed: 0,address,district,area,bedrooms,garage,type,rent,total
0,Rua Herval,Belenzinho,21,1,0,Studio e kitnet,2400,2939
1,Avenida São Miguel,Vila Marieta,15,1,1,Studio e kitnet,1030,1345
2,Rua Oscar Freire,Pinheiros,18,1,0,Apartamento,4000,4661
3,Rua Júlio Sayago,Vila Ré,56,2,2,Casa em condomínio,1750,1954
4,Rua Barata Ribeiro,Bela Vista,19,1,0,Studio e kitnet,4000,4654


In [9]:
house_price_df.sample(3).T

Unnamed: 0,8912,3339,9482
address,Rua Madrid,Rua Virgínia,Rua Chiquinha Rodrigues
district,Parque Sevilha,Vila Carrao,Caxingui
area,127,40,215
bedrooms,3,2,3
garage,2,0,4
type,Casa,Apartamento,Casa
rent,4875,1500,10000
total,5352,1808,11120


In [17]:
house_price_df.shape

(11657, 8)

In [18]:
house_price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11657 entries, 0 to 11656
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   address   11657 non-null  object
 1   district  11657 non-null  object
 2   area      11657 non-null  int64 
 3   bedrooms  11657 non-null  int64 
 4   garage    11657 non-null  int64 
 5   type      11657 non-null  object
 6   rent      11657 non-null  int64 
 7   total     11657 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 728.7+ KB


In [19]:
house_price_df.describe()

Unnamed: 0,area,bedrooms,garage,rent,total
count,11657.0,11657.0,11657.0,11657.0,11657.0
mean,84.655658,1.966286,1.060393,3250.814789,4080.030625
std,74.020536,0.931313,1.132349,2650.711557,3352.480274
min,0.0,0.0,0.0,500.0,509.0
25%,40.0,1.0,0.0,1590.0,1996.0
50%,60.0,2.0,1.0,2415.0,3057.0
75%,96.0,3.0,2.0,3800.0,4774.0
max,580.0,6.0,6.0,25000.0,28700.0


Model aid with LLM Gen AI Grok from X

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import joblib

# Assuming house_price_df is your DataFrame
# Define features and target
X = house_price_df[['district', 'area', 'bedrooms', 'garage', 'type']]
y = house_price_df['total']  # Predicting total price

# Define categorical and numerical columns
categorical_cols = ['district', 'type']
numerical_cols = ['area', 'bedrooms', 'garage']

# Get all unique categories for 'district' and 'type' to avoid unknown category warning
district_categories = house_price_df['district'].unique().tolist()
type_categories = house_price_df['type'].unique().tolist()

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(
            categories=[district_categories, type_categories],
            drop='first',
            handle_unknown='ignore'
        ), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ])

# Create pipeline with preprocessor and linear regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# Save the model to a .sav file
joblib.dump(model, 'house_price_model.sav')
print("Model saved to 'house_price_model.sav'")

Mean Squared Error: 3464963.75
R-squared Score: 0.69
Model saved to 'house_price_model.sav'


In [16]:
import pandas as pd
import joblib

# Load the saved model
try:
    model = joblib.load('house_price_model.sav')
    print("Model loaded successfully from 'house_price_model.sav'")
except FileNotFoundError:
    print("Error: Model file 'house_price_model.sav' not found")
    exit()

# Example prediction for a new house
new_house = pd.DataFrame({
    'district': ['Vila Carrao'],
    'area': [130],
    'bedrooms': [2],
    'garage': [1],
    'type': ['Apartamento']
})

# Make prediction
predicted_price = model.predict(new_house)
print(f"Predicted total price for new house: ${predicted_price[0]:.2f}")

Model loaded successfully from 'house_price_model.sav'
Predicted total price for new house: $5105.54
