In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# --- Step 1: Automatically Download the German Credit Dataset ---
# The data file is read straight from a public URL, so no local copy of the
# dataset is required.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"

# The file has no header row, so the column names are supplied manually:
# 20 feature columns followed by the target, 'credit_risk', as the 21st.
column_names = ['checking_account', 'duration', 'credit_history', 'purpose', 'credit_amount',
                'savings_account', 'employment_since', 'installment_rate', 'personal_status',
                'other_debtors', 'residence_since', 'property', 'age', 'other_installment_plans',
                'housing', 'existing_credits', 'job', 'dependents', 'telephone', 'foreign_worker',
                'credit_risk']

try:
    # Only the download/parse step can fail, so it is the only guarded statement.
    df = pd.read_csv(url, sep=' ', header=None, names=column_names)
except Exception as e:
    # Top-level boundary of the script: nothing below can run without `df`,
    # so report the problem and stop.
    print(f"An error occurred while downloading the dataset: {e}")
    print("Please check your internet connection or the URL.")
    # Exit with a non-zero status so a caller can detect the failure. The
    # interactive `exit()` helper is not meant for programs and, called with
    # no argument, reports status 0 (success).
    raise SystemExit(1) from e
else:
    print("Dataset loaded successfully from URL.")

# --- Step 2: Feature Engineering and Data Preparation ---
# The raw target (credit_risk) is 1 for good credit and 2 for bad credit.
# Recode it to 0 for good and 1 for bad. The comparison is vectorized, so no
# Python function is called per row; exactly as before, every value other
# than 1 is labelled 1 (bad).
df['credit_risk'] = df['credit_risk'].ne(1).astype('int64')

# Feature columns handed to the model, split into numerical and categorical.
# NOTE(review): 'residence_since' is loaded into df but appears in neither
# list, so it never reaches the model -- confirm the omission is intentional.
numerical_features = ['duration', 'credit_amount', 'installment_rate', 'age', 'existing_credits', 'dependents']
categorical_features = ['checking_account', 'credit_history', 'purpose', 'savings_account',
                        'employment_since', 'personal_status', 'other_debtors',
                        'property', 'other_installment_plans', 'housing',
                        'job', 'telephone', 'foreign_worker']
target = 'credit_risk'

# X holds the selected feature columns (numerical first), y the recoded target.
X = df[numerical_features + categorical_features]
y = df[target]

# --- Step 3: Create the Preprocessing Pipeline ---
# Each entry is (name, transformer, columns): numerical columns go through
# StandardScaler, categorical columns through OneHotEncoder.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

# Combine both steps so each transformer is applied only to its own columns.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# --- Step 4: Create and Train the Model Pipeline ---
# Random forest of 500 trees with balanced class weights and a fixed seed.
forest = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=42,
)

# Chain preprocessing and the classifier so both are fitted and applied as
# a single estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])

print("\nTraining the Random Forest model for Credit Risk Prediction...")

# Hold out 20% of the rows as a test set, stratified on the target, with a
# fixed seed so the split is repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

# Fit the preprocessing steps and the classifier on the training portion only.
pipeline.fit(X_train, y_train)

print("Training complete.")

# --- Step 5: Evaluate the Model ---
# Predict on the held-out test rows, then print the per-class metrics.
y_pred = pipeline.predict(X_test)
print("\nClassification Report on Test Data:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# --- Step 6: Save the Trained Pipeline ---
# Write the fitted pipeline object (preprocessor and classifier together)
# to disk with joblib.
joblib.dump(value=pipeline, filename='credit_risk_pipeline.pkl')
print("\nNew model saved to 'credit_risk_pipeline.pkl'.")


Dataset loaded successfully from URL.

Training the Random Forest model for Credit Risk Prediction...
Training complete.

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.77      0.91      0.84       140
           1       0.65      0.37      0.47        60

    accuracy                           0.75       200
   macro avg       0.71      0.64      0.65       200
weighted avg       0.73      0.75      0.73       200


New model saved to 'credit_risk_pipeline.pkl'.
