<a href="https://colab.research.google.com/github/glennamaria/task1/blob/main/Console.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns


In [32]:
from google.colab import drive
import pandas as pd
import os

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:

path="/content/drive/MyDrive/SME.csv"
df= pd.read_csv(path)
df

Unnamed: 0,LoanAmount,Age,NetIncome,CIBIL,Gender,Tenure,Branch Name,Deliquency
0,500000,51,59048,804,FEMALE,60,THANJAVUR,0
1,500000,29,11000,744,MALE,60,THANJAVUR,0
2,400000,39,30000,704,MALE,60,THANJAVUR,0
3,300000,64,16000,690,MALE,60,THANJAVUR,1
4,500000,52,40000,760,MALE,60,THANJAVUR,0
...,...,...,...,...,...,...,...,...
2604,300000,39,88000,745,FEMALE,48,THANJAVUR,0
2605,1100000,31,52260,0,MALE,72,YELAMANCHILI,0
2606,320000,25,39016,-1,MALE,60,DUMDUMA,0
2607,400000,27,36500,754,MALE,60,DANAPUR,0


In [34]:
print(df.info())
print(df.head())

# Assuming 'delinquency' is the target variable
X = df.drop('Deliquency', axis=1)
y = df['Deliquency']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2609 entries, 0 to 2608
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   LoanAmount   2609 non-null   int64 
 1   Age          2609 non-null   int64 
 2   NetIncome    2609 non-null   int64 
 3   CIBIL        2609 non-null   int64 
 4   Gender       2609 non-null   object
 5   Tenure       2609 non-null   int64 
 6   Branch Name  2609 non-null   object
 7   Deliquency   2609 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 163.2+ KB
None
   LoanAmount  Age  NetIncome  CIBIL  Gender  Tenure Branch Name  Deliquency
0      500000   51      59048    804  FEMALE      60   THANJAVUR           0
1      500000   29      11000    744    MALE      60   THANJAVUR           0
2      400000   39      30000    704    MALE      60   THANJAVUR           0
3      300000   64      16000    690    MALE      60   THANJAVUR           1
4      500000   52      40000    760    MA

In [35]:
# Define numerical and categorical features
numerical_features = ['Age', 'NetIncome', 'CIBIL', 'Tenure', 'LoanAmount']
categorical_features = ['Gender', 'Branch Name']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocess the features
X_processed = preprocessor.fit_transform(X)


In [36]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [37]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_resampled, y_train_resampled)
y_pred_log_reg = log_reg.predict(X_test)

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_resampled, y_train_resampled)
y_pred_rf = random_forest.predict(X_test)

# Gradient Boosting
grad_boost = GradientBoostingClassifier(n_estimators=100, random_state=42)
grad_boost.fit(X_train_resampled, y_train_resampled)
y_pred_gb = grad_boost.predict(X_test)


In [38]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"{model_name}:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))

# Evaluate each model
evaluate_model(y_test, y_pred_log_reg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")


Logistic Regression:
[[395 260]
 [ 50  78]]
              precision    recall  f1-score   support

           0       0.89      0.60      0.72       655
           1       0.23      0.61      0.33       128

    accuracy                           0.60       783
   macro avg       0.56      0.61      0.53       783
weighted avg       0.78      0.60      0.66       783

Accuracy: 0.6040868454661558
Random Forest:
[[599  56]
 [ 91  37]]
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       655
           1       0.40      0.29      0.33       128

    accuracy                           0.81       783
   macro avg       0.63      0.60      0.61       783
weighted avg       0.79      0.81      0.80       783

Accuracy: 0.8122605363984674
Gradient Boosting:
[[543 112]
 [ 77  51]]
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       655
           1       0.31      0.40      0.35       128



In [39]:
# Split data into training and test sets *before* preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # Use the original X, not X_processed

# Example preprocessing pipeline and model
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'NetIncome', 'CIBIL', 'Tenure', 'LoanAmount']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Branch Name'])
    ])

# Train a Random Forest model (example)
model = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Train your pipeline with data
pipeline.fit(X_train, y_train) # Now X_train is a DataFrame

# Save the trained pipeline
joblib.dump(pipeline, 'model_pipeline.pkl')

['model_pipeline.pkl']

In [41]:
import joblib
import pandas as pd
import numpy as np

# Load the saved model pipeline
pipeline = joblib.load('model_pipeline.pkl')

def make_prediction(new_data):
    """
    Make predictions using the trained model pipeline.
    """
    # Convert new data to DataFrame
    new_data_df = pd.DataFrame([new_data])

    # Predict using the pipeline
    prediction = pipeline.predict(new_data_df)

    return prediction[0]

def main():
    print("Welcome to the Delinquency Prediction Console")

    # Collect new data from user
    Age = float(input("Enter age: "))
    NetIncome = float(input("Enter income: "))
    CIBIL = float(input("Enter CIBIL: "))
    Tenure = float(input("Enter Tenure: ")) # Removed extra indent here
    Gender = input("Enter gender (e.g., male/female): ")
    BranchName = input("Enter Branch Name: ")



    # Create a dictionary with the input data
    new_data = {
        'Age': 20,
        'NetIncome': 100000,
        'LoanAmount': 500000,
        'CIBIL': 750,
        'Gender': 'FEMALE',
        'Tenure': 60, # Added comma here
        'Branch Name': 'THANJAVUR'
    }

    # Make prediction
    prediction = make_prediction(new_data)
    print(f"Prediction: {'Delinquent' if prediction == 1 else 'Non-delinquent'}")

if __name__ == "__main__":
    main()

Welcome to the Delinquency Prediction Console
Enter age: 20
Enter income: 100000
Enter CIBIL: 750
Enter Tenure: 60
Enter gender (e.g., male/female): female
Enter Branch Name: thanjavur
Prediction: Non-delinquent
