<a href="https://colab.research.google.com/github/ecribbie/6912_PROJECT/blob/main/data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
!git clone https://github.com/ecribbie/6912_PROJECT.git
%cd ./6912_PROJECT/

Cloning into '6912_PROJECT'...
remote: Enumerating objects: 70, done.[K
remote: Counting objects: 100% (70/70), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 70 (delta 21), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (70/70), 1.39 MiB | 3.02 MiB/s, done.
Resolving deltas: 100% (21/21), done.
/content/6912_PROJECT/6912_PROJECT


## Read data

In [32]:
# Load the raw spreadsheet exactly as stored on disk.
# Keep `data_original` untouched: the cleaning step below works on a copy
# named `dat`, and that copy is the one that gets cleaned.
data_original = pd.read_excel("default of credit card clients.xls")
data_original.head(n=5)

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


## Clean data

In [33]:
# Build the working frame without mutating `data_original`: `drop` returns a
# new DataFrame, so the raw data stays exactly as loaded.
# The first column holds the row ID and carries no predictive information.
dat = data_original.drop(columns=data_original.columns[0])

# The first row of the sheet holds the descriptive variable names
# (e.g. X1 -> LIMIT_BAL). Keep them as a {column code: description} lookup.
labels_to_variable = dat.iloc[0].to_dict()
print(labels_to_variable)

# Drop the label row, renumber the index from 0, and convert every column to a
# numeric dtype. Because the label row contained strings, the columns are
# otherwise left as generic `object` columns; `pd.to_numeric` also raises if a
# non-numeric value is still present, so bad rows are caught here.
dat = dat.iloc[1:].reset_index(drop=True).apply(pd.to_numeric)
dat.head()

{'X1': 'LIMIT_BAL', 'X2': 'SEX', 'X3': 'EDUCATION', 'X4': 'MARRIAGE', 'X5': 'AGE', 'X6': 'PAY_0', 'X7': 'PAY_2', 'X8': 'PAY_3', 'X9': 'PAY_4', 'X10': 'PAY_5', 'X11': 'PAY_6', 'X12': 'BILL_AMT1', 'X13': 'BILL_AMT2', 'X14': 'BILL_AMT3', 'X15': 'BILL_AMT4', 'X16': 'BILL_AMT5', 'X17': 'BILL_AMT6', 'X18': 'PAY_AMT1', 'X19': 'PAY_AMT2', 'X20': 'PAY_AMT3', 'X21': 'PAY_AMT4', 'X22': 'PAY_AMT5', 'X23': 'PAY_AMT6', 'Y': 'default payment next month'}


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


# Data splitting

In [34]:
# Separate predictors from the response: the first 23 columns are the
# predictors, the column at position 23 is the response (cast to int).
X = dat.iloc[:, :23]
Y = dat.iloc[:, 23].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train_pre, X_test_pre, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the predictors. The scaler is fitted on the training split only
# and then applied to both splits, so the test split does not influence the
# scaling statistics.
scaler = StandardScaler().fit(X_train_pre)
X_train = scaler.transform(X_train_pre)
X_test = scaler.transform(X_test_pre)


# SMOTE and Undersampling

In [43]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Class counts in the training split before any resampling
# (label 1 is treated as the minority class, label 0 as the majority class).
minority_class_count = np.sum(Y_train == 1)
majority_class_count = np.sum(Y_train == 0)

# Resampling targets:
#   * grow the minority class by 20% with SMOTE,
#   * shrink the majority class to twice the new minority size,
# which gives a final minority:majority ratio of 1:2.
desired_minority_samples = int(minority_class_count * 1.2)
desired_majority_samples = int(desired_minority_samples * 2)

# Undersampling can only remove rows. This check also catches an accidental
# second run of this cell: X_train / Y_train are overwritten below, so the
# counts above would then describe already-resampled data.
if desired_majority_samples > majority_class_count:
    raise ValueError(
        f"Cannot undersample the majority class to {desired_majority_samples} "
        f"samples: only {majority_class_count} are available. If this cell has "
        "already been run, re-run the data splitting cell first."
    )

# Both samplers draw at random; fix their seeds (same value as the train/test
# split) so the resampled training set is reproducible from a fresh kernel.
smote = SMOTE(sampling_strategy={1: desired_minority_samples}, random_state=42)
undersample = RandomUnderSampler(
    sampling_strategy={0: desired_majority_samples}, random_state=42
)

# Chain the two steps: oversample first, then undersample.
pipeline = Pipeline([
    ('smote', smote),
    ('undersample', undersample)
])

# Resample the training data only; the test split is left as drawn.
# NOTE: this replaces X_train / Y_train with their resampled versions.
X_train, Y_train = pipeline.fit_resample(X_train, Y_train)

In [44]:
from collections import Counter

# Count how many training labels fall in each class, using Y_train as it
# stands when this cell runs.
Counter(Y_train)

Counter({0: 12774, 1: 6387})

# Logistic Regression

In [45]:
# Display-only: the converted Series is shown but not assigned, so Y itself
# is left unchanged by this cell.
Y.astype(int)

0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: Y, Length: 30000, dtype: int64

In [46]:
# Train a logistic regression with default settings on the training data.
log_reg = LogisticRegression().fit(X_train, Y_train)

# Predict labels for the held-out test set.
Y_pred = log_reg.predict(X_test)

# Overall accuracy on the test set.
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1, printed under its heading.
print(
    "\nClassification Report:",
    classification_report(Y_test, Y_pred),
    sep="\n",
)

Accuracy: 0.8126666666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89      4687
           1       0.61      0.40      0.48      1313

    accuracy                           0.81      6000
   macro avg       0.73      0.66      0.68      6000
weighted avg       0.79      0.81      0.80      6000



# K-means clustering