In [None]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#pip install xlrd

In [None]:
# Read the Excel workbook into a DataFrame; the first sheet row is skipped
# so that the second row supplies the column headers.
excel_path = 'default of credit card clients.xls'
data = pd.read_excel(excel_path, skiprows=1)

In [None]:
# Display the loaded DataFrame (bare last expression, so the notebook renders it).
data

In [None]:
# Display the first few rows of the dataset to understand its structure.
# A bare expression (instead of print) lets the notebook render the frame as
# a table, which stays readable for a wide, 25-column dataset.
data.head()


In [None]:
# Display column names, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
data.info()

In [None]:
# Display summary statistics of the dataset via DataFrame.describe()
data.describe()

In [None]:
# Count the null entries in every column and report them
missing_per_column = data.isnull().sum()
print("\nMissing values present in the dataset :")
print(missing_per_column)

In [None]:
# Replace the column headers with a clean, fixed set of 25 names.
# The repayment-status columns go PAY_0, PAY_2 .. PAY_6 (there is no PAY_1).
renamed_columns = (
    ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
    + ['PAY_0']
    + [f'PAY_{n}' for n in range(2, 7)]
    + [f'BILL_AMT{n}' for n in range(1, 7)]
    + [f'PAY_AMT{n}' for n in range(1, 7)]
    + ['default_payment_next_month']
)
data.columns = renamed_columns

In [None]:
# Display the first 10 rows of the DataFrame.
data.head(10)

In [None]:
# Print the value counts of each categorical variable under its own heading
print("\nDistribution of categorical variables:")
categorical_headings = [
    ("Sex:", 'SEX'),
    ("\nEducation:", 'EDUCATION'),
    ("\nMarriage:", 'MARRIAGE'),
]
for heading, column in categorical_headings:
    print(heading)
    print(data[column].value_counts())


In [None]:
# Tally how many rows fall into each class of the target variable
target_counts = data['default_payment_next_month'].value_counts()
print("\nDistribution of target variable:")
print(target_counts)

In [None]:
# Correlation analysis: pairwise correlation between all numeric columns.
# Restricting to numeric dtypes keeps this cell runnable even if it is
# re-executed after the string helper columns (EDUCATION_MAP, MARRIAGE_MAP)
# have been added to `data`; corr() cannot handle string columns.
print("\nCorrelation analysis:")
corr_matrix = data.select_dtypes(include='number').corr()
print(corr_matrix)


In [None]:
# Render the correlation matrix as an annotated heatmap (values shown to 2 decimals)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Pairwise scatter/density grid for a handful of features, coloured by the target
selected_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'PAY_AMT1', 'default_payment_next_month']
pair_subset = data.loc[:, selected_features]
sns.pairplot(pair_subset, hue='default_payment_next_month')
plt.show()


In [None]:
# Box plot of AGE for each value of the target (default payment next month)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='default_payment_next_month', y='AGE', ax=ax)
ax.set_title('Relationship between Age and Default Payment Next Month')
ax.set_xlabel('Default Payment Next Month (1 = Yes, 0 = No)')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Histogram of AGE (30 bins) with a kernel-density overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='AGE', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Counts of non-default (0) and default (1) payments.
# value_counts() orders by frequency, not by class, so sort by the class value
# to guarantee the counts line up with the hard-coded labels below
# (index 0 -> 'Non-default', index 1 -> 'Default').
default_counts = data['default_payment_next_month'].value_counts().sort_index()

# Plotting the pie chart for the proportion of default and non-default payments
labels = ['Non-default', 'Default']
colors = ['#66b3ff', '#ffcc99']
explode = (0, 0.1)  # pull the 2nd slice (i.e. 'Default') out of the pie

plt.figure(figsize=(8, 8))
plt.pie(default_counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
plt.title('Proportion of Default and Non-default Payments')
plt.show()

In [None]:
# Mapping the education levels to their descriptions.
# Series.map with a plain dict turns every code that is not a key into NaN,
# and NaN categories are not drawn by the later plots, so such rows would
# silently disappear. Any code outside the mapping is labelled 'Unknown'
# instead so that every row stays visible.
education_mapping = {1: 'Graduate School', 2: 'University', 3: 'High School', 4: 'Others'}
data['EDUCATION_MAP'] = data['EDUCATION'].map(lambda code: education_mapping.get(code, 'Unknown'))


In [None]:
# Grouped bar chart: number of clients per education level, split by the target
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='EDUCATION_MAP', hue='default_payment_next_month', palette='Set2', ax=ax)
ax.set_title('Relationship between Education Level and Default Payment Next Month')
ax.set_xlabel('Education Level')
ax.set_ylabel('Count')
ax.legend(title='Default Payment Next Month', labels=['No', 'Yes'])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of the credit limit (LIMIT_BAL) for each value of the target
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, x='default_payment_next_month', y='LIMIT_BAL', palette='Set2', ax=ax)
ax.set_title('Relationship between Credit Limit (LIMIT_BAL) and Default Payment Next Month')
ax.set_xlabel('Default Payment Next Month')
ax.set_ylabel('Credit Limit (NT dollars)')
# Replace the 0/1 tick labels with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()

In [None]:
# Box plot of the credit limit (LIMIT_BAL) for each education level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='EDUCATION_MAP', y='LIMIT_BAL', palette='Set2', ax=ax)
ax.set_title('Relationship between Education and Credit Limit')
ax.set_xlabel('Education Level')
ax.set_ylabel('Credit Limit (LIMIT_BAL)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
# Mapping for marital status
marriage_map = {
    1: 'Married',
    2: 'Single',
    3: 'Others'
}

# Apply the marital status mapping.
# Series.map with a plain dict turns every code that is not a key into NaN,
# and NaN categories are not drawn by the later count plot; label any code
# outside the mapping as 'Unknown' so no row is silently dropped.
data['MARRIAGE_MAP'] = data['MARRIAGE'].map(lambda code: marriage_map.get(code, 'Unknown'))



In [None]:
# Grouped bar chart: number of clients per marital status, split by the target
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='MARRIAGE_MAP', hue='default_payment_next_month', palette='Set1', ax=ax)
ax.set_title('Relationship between Marital Status and Default Payment Next Month')
ax.set_xlabel('Marital Status')
ax.set_ylabel('Count')
ax.legend(title='Default Payment Next Month', loc='upper right', labels=['No', 'Yes'])
plt.tight_layout()
plt.show()

In [None]:
# Repayment-status columns (the column set has PAY_0, then PAY_2 .. PAY_6; there is no PAY_1)
repayment_columns = ['PAY_0'] + [f'PAY_{n}' for n in range(2, 7)]

# Pool the status codes of all six columns into one Series, then tally each code
all_status_codes = data[repayment_columns].stack()
payment_status_counts = all_status_codes.value_counts().sort_index()

# Descriptive label for every numeric status code, kept in ascending code order
# because the bar order below is taken from this dict's values
payment_labels = {
    -2: 'No Consumption',
    -1: 'Pay Duly',
    0: 'Revolving Credit',
    1: 'Delay 1 Month',
    **{months: f'Delay {months} Months' for months in range(2, 9)},
    9: 'Delay 9+ Months',
}

# Two-column frame (label, count) that seaborn can plot directly
plot_data = pd.DataFrame({
    'Repayment Status': payment_status_counts.index.map(payment_labels),
    'Count': payment_status_counts.to_numpy(),
})

# Bar chart of how often each repayment status occurs across all months
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=plot_data, x='Repayment Status', y='Count', palette='Set1',
            order=payment_labels.values(), ax=ax)
ax.set_title('Distribution of Repayment Status Across Months')
ax.set_xlabel('Repayment Status')
ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Columns holding the monthly repayment status (PAY_0, PAY_2 .. PAY_6) and the target
payment_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
target_column = 'default_payment_next_month'

# One grouped count plot per repayment-status column, arranged in a 2 x 3 grid
plt.figure(figsize=(12, 6))

for i, col in enumerate(payment_columns, start=1):
    plt.subplot(2, 3, i)
    sns.countplot(x=col, hue=target_column, data=data, palette='Set1')
    plt.title(f'{col} vs Default Payment Next Month')
    plt.xlabel(f'{col} Payment Status')
    plt.ylabel('Count')
    plt.legend(title='Default Payment Next Month', labels=['No', 'Yes'])

# Lay the figure out once, after all six panels exist; recomputing the layout
# inside the loop only repeats work that the final call supersedes.
plt.tight_layout()
plt.show()

In [None]:
# Bill-amount columns and the payment-amount columns they are paired with (same suffix)
bill_columns = [f'BILL_AMT{n}' for n in range(1, 7)]
pay_columns = [f'PAY_AMT{n}' for n in range(1, 7)]

# One scatter plot per (bill, payment) pair, filled row by row into a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for ax, bill_col, pay_col in zip(axes.flat, bill_columns, pay_columns):
    sns.scatterplot(x=data[bill_col], y=data[pay_col], alpha=0.5, ax=ax)
    ax.set_title(f'Relationship between {bill_col} and {pay_col}')
    ax.set_xlabel('Bill Amount (NT dollars)')
    ax.set_ylabel('Payment Amount (NT dollars)')

plt.tight_layout()
plt.show()

# Model Implementation

In [None]:
#pip install gplearn


In [None]:
# Drop the ID column (an identifier, not a feature) and the two string helper
# columns that were only added for plotting.
# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data = data.drop(columns=['ID', 'EDUCATION_MAP', 'MARRIAGE_MAP'], errors='ignore')

In [None]:
# Display the DataFrame after the column drop.
data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate numeric and categorical (string) columns.
# 'number' covers every numeric dtype, so integer columns are not skipped
# on platforms where they are read as something other than int64.
numeric_columns = data.select_dtypes(include='number').columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Columns that must not be normalized: string columns and the target label.
# The target is a class label, not a feature; scaling it would turn it into
# a float column before it is handed to the classifier.
non_normalizable_columns = list(categorical_columns) + ['default_payment_next_month']
columns_to_normalize = [col for col in numeric_columns if col not in non_normalizable_columns]

# Initialize MinMaxScaler (rescales each column to the [0, 1] range)
scaler = MinMaxScaler()

# Normalize the selected feature columns.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in this notebook, so the test rows
# influence the scaling. Consider fitting on the training split only.
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

# Show the first rows to verify the normalization
data.head()

In [None]:
# Display the list of column names that were passed to the scaler.
columns_to_normalize

In [None]:
# Display the DataFrame after normalization.
data


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from gplearn.genetic import SymbolicClassifier

# Define features and target.
# The target is cast to int so the classifier and the report always see the
# class labels 0/1, even if an earlier scaling step left the column as float.
X = data.drop('default_payment_next_month', axis=1)
y = data['default_payment_next_month'].astype(int)

# Split the data into training (70%) and test (30%) sets.
# stratify=y keeps the default / non-default ratio the same in both splits,
# which matters because the two classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Define the symbolic classifier (genetic programming)
gp = SymbolicClassifier(population_size=5000,
                        generations=20,
                        tournament_size=20,
                        const_range=(0, 1),
                        init_depth=(2, 6),
                        init_method='half and half',  # gplearn spells this with spaces, not 'half_and_half'
                        parsimony_coefficient=0.001,
                        random_state=42)

# Fit the model
gp.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = gp.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the best program found (the evolved expression the fitted model stores in _program)
print("Best program:")
print(gp._program)