#                            Credit Score Tensorflow Notebook 

In [None]:
#Importing libraries
import seaborn as sns 
import matplotlib.pyplot as plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("Libraries imported successfully....")

In [None]:
#Import data
df =  pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("Data imported successfully....")

In [None]:
# get the number of missing data points per column in Complaints_df
missing_values_count = df.isnull().sum()

# look at the # of missing points
missing_values_count

##Relevant Null vallues dropped in mining via query. 

In [None]:
df.describe()

##### Missing data handling plan
1. Name: We wont be using this as a feature 
2. Monthly inhand salary,Number of delayed payment, Num credit inquiries, Amount invested monthly, Monthly balance : all set to zero.
3. Type of Loan: will be split into one hot and null values will sort theselves out.
4. Credit history age will be feature engineered into numerics and null will be set to zero

There is data for eight months of the year for each client. Upon research, I found that credit score is not dependant on the previous month hence each entry in the dataset can and will be used independently. Trying a time series trick would be overfitting.


#### Numeric values handling

In [None]:
#Credit history age to months
def age_to_months(age_str):
    if pd.isna(age_str) or age_str == '':
        return float('nan')
    age_list = age_str.split()
    years = int(age_list[0])
    months = int(age_list[3])
    total_months = years * 12 + months
    return total_months

df['Credit_History_Age'] = df['Credit_History_Age'].apply(age_to_months)

#Cleaning up underscored columns
numeric_cols = ['Age','Annual_Income','Monthly_Inhand_Salary','Num_Bank_Accounts','Num_Credit_Card','Interest_Rate','Num_of_Loan','Delay_from_due_date','Num_of_Delayed_Payment','Changed_Credit_Limit','Num_Credit_Inquiries','Outstanding_Debt','Credit_Utilization_Ratio','Credit_History_Age','Total_EMI_per_month','Amount_invested_monthly','Monthly_Balance']
categorical_cols = ['Occupation','Credit_Mix','Payment_of_Min_Amount','Payment_Behaviour', 'Credit_Score']

# remove '_' from values in specified columns and convert to float
numeric_cols

In [None]:
for col in numeric_cols:
    df[col] = df[col].replace('_', '')
    df[col] = pd.to_numeric(df[col], errors='coerce')

# print the updated DataFrame
print(df)

 #### Categorical values handling

1. Segregate into high paying, medium and low paying' and use ordinal encoding
2. Month to be transformed into ordinal encoding
3. Separate types of loans and Payment_Behaviour by commas and count number of loans
4. Transform Credit Mix and Payment_of_Min_Amount with ordinal encoding


In [None]:
#Annual salaries feature engineering

# Define the income thresholds for each category
income_bins = [0, 30000, 70000, float('inf')]
income_labels = ['Low Income', 'Medium Income', 'High Income']

# Segregate the annual income into categories using pandas cut() function
df['Income_Category'] = pd.cut(df['Annual_Income'], bins=income_bins, labels=income_labels)

# Plot the frequency distribution of each income category using a bar plot
df['Income_Category'].value_counts().plot(kind='bar', rot=0)

# Set the axis labels and title
plt.xlabel('Income_Category')
plt.ylabel('Frequency')
plt.title('Frequency Distribution of Annual Income by Category')

# Show the plot
plt.show()


In [None]:
df.loc[:,['Month','Occupation','Type_of_Loan','Credit_Mix','Payment_of_Min_Amount','Payment_Behaviour','Credit_Score']]

In [None]:
unique_count = df['Payment_of_Min_Amount'].value_counts()
print(unique_count)

#### Exploration

In [None]:
# Create a box plot for each numeric column by "Credit_Score"
for col in numeric_cols:
    plt.figure()
    sns.boxplot(x='Credit_Score', y=col, data=df)
    plt.title(col)
    
plt.show()


In [None]:
# Create a bar plot for each categorical column by "Credit_Score"
# Iterate over each categorical column and create a bar plot
for col in categorical_cols:
    # Group the DataFrame by 'Credit_Score' and the current categorical column
    grouped = df.groupby(['Credit_Score', col]).size().unstack()
    
    # Normalize the data to convert counts to proportions
    grouped = grouped.div(grouped.sum(axis=1), axis=0)
    
    # Create the bar plot
    grouped.plot(kind='bar', stacked=True)
    plt.title(col)
    plt.xlabel('Credit_Score')
    plt.ylabel('Proportion')
    plt.show()

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Define the features and label
features = ['Monthly_Inhand_Salary', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',  'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
label = 'Credit_Score'

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.3, random_state=0)

# Define a pipeline for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define a pipeline for categorical columns
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

# Create a ColumnTransformer to apply the pipeline to the numeric and categorical columns
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, X_train.select_dtypes(include=['int64', 'float64']).columns),
    ('cat', cat_transformer, X_train.select_dtypes(include=['object']).columns)
])

# Fit the preprocessor to the training data and transform both the training and test data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
print('Training Set: %d, Test Set: %d \n' % (len(X_train), len(X_test)))

# Print the transformed DataFrame
print("X_train_Transformed /n",X_train_transformed)
print("X_test_Transformed /n",X_test_transformed)



Training Set: 70000, Test Set: 30000 

X_train_Transformed /n [[-9.62948806e-01  1.83819055e-02 -4.77111390e-01 ...  2.00000000e+00
   2.00000000e+00  4.00000000e+00]
 [-4.15967739e-02  0.00000000e+00  1.67810664e+00 ...  0.00000000e+00
   0.00000000e+00  2.00000000e+00]
 [ 2.98216011e+00 -2.99535488e-02 -1.21796759e+00 ...  1.00000000e+00
   1.00000000e+00  2.00000000e+00]
 ...
 [-3.83742770e-01 -1.38417307e-02 -1.42001928e+00 ...  1.00000000e+00
   1.00000000e+00  4.00000000e+00]
 [-3.10528121e-16 -4.60653669e-02  2.63744806e-01 ...  1.00000000e+00
   1.00000000e+00  4.00000000e+00]
 [-1.14982457e+00  8.28291779e-02 -4.09760827e-01 ...  0.00000000e+00
   2.00000000e+00  6.00000000e+00]]
X_test_Transformed /n [[-3.10528121e-16  6.67173598e-02  1.27400326e+00 ...  0.00000000e+00
   2.00000000e+00  2.00000000e+00]
 [-2.60029570e-01  3.44937236e-02 -4.09760827e-01 ...  0.00000000e+00
   2.00000000e+00  6.00000000e+00]
 [-3.10528121e-16 -2.99535488e-02  2.63744806e-01 ...  1.00000000e+00


In [24]:
import tensorflow as tf

# Assuming y_test is a pandas Series object
# First, convert it to a NumPy array
y_test_np = y_test.to_numpy()

# Calculate the number of unique classes in y_test
num_classes = len(set(y_test_np))

# Transform y_test to ordinal labels
y_test_transformed = tf.keras.utils.to_categorical(y_test_np, num_classes)

print("y_test transformed to ordinal labels:")
print(y_test_transformed)


ModuleNotFoundError: No module named 'tensorflow'