### Kiva Loan Default Prediction


In [1]:
!pip install transformers matplotlib



In [2]:
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import pandas as pd

from transformers import DistilBertTokenizer, TFDistilBertModel

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Data Preparation

## Clean the text and your targets
Hints: 

3. These resources will help you understand what type of cleaning will be required and how you can encode your text for the network:
    - a) Preprocessing: https://huggingface.co/transformers/preprocessing.html
    - b) Summary of tokenizers (DistilBERT uses WordPiece): https://huggingface.co/transformers/tokenizer_summary.html#wordpiece
4. Consider the text length, is this too big/small for DistilBERT? what impact would padding/truncation have?
5. In load data you generated a profiling report of this dataset, might be helpful to review that as well

In [3]:
#Creating the dataframes
kiva_train = pd.read_csv('kiva_train.csv')
kiva_test = pd.read_csv('kiva_test.csv')

#Formatting kaggle dataframe to run in the predict function
kaggle_test = kiva_test['en_clean']
kaggle_test = kaggle_test.to_list()


kiva_train.head(10)

Unnamed: 0,loan_id,en_clean,defaulted
0,7779,She opened a colmado out of the side of her ho...,0
1,2777,(First Loan): Joffre continues to run his loc...,1
2,6007,"Dina Santana is the mother of two children, Ju...",0
3,76,"Rosemary is 50 years old, single, and has 6 ch...",1
4,4217,"Segundo has a shop where he sells animal feed,...",0
5,5077,"I am a single parent, mother of 4. I sell groc...",0
6,6033,Mariana Jose Serda owns a general store in Mag...,0
7,843,Mary is a mother of four children. One of the ...,1
8,4357,Wilson is seeking his second loan with Kiva an...,0
9,7007,Clara lives with her two children (ages: 20 an...,0


In [24]:
# Creating Training Data
X = kiva_train['en_clean']
y = kiva_train['defaulted']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = kaggle_test

### Tokenization of Data

In [32]:
def encode_text(text):
    
    print(type(text))
    model_inputs_and_masks = dbert_tokenizer(
        text.to_list(), 
        padding='max_length',
        return_tensors="tf",
        truncation=True,
        max_length=512
    )
    input_ids = model_inputs_and_masks['input_ids']
    attention_mask = model_inputs_and_masks['attention_mask']

    return input_ids, attention_mask

def prepare_target(raw_y):
    # 0 = "non-default"
    # 1 = "default"
    y = keras.utils.to_categorical(raw_y)
    y = np.array(y)
    return y



In [35]:
#Creating Data for distilBert

train_input, train_mask = encode_text(X_train)
train_y = y_train #prepare_target(y_train)

val_input, val_mask = encode_text(X_val)
val_y = y_val #prepare_target(y_val)


train_model_inputs_and_masks = {
    'inputs' : train_input,
    'masks' : train_mask
}

val_model_inputs_and_masks = {
    'inputs' : val_input,
    'masks' : val_mask
}


<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


# Modelling

Resources:
- DistilBERT paper: https://arxiv.org/abs/1910.01108
- DistilBERT Tensorflow Documentation: https://huggingface.co/transformers/model_doc/distilbert.html#tfdistilbertmodel

In [None]:
def build_model(base_model, trainable=True, params={}):
    
    max_seq_len = params["max_seq_len"]
    inputs = Input(shape = (max_seq_len,), dtype='int64', name='inputs')
    masks  = Input(shape = (max_seq_len,), dtype='int64', name='masks')

    base_model.trainable = trainable

    dbert_output = base_model.distilbert(inputs, attention_mask=masks)
    dbert_last_hidden_state = dbert_output.last_hidden_state

    # Any additional layers should go here
    # use the 'params' as a dictionary for hyper parameter to facilitate experimentation
    dbert_cls_output = dbert_last_hidden_state[:,0,:]
    # two fully connected layers with dropout. This can be tweaked
    # x = Dense(params["layer_width1"], activation='relu')(dbert_cls_output)
    # x = Dropout(params["dropout1"])(x)
    # x = Dense(params["layer_width2"], activation='relu')(x)
    # x = Dropout(params["dropout2"])(x)

    probs = Dense(1, activation='sigmoid')(dbert_cls_output)

    model = keras.Model(inputs=[inputs, masks], outputs=probs)
    model.summary()
    return model



In [None]:
def compile_model(model):
    # TODO: compile the model, include relevant auc metrics when training
    # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION
    # Hint: you may want to read up on the "multi_label" parameter in the keras AUC metrics
    model.compile(
        loss=keras.losses.BinaryCrossentropy(),
        optimizer=keras.optimizers.Adam(learning_rate=4e-5),
        metrics=[
            'accuracy', 
            keras.metrics.Precision(),
            keras.metrics.Recall()
        ]
    )
    
    return model

In [44]:
def train_model(model, model_inputs_and_masks_train, model_inputs_and_masks_val, y_train, y_val, batch_size, num_epochs):
    
    es = keras.callbacks.EarlyStopping(
        monitor="loss", 
        mode='min', 
        verbose=1,
        patience=2
    )
    
    history = model.fit(model_inputs_and_masks_train, y_train, batch_size=batch_size, epochs=num_epochs,verbose=1, callbacks=[es], steps_per_epoch=3)
    return model, history

# Execution




Use the cell below to execute and experiment with your model

In [None]:
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

params={"max_seq_len" : train_input.shape[1],
        "layer_width1" : 64,
        "dropout1" : 0.2,
        "layer_width2" : 32,
        "dropout2" : 0.2}

model = build_model(dbert_model, params=params)
model = compile_model(model)
model, history = train_model(model, train_model_inputs_and_masks, val_model_inputs_and_masks, train_y, val_y, batch_size=20, num_epochs=10)


In [40]:
from transformers import AutoTokenizer, TFAutoModel

xlnetPretrainedModel = TFAutoModel.from_pretrained("xlnet-base-cased")

params={"max_seq_len" : train_input.shape[1],
        "layer_width1" : 64,
        "dropout1" : 0.2,
        "layer_width2" : 32,
        "dropout2" : 0.2}

model = build_model(xlnetPretrainedModel, params=params)
model = compile_model(model)
model, history = train_model(model, train_model_inputs_and_masks, val_model_inputs_and_masks, train_y, val_y, batch_size=20, num_epochs=7)





Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


NameError: name 'build_model' is not defined

In [None]:
pred_df = pd.DataFrame(columns = ['preds'])

def create_preds(text):

    #Encode Text 
    inputs_and_masks = dbert_tokenizer(text, return_tensors="tf",padding='max_length',truncation=True,max_length=512)

    input_ids = inputs_and_masks['input_ids']
    attention_mask = inputs_and_masks['attention_mask']
    

    # Creating dictionary for ids and masks
    model_inputs_and_masks = {
    'inputs' : input_ids,
    'masks' : attention_mask 
    }

    # Get prediction from model
    prediction = model(model_inputs_and_masks, training=False)
    
    # # Store prediction
    pred = np.array(prediction)

    return pred

#  
pred_df = X_val.apply(create_preds)

new_preds = pred_df.apply(lambda x: x.flatten())
new_preds = new_preds.apply(lambda x: x.round())
new_preds = new_preds.apply(lambda x: int(x))

print(classification_report(y_val, new_preds))






### Kaggle Prediction

In [None]:
kaggle_pred = pd.DataFrame(columns = ['preds'])
kaggle_pred = kaggle_test.apply(create_preds)

In [None]:

new_preds = kaggle_pred.apply(lambda x: x.flatten())
new_preds = new_preds.apply(lambda x: x.round())
new_preds = new_preds.apply(lambda x: int(x))

#Convert to dataframe
kaggle_submission = pd.DataFrame(data = new_preds)

print(kaggle_submission)

#Add back in the loan IDs and format for Kaggle submission
Team_Spadina_Submission = pd.DataFrame({'id': kiva_test['loan_id'], 'predicted': kaggle_submission['en_clean']})

Team_Spadina_Submission.to_csv('Team_Spadina_Submission.csv')

In [None]:
model.save('Model_Transformer2')

In [1]:
import tensorflow.compat.v2 as tf

tf.enable_v2_behavior()

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

from tensorflow.python.compiler.mlcompute import mlcompute
mlcompute.set_mlc_device(device_name='gpu')


import matplotlib.pyplot as plt
import tensorflow.keras as keras
import pandas as pd


import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from transformers import AutoTokenizer, TFAutoModel, DistilBertTokenizer


In [26]:
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [43]:


params={"max_seq_len" : train_input.shape[1],
        "layer_width1" : 64,
        "dropout1" : 0.2,
        "layer_width2" : 32,
        "dropout2" : 0.2}

max_seq_len = params["max_seq_len"]
inputs = Input(shape = (max_seq_len,), dtype='int64', name='inputs')
masks  = Input(shape = (max_seq_len,), dtype='int64', name='masks')


x = tf.keras.Input(shape=X_train.shape[0], dtype='int64')
xlnetPretrainedModel = TFAutoModel.from_pretrained("xlnet-base-cased")
output = xlnetPretrainedModel(inputs, masks)

last_hidden_state = output.last_hidden_state
cls_token = last_hidden_state[:, 0, :]

x = xlnetPretrainedModel(x)

x = tf.keras.layers.Dense(1, activation='relu')(cls_token)
model = tf.keras.Model(inputs=[inputs, masks],outputs=x)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

model, history = train_model(model, train_model_inputs_and_masks, val_model_inputs_and_masks, train_y, val_y, batch_size=20, num_epochs=7)




Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 512)]        0                                            
__________________________________________________________________________________________________
masks (InputLayer)              [(None, 512)]        0                                            
__________________________________________________________________________________________________
tfxl_net_model_9 (TFXLNetModel) multiple             116718336   inputs[0][0]                     
                                                                 masks[0][0]                      
__________________________________________________________________________________________________
tf_op_layer_strided_slice_8 (Te [(None, 768)]        0           tfxl_net_model_9[0][0]     

ValueError: `validation_steps` should not be specified if `validation_data` is None.