In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from gensim.models import KeyedVectors

In [93]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [14]:
df = pd.read_json("/Users/ramir/Desktop/Carpeta/Nueva_carpeta/Programas/Laboratorios_IA/Lab_1/complaints.json")

In [15]:
df = df[['complaint_what_happened', 'product','consumer_disputed']].dropna()
df = df[df['consumer_disputed'] != 'N/A']
df = df[df['complaint_what_happened'] != '']

In [19]:
df = df[df['product'] == 'Mortgage']

In [22]:
df[df['consumer_disputed'] == 'Yes']

Unnamed: 0,complaint_what_happened,product,consumer_disputed
121126,We purchased a new home from XXXX XXXX in XXXX...,Mortgage,Yes
124822,Nationstar Mortgage requested information that...,Mortgage,Yes
124867,I submitted a form requesting acknowledgement ...,Mortgage,Yes
124900,Nationstar Mortgage became a servicer of my mo...,Mortgage,Yes
124935,"On XXXX XXXX, XXXX, I got a letter from Nation...",Mortgage,Yes
...,...,...,...
3979232,PNC Bank refuses to put the account on our cre...,Mortgage,Yes
3979770,We were in a current modification with SPS ( s...,Mortgage,Yes
3979790,Urgent Please HelpI purchased my personal resi...,Mortgage,Yes
3979925,I had a short sale back in 2009 which was nego...,Mortgage,Yes


In [23]:
df_temporal = pd.DataFrame()
df_temporal = df_temporal.append(df[df['consumer_disputed'] == 'No'].iloc[:8317], ignore_index=True)
df = df_temporal.append(df[df['consumer_disputed'] == 'Yes'], ignore_index=True)
df

  df_temporal = df_temporal.append(df[df['consumer_disputed'] == 'No'].iloc[:8317], ignore_index=True)
  df = df_temporal.append(df[df['consumer_disputed'] == 'Yes'], ignore_index=True)


Unnamed: 0,complaint_what_happened,product,consumer_disputed
0,Denied sufficient time to complete a ( short s...,Mortgage,No
1,Nationstar Mortgage withheld money from my mon...,Mortgage,No
2,My client is XXXX XXXX and I have been working...,Mortgage,No
3,My original note was owned by XXXX Bank and I ...,Mortgage,No
4,My mortgage was bought by Nationstar Mortgage ...,Mortgage,No
...,...,...,...
16629,PNC Bank refuses to put the account on our cre...,Mortgage,Yes
16630,We were in a current modification with SPS ( s...,Mortgage,Yes
16631,Urgent Please HelpI purchased my personal resi...,Mortgage,Yes
16632,I had a short sale back in 2009 which was nego...,Mortgage,Yes


In [97]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [198]:
# Function to preprocess text
def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Join the cleaned tokens back into a sentence
    cleaned_text = ' '.join(filtered_tokens)
    
    return cleaned_text

# Apply the preprocess_text function to the 'complain_what_happened' column
df['complaint_what_happened_cleaned'] = df['complaint_what_happened'].apply(preprocess_text)


In [210]:
# Function to split sentences into batches of 50 words
def split_sentences(row):
    words = word_tokenize(row['complaint_what_happened_cleaned'])
    batch_size = 50
    batches = [words[i:i + batch_size] for i in range(0, len(words), batch_size)]
    # Pad the last batch with empty strings if needed
    last_batch_size = len(batches[-1])
    if last_batch_size < batch_size:
        batches[-1] += [''] * (batch_size - last_batch_size)
    return pd.Series({'complaint_what_happened': batches, 'consumer_disputed': row['consumer_disputed']})

# Apply the function to each row
new_df = df.apply(split_sentences, axis=1).reset_index(drop=True)

In [212]:
# Explode the inner arrays into separate rows
df_expanded = new_df.explode('complaint_what_happened')

# Determine the maximum number of elements in any inner array
max_elements = df_expanded['complaint_what_happened'].apply(len).max()

# Create new columns from the exploded lists
for i in range(max_elements):
    df_expanded[f'Word{i + 1}'] = df_expanded['complaint_what_happened'].apply(lambda x: x[i] if len(x) > i else None)

# Drop the original column
df_expanded = df_expanded.drop('complaint_what_happened', axis=1)

# Reset the index
df_expanded = df_expanded.reset_index(drop=True)

In [214]:
# Specify the columns to merge
columns_to_merge = df_expanded.columns[1:51]

# Create a new column with the merged values
df_expanded['complaint_what_happened'] = df_expanded[columns_to_merge].apply(lambda row: ' '.join(row), axis=1)

# Drop the original columns
df_expanded.drop(columns=columns_to_merge, inplace=True)

# Keeping only the first column and the merged column
df_expanded = df_expanded[['consumer_disputed', 'complaint_what_happened']]

In [215]:
# Preprocess the text data
texts = df_expanded['complaint_what_happened'].values
labels = df_expanded['consumer_disputed'].map({'Yes': 1, 'No': 0}).values

In [216]:
# Split into trainning and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [217]:
# Tokenize the data
max_words = 23300
max_len = 50

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [218]:
import pickle

# Save the Tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [219]:
from gensim.models import KeyedVectors

# Loading the pre-trained Word2Vec model (Google's Word2Vec model)
word2vec_model = KeyedVectors.load_word2vec_format('Google_word2vec.bin', binary=True)

embedding_matrix = np.zeros((max_words, word2vec_model.vector_size))

for word, i in tokenizer.word_index.items():
    if i < max_words and word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [262]:
from keras.layers import GRU
from keras.optimizers import Adam

# Define the model to use
model = Sequential()
model.add(Embedding(max_words, word2vec_model.vector_size, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(GRU(16))
model.add(Dense(32, activation='tanh'))
model.add(Dense(32, activation='tanh'))
model.add(Dense(8, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

In [263]:
from tensorflow.keras import callbacks

# Define the optimizer
optimizer = Adam(learning_rate=0.00005)

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=15)

In [264]:
# Train the model
model.fit(X_train_pad, y_train, batch_size=64, epochs=300, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300


<keras.src.callbacks.History at 0x1b5051dba90>

In [265]:
# Test the model accuracy
score = model.evaluate(X_test_pad, y_test)
print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]}')

Test loss: 0.6742064356803894
Test accuracy: 0.5768890380859375


In [266]:
# model_json = model.to_json()
# with open('model.json', 'w') as json_file:
#    json_file.write(model_json)