In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, SpatialDropout1D, LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Bidirectional, LSTM

# import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix
#test train split
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter



In [2]:
# Load the cleaned posts dataset and display its column names as the cell output.
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')
df.columns

Index(['type', 'posts', 'cleaned_posts'], dtype='object')

In [3]:
# Fit a Keras tokenizer on the cleaned posts to learn the word -> integer-id vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['cleaned_posts'])
word_index = tokenizer.word_index

# One extra slot on top of the learned words: Keras tokenizer ids start at 1,
# which leaves id 0 free for the padding value.
VOCAB_SIZE = 1 + len(word_index)
print(VOCAB_SIZE)

135303


In [4]:
# Encode every cleaned post as a sequence of vocabulary ids.
X = tokenizer.texts_to_sequences(df['cleaned_posts'])

# Pad at the end ('post') up to the length of the longest post so that all
# sequences share one length and stack into a single rectangular array.
MAX_SEQ_LENGTH = max(map(len, X))
X = tf.keras.preprocessing.sequence.pad_sequences(
    X,
    padding='post',
    maxlen=MAX_SEQ_LENGTH,
)

In [5]:
import os

from gensim.models import Word2Vec, KeyedVectors

# Location of the pretrained GoogleNews word2vec binary.
# Set the WORD2VEC_PATH environment variable to point at your own copy; the
# original author's local path is kept as the fallback so existing setups keep working.
gn_vec_zip_path = os.environ.get(
    "WORD2VEC_PATH",
    "/Users/salmazainana/Downloads/GoogleNews-vectors-negative300.bin",
)

# binary=True: the file is read in the binary word2vec format.
# NOTE: the name `model` is rebound to the Keras network in a later cell, so
# downstream code should reach these vectors through a dedicated alias instead.
model = KeyedVectors.load_word2vec_format(gn_vec_zip_path, binary=True)


In [6]:
# Keep a dedicated handle on the loaded word2vec vectors: the generic name `model`
# is reassigned to the Keras network further down the notebook.
word2vec_model = model

In [7]:
# Build the (VOCAB_SIZE x embedding_size) lookup table for the embedding layer.
# Row k holds the pretrained vector of the token whose tokenizer id is k; tokens
# that the pretrained vectors do not contain keep their all-zero row.
embedding_size = 300  # presumably the dimensionality of the loaded vectors - confirm if the file changes
embedding_matrix = np.zeros((VOCAB_SIZE, embedding_size))
for token, row_idx in tokenizer.word_index.items():
    if token not in word2vec_model:
        continue  # out-of-vocabulary token: leave zeros
    # The KeyedVectors object is indexed directly (no `.wv` attribute involved).
    vector = word2vec_model[token]
    if vector is None:
        continue
    embedding_matrix[row_idx] = vector

In [8]:
# Encode the string class labels in df['type'] as integer ids.
# The fitted encoder `le` is kept so the ids can be mapped back to strings later.
le = LabelEncoder()
y = le.fit_transform(df['type'])

In [9]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [10]:
# Oversampling
# The features here are padded sequences of vocabulary ids, not continuous values.
# SMOTE synthesizes rows by interpolating between neighbouring feature vectors, and
# an interpolated token id points at an unrelated vocabulary entry, so the synthetic
# "posts" would be meaningless. Random oversampling balances the classes by
# duplicating real training rows instead, so every sequence remains genuine text.
from imblearn.over_sampling import RandomOverSampler

# Only the training split is resampled; the test split keeps its natural class balance.
# (The variable name `sm` is kept so that any later cell referring to it still works.)
sm = RandomOverSampler(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))


Resampled dataset shape Counter({8: 1462, 1: 1462, 3: 1462, 9: 1462, 0: 1462, 10: 1462, 2: 1462, 11: 1462, 4: 1462, 13: 1462, 15: 1462, 12: 1462, 6: 1462, 7: 1462, 14: 1462, 5: 1462})


In [11]:
# Decode the integer labels of both splits back to their original category strings.
y_train, y_test = (le.inverse_transform(encoded) for encoded in (y_train, y_test))

# Show the first decoded training label as a sanity check.
y_train[0]

'INFJ'

In [12]:
# Inspect the container type of the decoded training labels
# (the cell output shows the result).
type(y_train)

numpy.ndarray

In [13]:
# strip letter from the type so ['ENTJ'] becomes ['E','N','T','J']
# y_stripped = y.apply(lambda x: list(x.strip()))

# def preprocessing(y):
#     if y[0] == 'E':
#         y[0] = 1
#     else:
#         y[0] = 0
#     if y[1] == 'N':
#         y[1] = 1
#     else:
#         y[1] = 0
#     if y[2] == 'T':
#         y[2] = 1
#     else:
#         y[2] = 0
#     if y[3] == 'J':
#         y[3] = 1
#     else:
#         y[3] = 0

#     return y 

# y = y_stripped.apply(preprocessing)
# y = np.array(y.tolist())
# type(y)
def preprocess_types(y):
    """Convert four-letter type strings (e.g. 'ENTJ') into a binary indicator matrix.

    Each letter position is encoded independently:
    position 0: E -> 1, I -> 0; position 1: N -> 1, S -> 0;
    position 2: T -> 1, F -> 0; position 3: J -> 1, P -> 0.

    Surrounding whitespace is ignored and letters are matched case-insensitively.

    Parameters
    ----------
    y : iterable of str
        Type strings, one per sample (list, NumPy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_samples, 4); an empty input yields shape (0, 4).

    Raises
    ------
    ValueError
        If a string does not have exactly four letters or contains a letter
        that is not valid for its position.
    """
    # Define mappings for each character position
    mappings = {
        0: {'E': 1, 'I': 0},
        1: {'N': 1, 'S': 0},
        2: {'T': 1, 'F': 0},
        3: {'J': 1, 'P': 0}
    }
    n_axes = len(mappings)

    rows = []
    for type_str in y:
        letters = str(type_str).strip().upper()
        # Reject malformed strings up front: a short string would otherwise slip
        # through and silently produce a row with the wrong number of columns.
        if len(letters) != n_axes:
            raise ValueError(
                f"expected a {n_axes}-letter type string, got {type_str!r}"
            )
        try:
            rows.append([mappings[pos][char] for pos, char in enumerate(letters)])
        except KeyError:
            raise ValueError(f"invalid letter in type string {type_str!r}") from None

    # The reshape pins the column count so that an empty input still has 4 columns.
    y_binary = np.array(rows, dtype=int).reshape(-1, n_axes)

    return y_binary

# Replace the string labels of both splits with their binary indicator matrices.
y_train, y_test = map(preprocess_types, (y_train, y_test))

In [28]:
# Count how many training rows carry each distinct label vector.
# axis=0 makes np.unique compare whole rows rather than individual entries.
unique, counts = np.unique(y_train, axis=0, return_counts=True)
(unique, counts)

(array([[0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 0, 1],
        [0, 1, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [1, 0, 1, 0],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 0, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 1]]),
 array([1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462, 1462,
        1462, 1462, 1462, 1462, 1462]))

In [29]:
# Build and train a Conv1D + bidirectional-LSTM classifier on top of the frozen
# pretrained embedding matrix. The output layer has four independent sigmoid units,
# one per binary label column.
# NOTE: this rebinds the name `model`, which earlier held the word2vec vectors.
# All layers are taken from tf.keras so the network never mixes classes from the
# standalone `keras` package with `tensorflow.keras` ones.
model = Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, embedding_size, weights=[embedding_matrix], input_length=MAX_SEQ_LENGTH, trainable=False))

# Three convolutional stages with growing filter counts. Each stage is a
# 'same'-padded conv, a default-padded conv, then dropout.
for n_filters in (64, 128, 256):
    model.add(tf.keras.layers.Conv1D(n_filters, 3, padding='same', activation='relu'))
    model.add(tf.keras.layers.Conv1D(n_filters, 3, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)))
model.add(tf.keras.layers.Dense(4, activation='sigmoid'))

# With a 4-unit output the string 'acc' is resolved by Keras to categorical accuracy
# (argmax agreement), which is meaningless for multi-label targets. BinaryAccuracy
# scores each sigmoid unit independently; name='acc' keeps the history keys
# ('acc' / 'val_acc') unchanged.
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='acc')])

# `validation_split` takes the LAST fraction of the arrays, before any shuffling.
# The oversampler appends its generated rows after the original ones, so without
# this shuffle the validation set would consist almost entirely of oversampled rows.
# A fixed seed keeps the permutation reproducible; X_train / y_train are left untouched.
shuffle_idx = np.random.RandomState(42).permutation(len(X_train))
X_fit = np.asarray(X_train)[shuffle_idx]
y_fit = np.asarray(y_train)[shuffle_idx]

# Stop once validation loss has not improved for 3 epochs and roll back to the best weights.
history = model.fit(X_fit, y_fit,
                    validation_split=0.2,
                    epochs=30,
                    batch_size=128,
                    callbacks=[EarlyStopping(monitor='val_loss',
                                             patience=3,
                                             verbose=1,
                                             restore_best_weights=True,
                                             mode='min')])

Epoch 1/30
Epoch 2/30
 13/147 [=>............................] - ETA: 7:20 - loss: 0.6780 - acc: 0.3840

KeyboardInterrupt: 