In [27]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# import the necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, SpatialDropout1D, LSTM, GRU, Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold

#Oversampling library
from imblearn.over_sampling import SMOTE
from collections import Counter

# Word2Vec
from gensim.models import Word2Vec, KeyedVectors


In [55]:
#Download data 
df = pd.read_csv('cleaned_dataset.csv')

# replacing ||| with space
df["cleaned_posts"] = df["cleaned_posts"].str.replace(
    r"\|\|\|", " ", regex=True
)

#Tockenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['cleaned_posts'])
tokenizer.word_index # Get our learned vocabulary
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)+1 # Total words
print('Vocabulary size:', VOCAB_SIZE)


#Tockenize the words
X = tokenizer.texts_to_sequences(df['cleaned_posts'])
MAX_SEQ_LENGTH = max(len(seq) for seq in X)
print('Max sequence length:', MAX_SEQ_LENGTH)

#Pad the sequence
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen= MAX_SEQ_LENGTH, padding = 'post') # Pad the sequence to the same length to make it uniform
print('Padded sequence shape:', X.shape)


Vocabulary size: 135303
Max sequence length: 957
Padded sequence shape: (8675, 957)


In [56]:
# Word 2 vec model 
gn_vec_zip_path ="/Users/salmazainana/Downloads/GoogleNews-vectors-negative300.bin"
word2vec_model = KeyedVectors.load_word2vec_format(gn_vec_zip_path, binary=True)

# Create an embedding matrix
embedding_size = 300
print('Embedding size:', embedding_size)

embedding_matrix = np.zeros((VOCAB_SIZE, embedding_size))

for word, i in tokenizer.word_index.items():
    if word in word2vec_model:  # Use word2vec_model directly instead of word2vec_model.wv
        embedding_vector = word2vec_model[word]  # Same here
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Check the shape of the embedding matrix
print('Shape of embedding matrix:', embedding_matrix.shape)

# Check the embedding matrix
print('Embedding matrix[1]:', embedding_matrix[1])


Embedding size: 300
Shape of embedding matrix: (135303, 300)
Embedding matrix[1]: [-0.03662109  0.01452637  0.03515625  0.23046875 -0.20800781  0.26171875
 -0.13183594 -0.08740234  0.07519531  0.03881836 -0.19726562 -0.37109375
 -0.22460938 -0.05029297  0.14648438  0.08398438 -0.0625      0.3828125
  0.05664062 -0.09277344 -0.20898438  0.11035156  0.36132812  0.28710938
 -0.15332031 -0.16113281 -0.3828125  -0.05395508 -0.140625   -0.29101562
  0.18261719  0.09326172 -0.19628906 -0.00500488 -0.07910156  0.296875
 -0.38085938  0.44335938  0.3671875   0.20117188  0.07568359 -0.25585938
  0.1953125   0.10253906  0.23730469  0.00772095  0.1875     -0.20117188
 -0.09277344  0.10107422  0.0246582   0.18457031  0.19824219  0.19140625
 -0.05419922  0.13476562  0.00506592  0.10644531 -0.05322266 -0.18945312
 -0.10498047 -0.01611328 -0.26171875  0.05004883 -0.04882812 -0.3046875
 -0.00799561 -0.14257812 -0.359375    0.3671875   0.10546875  0.40234375
  0.11035156  0.08740234 -0.32226562 -0.058349

In [14]:
### Learning variables
LEARNING_RATE = 0.01
DROPOUT = 0.1
NUM_EPOCHS = 10
BATCH_SIZE = 32


In [53]:
# Build Model
model = Sequential()
model.add(
    Embedding(
        VOCAB_SIZE,
        embedding_size, #input vector size
        input_length=MAX_SEQ_LENGTH, #input length
        weights=[embedding_matrix],
        mask_zero=True,
        trainable=True,
    )
)
# model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
# model.add(GRU(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
model.add(
    LSTM(
        embedding_size,
        dropout=DROPOUT,
        recurrent_dropout=DROPOUT,
        activation="sigmoid",
        kernel_initializer="zeros",
    )
)
# model.add(Bidirectional(LSTM(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros')))
model.add(Dense(1, activation="sigmoid"))
optimizer = tf.keras.optimizers.Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model.compile(
    loss="binary_crossentropy", optimizer= optimizer, metrics=["accuracy"]
)
print(model.summary())



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 957, 300)          40590900  
                                                                 
 lstm_1 (LSTM)               (None, 300)               721200    
                                                                 
 dense_1 (Dense)             (None, 1)                 301       
                                                                 
Total params: 41312401 (157.59 MB)
Trainable params: 41312401 (157.59 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [46]:
y = df['type']


In [47]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Oversampling training test 
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))
print('example of resampled dataset', X_train[0], y_train[0])


Resampled dataset shape Counter({'INFJ': 1462, 'ENFP': 1462, 'ENTP': 1462, 'INFP': 1462, 'ENFJ': 1462, 'INTJ': 1462, 'ENTJ': 1462, 'INTP': 1462, 'ESFJ': 1462, 'ISFP': 1462, 'ISTP': 1462, 'ISFJ': 1462, 'ESTJ': 1462, 'ESTP': 1462, 'ISTJ': 1462, 'ESFP': 1462})
example of resampled dataset [  508   622    25  7602 47454    51  1258   174   343   112   435   495
   722   476  6584 16338  6407 90931 36784   233     9    23  1072  3544
     5     9    15   600     9  1030   175    21     3   159    17  3753
  9002  1012   431   385    72   830     2   191   350  2292   975   509
  4434   509   133   509   762  1973  1101  1483   945  3475  6141  2815
 14995   714  9817   225  2656   372   540   649  1076 13263 21424  1869
   192   520 90932    36    84   529   491  1212   590   133   136   419
  2679   708   425  3653     6  1084   396   945  3272     2  3164    28
    18  1534  1870  5984    71     8     3   927     6   656  1534   397
   174    38    91  1610  1667    87   111   874    17  

In [49]:
# label encode the target variable

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)



#one hot encode labels:

# y_train = to_categorical(y_train)
# y_test = to_categorical(y_test)

# #print example : 
# print('example of one hot encoded label', y_train[0])


In [57]:
k_fold = KFold(n_splits=6)
scores_k = []
confusion_k = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(X_train):
    x_train_k = X_train[train_indices]
    y_train_k = y_train[train_indices]
    x_test_k = X_train[test_indices]
    y_test_k = y_train[test_indices]

    model.fit(
        x_train_k,
        y_train_k,
        epochs = NUM_EPOCHS,
        batch_size = BATCH_SIZE,
    )

    predictions_k = model.predict(x_test_k) 
    confusion_k += confusion_matrix(y_test_k, predictions_k)
    score_k = accuracy_score(y_test_k, predictions_k)
    scores_k.append(score_k)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:

### Test set classification (individual posts)
model.fit(
    X_train,
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)

predictions = model.predict(X_test)
confusion = confusion_matrix(y_test, predictions)
score = accuracy_score(y_test, predictions)