## Preprocessing

In [3]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

In [4]:
def Preprocessor(text:str, stemmer: str='Snowball', remove_mbti: bool=False) -> list:
	'''
	Input: str
	Output: list
		Preprocessed tokens
	stemmer: str
		Can be 'Snowball' or 'Porter'. Default is Snowball.
	remove_mbti: bool
		Remove MBTI keywords like INTJ, ENFP, etc. Default is False.(Keep MBTI keywords.)
	'''
	# Cleaning
	text = re.sub(r'\|\|\|', ' ', text)  # Split by separator
	text = re.sub(r'http\S+', ' ', text)  # Replace hyperlink
	text = re.sub(r"[A-Za-z]+\'+\w+", ' ', text)  # Handling apostrophe (e.g. you've, there's)
	text = re.sub('[^0-9a-zA-Z]',' ', text)  # Keep only numbers and alphabets (remove special characters)
	text = text.lower()
	if remove_mbti == True:
		text = re.sub('intj|intp|entj|entp|infp|enfj|enfp|istj|isfj|estj|esfj|istp|isfp|estp|esfp|infj', '', text)
  	# Tokenization
	tokens = word_tokenize(text)
	filtered_tokens = [w for w in tokens if not w in stopwords.words('english')]  # Remove stopwords
	# Stemming
	stemmer_ = SnowballStemmer("english")
	if stemmer == 'Porter|porter':
		stemmer_ = PorterStemmer()
	if stemmer not in ['Snowball', 'snowball', 'Porter', 'porter']:
		raise ValueError("Please check passed argument: stemmer must be 'Snowball' or 'Porter'")
	stemmed = [stemmer_.stem(t) for t in filtered_tokens]
	# Lemmatizing
	lemma = WordNetLemmatizer()
	# lemmatized = [lemma.lemmatize(t) for t in stemmed]
	lemmatized = " ".join([lemma.lemmatize(w) for w in stemmed])   # .join() -> 用空格分開每個字
	return lemmatized

## Part 1 : training data : snowball stemmer with removing MBTI

In [5]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data_snow_removeMBTI = data.copy()

for d in range(len(data)):
    post = data.loc[d, 'posts']

    txt_snow_removenMBTI = Preprocessor(post, remove_mbti=True)

    data_snow_removeMBTI.posts[d] = txt_snow_removenMBTI

list_posts = []
for i in range(len(data_snow_removeMBTI)):
    post = data_snow_removeMBTI.loc[i, 'posts']
    list_posts.append(post)
list_posts = np.array(list_posts)


from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizing the database posts to a matrix of token counts for the model
cntizer = CountVectorizer() 
                        
# the feature should be made of word n-gram 
# Learn the vocabulary dictionary and return term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_posts)
col_name = cntizer.get_feature_names_out()   # 紀錄 sparse matrix 的字分別是哪些字


# For the Standardization or Feature Scaling Stage :-
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")

print("Now the dataset size is as below")
X_tfidf =  tfizer.fit_transform(X_cnt).toarray()
X_tfidf.shape

Using CountVectorizer :

Using Tf-idf :
Now the dataset size is as below


(8675, 77959)

In [6]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data.type.value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [7]:
label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label)   # Y before train-test-split
label = pd.DataFrame(Y, columns=['LABEL'])

from keras.utils.np_utils import to_categorical
labels = to_categorical(label['LABEL'], num_classes=16)
labels

  y = column_or_1d(y, warn=True)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [14]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

n_most_common_words = 8000
max_len = 100   # 保留每筆資料的前500個字
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data_snow_removeMBTI['posts'].values)
sequences = tokenizer.texts_to_sequences(data_snow_removeMBTI['posts'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len)

Found 77994 unique tokens.


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X , labels, test_size=0.25, random_state=42)

epochs = 100
emb_dim = 128
batch_size = 256
labels[:2]

print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

((6506, 100), (6506, 16), (2169, 100), (2169, 16))


In [20]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.models import Sequential

model = Sequential()
model.add(Embedding(1000, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(16, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          128000    
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 100, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
Total params: 178,448
Trainable params: 178,448
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100


2022-11-23 13:30:26.824512: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-23 13:40:16.870941: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [17]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 2.277
  Accuracy: 0.213


In [18]:
result = model.predict(X_test)
result

2022-11-22 18:17:17.413036: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




array([[0.02163923, 0.07823421, 0.02876924, ..., 0.03419801, 0.02511852,
        0.04449378],
       [0.02109298, 0.08025363, 0.02836107, ..., 0.03072802, 0.0242075 ,
        0.04046316],
       [0.02021379, 0.07784317, 0.0282184 , ..., 0.03165273, 0.02338939,
        0.0400547 ],
       ...,
       [0.02045593, 0.08671936, 0.02453822, ..., 0.02723196, 0.0219667 ,
        0.03612923],
       [0.02202559, 0.08268887, 0.02881961, ..., 0.03254695, 0.02449911,
        0.04150951],
       [0.02211777, 0.08327577, 0.02706769, ..., 0.02989568, 0.02387635,
        0.03850861]], dtype=float32)