## Preprocessing

In [3]:
### import package ###

# from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
# from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chenenying/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chenenying/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/chenenying/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenenying/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

#### 定義前處理function

In [11]:
def Preprocessor(text:str, stemmer: str='Snowball', remove_mbti: bool=False) -> list:
	'''
	Input: str
	Output: list
		Preprocessed tokens
	stemmer: str
		Can be 'Snowball' or 'Porter'. Default is Snowball.
	remove_mbti: bool
		Remove MBTI keywords like INTJ, ENFP, etc. Default is False.(Keep MBTI keywords.)
	'''
	# Cleaning
	text = re.sub(r'\|\|\|', ' ', text)  # Split by separator
	text = re.sub(r'http\S+', ' ', text)  # Replace hyperlink
	text = re.sub(r"[A-Za-z]+\'+\w+", ' ', text)  # Handling apostrophe (e.g. you've, there's)
	text = re.sub('[^0-9a-zA-Z]',' ', text)  # Keep only numbers and alphabets (remove special characters)
	text = text.lower()
	if remove_mbti == True:
		text = re.sub('intj|intp|entj|entp|infp|enfj|enfp|istj|isfj|estj|esfj|istp|isfp|estp|esfp|infj', '', text)
  	# Tokenization
	tokens = word_tokenize(text)
	filtered_tokens = [w for w in tokens if not w in stopwords.words('english')]  # Remove stopwords
	# Stemming
	stemmer_ = SnowballStemmer("english")
	if stemmer == 'Porter|porter':
		stemmer_ = PorterStemmer()
	if stemmer not in ['Snowball', 'snowball', 'Porter', 'porter']:
		raise ValueError("Please check passed argument: stemmer must be 'Snowball' or 'Porter'")
	stemmed = [stemmer_.stem(t) for t in filtered_tokens]
	# Lemmatizing
	lemma = WordNetLemmatizer()
	# lemmatized = [lemma.lemmatize(t) for t in stemmed]
	lemmatized = " ".join([lemma.lemmatize(w) for w in stemmed])   # .join() -> 用空格分開每個字
	return lemmatized

#### 測試前處理

In [12]:
txt = "http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments sportly center not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  prank|||What has been the most life-changing experience in your life?|||The last thing my sportly INFJ friend posted on his facebook: Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. It's only natural for a relationship to not be perfection all the time in every moment of existence."

# stemmer=snowball, remove_MBTI=false
txt_snow = Preprocessor(txt)

# stemmer = snowball, remove_MBTI = true
txt_snow_removenMBTI = Preprocessor(txt, remove_mbti=True)

# stemmer = porter, remove_MBTI = false
txt_porter = Preprocessor(txt, stemmer='Porter')

# stemmer = porter, remove_MBTI = true
txt_porter_removeMBTI = Preprocessor(txt, stemmer='porter', remove_mbti=True)

In [13]:
txt_snow

'enfp intj moment sport center top ten play prank life chang experi life last thing sport infj friend post facebook rest peac hello enfj7 natur relationship perfect time everi moment exist'

## Model使用：XGBoost

- part 1 : remove_MBTI = True
- part 2 : remove_MBTI = False
- part 3 : remove_MBTI = True, 4 models
- part 4 : remove_MBTI = True, combine of 4 models

## Part 1 : 
## training data : snowball stemmer with remove_MBTI = True

In [14]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data_snow_removeMBTI = data.copy()

for d in range(len(data)):
    post = data.loc[d, 'posts']

    txt_snow_removenMBTI = Preprocessor(post, remove_mbti=True)

    data_snow_removeMBTI.posts[d] = txt_snow_removenMBTI

In [15]:
list_posts = []
for i in range(len(data_snow_removeMBTI)):
    post = data_snow_removeMBTI.loc[i, 'posts']
    list_posts.append(post)
list_posts = np.array(list_posts)
list_posts[0]

'moment sportscent top ten play prank life chang experi life repeat today may perc experi immers last thing friend post facebook commit suicid next day rest peac hello 7 sorri hear distress natur relationship perfect time everi moment exist tri figur hard time time growth 84389 84390 welcom stuff game set match prozac wellbrutin least thirti minut move leg mean move sit desk chair weed moder mayb tri edibl healthier altern basic come three item determin type whichev type want would like use given type cognit function whatnot left thing moder sim inde video game good one note good one somewhat subject complet promot death given sim dear favorit video game grow current favorit video game cool appear late sad someon everyon wait thought confid good thing cherish time solitud b c revel within inner world wherea time workin enjoy time worri peopl alway around yo ladi complimentari person well hey main social outlet xbox live convers even verbal fatigu quick realli dig part 1 46 2 50 ban thr

In [16]:
list_posts = []
for i in range(len(data_snow_removeMBTI)):
    post = data_snow_removeMBTI.loc[i, 'posts']
    list_posts.append(post)
list_posts = np.array(list_posts)

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizing the database posts to a matrix of token counts for the model
cntizer = CountVectorizer() 
                        
# the feature should be made of word n-gram 
# Learn the vocabulary dictionary and return term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_posts)
col_name = cntizer.get_feature_names_out()   # 紀錄 sparse matrix 的字分別是哪些字


# For the Standardization or Feature Scaling Stage :-
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")

print("Now the dataset size is as below")
X_tfidf =  tfizer.fit_transform(X_cnt).toarray()
X_tfidf.shape

Using CountVectorizer :

Using Tf-idf :
Now the dataset size is as below


(8675, 77959)

### Dimension Reduction

In [9]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf)
result

array([[ 2.18321351e-01, -2.45834207e-02, -3.84752551e-02, ...,
         5.50076640e-03, -4.15293961e-02, -4.41515198e-03],
       [ 3.47231658e-01,  1.44933018e-02,  4.14309011e-04, ...,
         7.04989659e-03, -3.06592309e-02, -2.20784484e-02],
       [ 2.95310602e-01, -2.54562934e-02, -3.38758266e-02, ...,
        -2.09563284e-02, -1.46703506e-02,  7.36759977e-03],
       ...,
       [ 2.94277665e-01, -3.72974752e-02, -1.79301326e-02, ...,
        -9.23416395e-03, -1.10198841e-02,  6.83930269e-03],
       [ 5.03995256e-01, -2.93056982e-03, -4.57695322e-02, ...,
        -2.71498123e-02, -3.37654209e-02, -1.60298056e-02],
       [ 4.73248056e-01, -7.81700205e-02, -2.36295542e-02, ...,
         2.18301947e-02,  5.85967817e-02,  1.52859661e-02]])

In [10]:
df_tfidf = pd.DataFrame(result)
df_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.218321,-0.024583,-0.038475,0.051253,0.052573,-0.098390,-0.012897,-0.012861,0.007571,0.026597,...,0.046387,0.004768,0.035041,-0.022150,-0.033650,-0.001184,0.003872,0.005501,-0.041529,-0.004415
1,0.347232,0.014493,0.000414,0.002264,0.022911,-0.027197,-0.053938,-0.050517,0.010498,-0.031986,...,-0.011009,-0.024145,-0.033613,-0.018406,-0.001637,-0.010336,-0.019465,0.007050,-0.030659,-0.022078
2,0.295311,-0.025456,-0.033876,0.015701,0.017684,-0.048588,-0.011597,-0.027767,0.025643,-0.018619,...,0.039269,-0.038023,0.009555,-0.018155,0.010268,-0.002686,0.019000,-0.020956,-0.014670,0.007368
3,0.365347,0.138809,-0.019489,0.044647,-0.018550,-0.006768,0.001171,0.075980,-0.030990,-0.035420,...,-0.012199,-0.019985,0.022097,-0.006371,0.009820,0.013881,-0.020020,-0.027712,0.013648,0.012111
4,0.306189,-0.015107,-0.028018,0.074865,-0.032701,0.027101,0.010016,-0.008506,-0.083858,-0.042293,...,-0.026158,-0.009088,-0.017805,0.016174,0.002986,-0.007500,-0.009730,0.018737,0.000721,-0.009504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0.283351,-0.021577,-0.029227,0.041018,0.016443,-0.048257,-0.009345,-0.010493,-0.074849,-0.002368,...,0.045288,-0.007694,0.008313,0.001655,-0.002699,-0.021647,0.009463,0.028501,0.010709,0.011473
8671,0.408075,0.009098,-0.022288,-0.009931,0.037114,-0.089582,0.025174,-0.016819,-0.056482,0.000131,...,-0.022417,-0.027183,-0.010989,0.042422,0.025413,0.014192,0.015489,-0.027111,0.004296,-0.021201
8672,0.294278,-0.037297,-0.017930,0.092755,-0.007925,-0.026852,0.017868,-0.005668,-0.018594,-0.076833,...,-0.012156,0.003705,0.004381,-0.060154,-0.010956,-0.008312,-0.039472,-0.009234,-0.011020,0.006839
8673,0.503995,-0.002931,-0.045770,-0.008551,-0.016652,0.011997,0.038163,0.026382,0.011663,-0.010993,...,0.028005,0.004002,0.005614,0.030821,-0.028366,-0.038388,0.000297,-0.027150,-0.033765,-0.016030


In [100]:
label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label) 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf, Y, test_size=0.33, random_state=42)
eval_set = [(X_test, y_test)]

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

#XG boost Classifier
param = {}

param['n_estimators'] = 100
param['max_depth'] = 2
param['learning_rate'] = 0.2

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_train,y_train)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

  y = column_or_1d(y, warn=True)


Accuracy: 34.72%


### grid search for XGboost(remove_MBTI = True, Model = XGboost, data = raw data)

In [101]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
learning_rate=[round(float(x),2) for x in np.linspace(start=0.1, stop=0.2, num=5)]

best_nest = 0
best_lr = 0
best_acc=0

for nest in n_estimators:
    local_acc = 0
    local_lr = 0
    for lr in learning_rate:
        param = {}
        param['n_estimators'] = nest
        param['max_depth'] = 2
        param['learning_rate'] = lr

        xgb = XGBClassifier(**param)
        xgb_model = xgb.fit(X_train,y_train)

        Y_pred = xgb_model.predict(X_test)
        predictions = [round(value) for value in Y_pred]

        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)

        if accuracy > local_acc:
            local_acc = accuracy
            local_lr = lr

        if accuracy > best_acc:
            best_acc = accuracy
            best_nest = nest
            best_lr = lr
    
    print(f'n_estimator = {nest}, learning_rate = {local_lr}, best accuracy = {local_acc} ')

print(f'Result : best n_estimator = {best_nest}, best learning_rate = {best_lr}, best accuracy = {best_acc} ')

n_estimator = 100, learning_rate = 0.2, best accuracy = 0.3471882640586797 
n_estimator = 200, learning_rate = 0.18, best accuracy = 0.3471882640586797 
n_estimator = 300, learning_rate = 0.18, best accuracy = 0.34963325183374083 
n_estimator = 400, learning_rate = 0.2, best accuracy = 0.35103038770520434 
n_estimator = 500, learning_rate = 0.12, best accuracy = 0.3517289556409361 
n_estimator = 600, learning_rate = 0.15, best accuracy = 0.3499825358016067 
n_estimator = 700, learning_rate = 0.12, best accuracy = 0.354872511351729 
n_estimator = 800, learning_rate = 0.12, best accuracy = 0.35207823960880197 
n_estimator = 900, learning_rate = 0.1, best accuracy = 0.3545232273838631 
n_estimator = 1000, learning_rate = 0.1, best accuracy = 0.3545232273838631 
Result : best n_estimator = 700, best learning_rate = 0.12, best accuracy = 0.354872511351729 


#### Result
remove_MBTI = True, data = raw data

In [11]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf)
df_tfidf = pd.DataFrame(result)    # X before train-test-split

label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label)   # Y before train-test-split

# train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf, Y, test_size=0.33, random_state=42)

#XG boost Classifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param = {}

param['n_estimators'] = 700 # after tunning
param['max_depth'] = 2
param['learning_rate'] = 0.12 # after tunning

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_train,y_train)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

precision_mi = precision_score(y_test, predictions, average='micro')
print("Micro Precision: %.2f%%" % (precision_mi * 100.0))
precision_ma = precision_score(y_test, predictions, average='macro')
print("Macro Precision: %.2f%%" % (precision_ma * 100.0))

recall_mi = recall_score(y_test, predictions, average='micro')
print("Micro Recall: %.2f%%" % (recall_mi * 100.0))
recall_ma = recall_score(y_test, predictions, average='macro')
print("Macro Recall: %.2f%%" % (recall_ma * 100.0))

f1score_mi = f1_score(y_test, predictions, average='micro')
print("Micro F1score: %.2f%%" % (f1score_mi * 100.0))
f1score_ma = f1_score(y_test, predictions, average='macro')
print("Macro F1score: %.2f%%" % (f1score_ma * 100.0))

  y = column_or_1d(y, warn=True)


Accuracy: 36.19%
Micro Precision: 36.19%
Macro Precision: 23.16%
Micro Recall: 36.19%
Macro Recall: 16.78%
Micro F1score: 36.19%
Macro F1score: 16.81%


  _warn_prf(average, modifier, msg_start, len(result))


#### Result for XGboost after grid search : 
remove_MBTI = True, data = SMOTE

In [21]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf)
df_tfidf = pd.DataFrame(result)    # X before train-test-split

label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label)   # Y before train-test-split

# train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf, Y, test_size=0.33, random_state=42)

# SMOTE
from imblearn.over_sampling import SMOTE
X_re, y_re = SMOTE(random_state=42).fit_resample(X_train, y_train)


# grid search
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
learning_rate=[round(float(x),2) for x in np.linspace(start=0.1, stop=0.2, num=5)]

best_nest = 0
best_lr = 0
best_acc=0

for nest in n_estimators:
    local_acc = 0
    local_lr = 0
    for lr in learning_rate:
        param = {}
        param['n_estimators'] = nest
        param['max_depth'] = 2
        param['learning_rate'] = lr

        xgb = XGBClassifier(**param)
        xgb_model = xgb.fit(X_re,y_re)

        Y_pred = xgb_model.predict(X_test)
        predictions = [round(value) for value in Y_pred]

        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)

        if accuracy > local_acc:
            local_acc = accuracy
            local_lr = lr

        if accuracy > best_acc:
            best_acc = accuracy
            best_nest = nest
            best_lr = lr
    
    print(f'n_estimator = {nest}, learning_rate = {local_lr}, best accuracy = {local_acc} ')

print(f'Result : best n_estimator = {best_nest}, best learning_rate = {best_lr}, best accuracy = {best_acc} ')
print('-----------------------------------------------------------')



#XG boost Classifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param = {}

param['n_estimators'] = best_nest # after tunning
param['max_depth'] = 2
param['learning_rate'] = best_lr # after tunning

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_re, y_re)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

print('Result for XGboost after grid search (remove_MBTI = True, Model = XGboost, data = SMOTE)')
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

precision_mi = precision_score(y_test, predictions, average='micro')
print("Micro Precision: %.2f%%" % (precision_mi * 100.0))
precision_ma = precision_score(y_test, predictions, average='macro')
print("Macro Precision: %.2f%%" % (precision_ma * 100.0))

recall_mi = recall_score(y_test, predictions, average='micro')
print("Micro Recall: %.2f%%" % (recall_mi * 100.0))
recall_ma = recall_score(y_test, predictions, average='macro')
print("Macro Recall: %.2f%%" % (recall_ma * 100.0))

f1score_mi = f1_score(y_test, predictions, average='micro')
print("Micro F1score: %.2f%%" % (f1score_mi * 100.0))
f1score_ma = f1_score(y_test, predictions, average='macro')
print("Macro F1score: %.2f%%" % (f1score_ma * 100.0))

  y = column_or_1d(y, warn=True)


n_estimator = 100, learning_rate = 0.2, best accuracy = 0.26720223541739435 
n_estimator = 200, learning_rate = 0.2, best accuracy = 0.29269996507160323 
n_estimator = 300, learning_rate = 0.18, best accuracy = 0.3028292001397136 
n_estimator = 400, learning_rate = 0.2, best accuracy = 0.3108627314006287 
n_estimator = 500, learning_rate = 0.18, best accuracy = 0.31610199091861685 
n_estimator = 600, learning_rate = 0.18, best accuracy = 0.32238910234020257 
n_estimator = 700, learning_rate = 0.2, best accuracy = 0.3293747816975201 
n_estimator = 800, learning_rate = 0.18, best accuracy = 0.3346140412155082 
n_estimator = 900, learning_rate = 0.18, best accuracy = 0.33391547327977644 
n_estimator = 1000, learning_rate = 0.2, best accuracy = 0.3363604610548376 
Result : best n_estimator = 1000, best learning_rate = 0.2, best accuracy = 0.3363604610548376 
-----------------------------------------------------------
Result for XGboost after grid search (remove_MBTI = True, Model = XGboost

  _warn_prf(average, modifier, msg_start, len(result))


## Part 2 : 
## training data : snowball stemmer with remove_MBTI = False

In [13]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data_snow_preserveMBTI = data.copy()

for d in range(len(data)):
    post = data.loc[d, 'posts']

    txt_snow_preserveMBTI = Preprocessor(post, remove_mbti=False)

    data_snow_preserveMBTI.posts[d] = txt_snow_preserveMBTI

In [14]:
list_posts_preserveMBTI = []
for i in range(len(data_snow_preserveMBTI)):
    post = data_snow_preserveMBTI.loc[i, 'posts']
    list_posts_preserveMBTI.append(post)
list_posts_preserveMBTI = np.array(list_posts_preserveMBTI)
list_posts_preserveMBTI[0]

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizing the database posts to a matrix of token counts for the model
cntizer = CountVectorizer() 
                        
# the feature should be made of word n-gram 
# Learn the vocabulary dictionary and return term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_posts_preserveMBTI)
col_name = cntizer.get_feature_names_out()   # 紀錄 sparse matrix 的字分別是哪些字


# For the Standardization or Feature Scaling Stage :-
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")

print("Now the dataset size is as below")
X_tfidf_preserveMBTI =  tfizer.fit_transform(X_cnt).toarray()
X_tfidf_preserveMBTI.shape

Using CountVectorizer :

Using Tf-idf :
Now the dataset size is as below


(8675, 78195)

### Dimension Reduction

In [15]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf_preserveMBTI)
result

array([[ 2.23610681e-01, -2.02310166e-02, -3.40873630e-02, ...,
         4.18226318e-03, -1.48110021e-02,  4.71118238e-02],
       [ 3.54135836e-01,  3.75235553e-02, -1.80221282e-04, ...,
        -1.69594789e-02, -1.31130966e-02, -2.45278845e-02],
       [ 2.94517923e-01, -4.25355192e-02, -4.69807402e-02, ...,
         2.55843178e-05, -3.21370399e-02, -5.93011647e-03],
       ...,
       [ 2.95972693e-01, -4.04896288e-02, -7.56407070e-02, ...,
        -3.01270711e-02, -4.87913584e-03, -1.94552880e-02],
       [ 5.03399731e-01, -2.01640743e-02, -1.85485433e-02, ...,
        -7.63944536e-03, -3.04604701e-03, -5.71329570e-03],
       [ 4.64898298e-01, -1.16113478e-01, -2.54154557e-02, ...,
         4.92052934e-03,  8.16109107e-03,  1.90547896e-02]])

In [16]:
df_tfidf_preserveMBTI = pd.DataFrame(result)
df_tfidf_preserveMBTI

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.223611,-0.020231,-0.034087,0.000962,0.063015,-0.073717,0.057205,0.017520,0.047126,-0.010016,...,-0.025138,0.000732,0.036692,0.014167,0.015217,0.018207,0.021154,0.004182,-0.014811,0.047112
1,0.354136,0.037524,-0.000180,0.010968,0.036568,-0.070150,-0.056896,-0.022549,-0.010898,0.019569,...,0.012253,0.026911,0.007903,0.025391,-0.001629,-0.011122,-0.028776,-0.016959,-0.013113,-0.024528
2,0.294518,-0.042536,-0.046981,0.000538,0.018094,-0.029436,0.040829,-0.010940,0.002424,-0.018426,...,-0.022780,0.000692,-0.031226,-0.004326,0.006249,-0.020192,0.021526,0.000026,-0.032137,-0.005930
3,0.370912,0.117969,-0.071086,0.034069,-0.019328,0.017335,0.025102,-0.007058,0.021839,0.045205,...,0.005520,0.009013,-0.033306,-0.012315,-0.036751,-0.019653,0.004027,0.003232,-0.011458,0.011685
4,0.303496,-0.033966,-0.067459,0.052238,-0.015558,0.000597,-0.048641,0.006804,-0.028214,0.025108,...,-0.023257,-0.004382,0.034981,-0.035670,0.012652,0.001482,-0.013012,0.017438,-0.032007,0.017551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0.279177,-0.049432,-0.052027,0.021983,0.021391,-0.031876,0.036212,-0.031795,-0.038034,0.009174,...,-0.008241,-0.008045,0.052462,0.000162,0.024739,0.003352,0.019685,0.005127,-0.016426,-0.013010
8671,0.404503,-0.003132,0.007871,-0.026125,0.039692,-0.080074,0.060411,0.016189,-0.055238,0.053366,...,-0.032098,-0.031473,-0.024878,0.030667,0.015289,-0.003076,-0.024281,0.060890,0.000300,0.015073
8672,0.295973,-0.040490,-0.075641,0.080334,0.011291,-0.043678,-0.014257,0.009159,-0.024012,-0.036531,...,-0.033448,0.009340,0.007724,-0.011008,-0.004893,0.000511,-0.009006,-0.030127,-0.004879,-0.019455
8673,0.503400,-0.020164,-0.018549,-0.045825,-0.024716,0.029856,0.017269,0.004324,-0.000342,-0.019251,...,0.030555,0.044189,0.030708,-0.025583,-0.054790,0.048147,-0.000105,-0.007639,-0.003046,-0.005713


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf_preserveMBTI, Y, test_size=0.33, random_state=42)
eval_set = [(X_test, y_test)]

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

#XG boost Classifier
param = {}

param['n_estimators'] = 200
param['max_depth'] = 2
param['learning_rate'] = 0.2

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_train,y_train)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 65.07%


#### Result for XGboost after grid search : 
remove_MBTI = False, data = raw data

In [22]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf_preserveMBTI)
df_tfidf_preserveMBTI = pd.DataFrame(result)    # X before train-test-split

label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label) 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf_preserveMBTI, Y, test_size=0.33, random_state=42)

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
learning_rate=[round(float(x),2) for x in np.linspace(start=0.1, stop=0.2, num=5)]

best_nest = 0
best_lr = 0
best_acc=0

for nest in n_estimators:
    local_acc = 0
    local_lr = 0
    for lr in learning_rate:
        param = {}
        param['n_estimators'] = nest
        param['max_depth'] = 2
        param['learning_rate'] = lr

        xgb = XGBClassifier(**param)
        xgb_model = xgb.fit(X_train,y_train)

        Y_pred = xgb_model.predict(X_test)
        predictions = [round(value) for value in Y_pred]

        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)

        if accuracy > local_acc:
            local_acc = accuracy
            local_lr = lr

        if accuracy > best_acc:
            best_acc = accuracy
            best_nest = nest
            best_lr = lr
    
    print(f'n_estimator = {nest}, learning_rate = {local_lr}, best accuracy = {local_acc} ')

print(f'Result : best n_estimator = {best_nest}, best learning_rate = {best_lr}, best accuracy = {best_acc} ')
print('---------------------------------------------------------------')



param = {}

param['n_estimators'] = best_nest # after tunning
param['max_depth'] = 2
param['learning_rate'] = best_lr # after tunning

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_train,y_train)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

print('Result for XGboost after grid search (remove_MBTI = False, Model = XGboost, data = raw data)')
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

precision_mi = precision_score(y_test, predictions, average='micro')
print("Micro Precision: %.2f%%" % (precision_mi * 100.0))
precision_ma = precision_score(y_test, predictions, average='macro')
print("Macro Precision: %.2f%%" % (precision_ma * 100.0))

recall_mi = recall_score(y_test, predictions, average='micro')
print("Micro Recall: %.2f%%" % (recall_mi * 100.0))
recall_ma = recall_score(y_test, predictions, average='macro')
print("Macro Recall: %.2f%%" % (recall_ma * 100.0))

f1score_mi = f1_score(y_test, predictions, average='micro')
print("Micro F1score: %.2f%%" % (f1score_mi * 100.0))
f1score_ma = f1_score(y_test, predictions, average='macro')
print("Macro F1score: %.2f%%" % (f1score_ma * 100.0))

  y = column_or_1d(y, warn=True)


n_estimator = 100, learning_rate = 0.15, best accuracy = 0.650716032134125 
n_estimator = 200, learning_rate = 0.2, best accuracy = 0.656653859587845 
n_estimator = 300, learning_rate = 0.15, best accuracy = 0.6559552916521132 
n_estimator = 400, learning_rate = 0.15, best accuracy = 0.6556060076842473 
n_estimator = 500, learning_rate = 0.2, best accuracy = 0.6570031435557108 
n_estimator = 600, learning_rate = 0.12, best accuracy = 0.6584002794271743 
n_estimator = 700, learning_rate = 0.18, best accuracy = 0.6604959832343695 
n_estimator = 800, learning_rate = 0.12, best accuracy = 0.6587495633950402 
n_estimator = 900, learning_rate = 0.18, best accuracy = 0.6577017114914425 
n_estimator = 1000, learning_rate = 0.18, best accuracy = 0.6580509954593085 
Result : best n_estimator = 700, best learning_rate = 0.18, best accuracy = 0.6604959832343695 
---------------------------------------------------------------
Result for XGboost after grid search (remove_MBTI = False, Model = XGboos

#### Result for XGboost after grid search : 
remove_MBTI = False, data = SMOTE

In [23]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
result = svd.fit_transform(X_tfidf_preserveMBTI)
df_tfidf_preserveMBTI = pd.DataFrame(result)    # X before train-test-split

label = data.loc[:,['type']]
from sklearn.preprocessing import LabelEncoder
Y = LabelEncoder().fit_transform(label)   # Y before train-test-split

# train-test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_tfidf_preserveMBTI, Y, test_size=0.33, random_state=42)

# SMOTE
from imblearn.over_sampling import SMOTE
X_re, y_re = SMOTE(random_state=42).fit_resample(X_train, y_train)

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
learning_rate=[round(float(x),2) for x in np.linspace(start=0.1, stop=0.2, num=5)]

best_nest = 0
best_lr = 0
best_acc=0

for nest in n_estimators:
    local_acc = 0
    local_lr = 0
    for lr in learning_rate:
        param = {}
        param['n_estimators'] = nest
        param['max_depth'] = 2
        param['learning_rate'] = lr

        xgb = XGBClassifier(**param)
        xgb_model = xgb.fit(X_re,y_re)

        Y_pred = xgb_model.predict(X_test)
        predictions = [round(value) for value in Y_pred]

        # evaluate predictions
        accuracy = accuracy_score(y_test, predictions)

        if accuracy > local_acc:
            local_acc = accuracy
            local_lr = lr

        if accuracy > best_acc:
            best_acc = accuracy
            best_nest = nest
            best_lr = lr
    
    print(f'n_estimator = {nest}, learning_rate = {local_lr}, best accuracy = {local_acc} ')

print(f'Result : best n_estimator = {best_nest}, best learning_rate = {best_lr}, best accuracy = {best_acc} ')
print('---------------------------------------------------------------')




param = {}

param['n_estimators'] = best_nest # after tunning
param['max_depth'] = 2
param['learning_rate'] = best_lr # after tunning

xgb = XGBClassifier(**param)
xgb_model = xgb.fit(X_re,y_re)

Y_pred = xgb_model.predict(X_test)
predictions = [round(value) for value in Y_pred]

print('Result for XGboost after grid search (remove_MBTI = False, Model = XGboost, data = SMOTE)')
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

precision_mi = precision_score(y_test, predictions, average='micro')
print("Micro Precision: %.2f%%" % (precision_mi * 100.0))
precision_ma = precision_score(y_test, predictions, average='macro')
print("Macro Precision: %.2f%%" % (precision_ma * 100.0))

recall_mi = recall_score(y_test, predictions, average='micro')
print("Micro Recall: %.2f%%" % (recall_mi * 100.0))
recall_ma = recall_score(y_test, predictions, average='macro')
print("Macro Recall: %.2f%%" % (recall_ma * 100.0))

f1score_mi = f1_score(y_test, predictions, average='micro')
print("Micro F1score: %.2f%%" % (f1score_mi * 100.0))
f1score_ma = f1_score(y_test, predictions, average='macro')
print("Macro F1score: %.2f%%" % (f1score_ma * 100.0))

  y = column_or_1d(y, warn=True)


n_estimator = 100, learning_rate = 0.2, best accuracy = 0.6192804750261963 
n_estimator = 200, learning_rate = 0.18, best accuracy = 0.6398882291302829 
n_estimator = 300, learning_rate = 0.2, best accuracy = 0.6465246245197346 
n_estimator = 400, learning_rate = 0.15, best accuracy = 0.6545581557806497 
n_estimator = 500, learning_rate = 0.2, best accuracy = 0.6531610199091862 
n_estimator = 600, learning_rate = 0.18, best accuracy = 0.6521131680055885 
n_estimator = 700, learning_rate = 0.2, best accuracy = 0.6521131680055885 
n_estimator = 800, learning_rate = 0.12, best accuracy = 0.650716032134125 
n_estimator = 900, learning_rate = 0.1, best accuracy = 0.6503667481662592 
n_estimator = 1000, learning_rate = 0.1, best accuracy = 0.6493188962626616 
Result : best n_estimator = 400, best learning_rate = 0.15, best accuracy = 0.6545581557806497 
---------------------------------------------------------------
Result for XGboost after grid search (remove_MBTI = False, Model = XGboost, 

## Part 3 : 4 models (remove_MBTI = True)

In [5]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
def get_types(row):
    t=row['type']

    I = 0; N = 0
    T = 0; J = 0
    
    if t[0] == 'I': I = 1
    elif t[0] == 'E': I = 0
    else: print('I-E not found') 
        
    if t[1] == 'N': N = 1
    elif t[1] == 'S': N = 0
    else: print('N-S not found')
        
    if t[2] == 'T': T = 1
    elif t[2] == 'F': T = 0
    else: print('T-F not found')
        
    if t[3] == 'J': J = 1
    elif t[3] == 'P': J = 0
    else: print('J-P not found')
    return pd.Series( {'IE':I, 'NS':N , 'TF': T, 'JP': J }) 

data = data.join(data.apply (lambda row: get_types (row),axis=1))
data

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1
1,ENTP,'I'm finding the lack of me in these posts ver...,0,1,1,0
2,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1
4,ENTJ,'You're fired.|||That's another silly misconce...,0,1,1,1
...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,1,0,0,0
8671,ENFP,'So...if this thread already exists someplace ...,0,1,0,0
8672,INTP,'So many questions when i do these things. I ...,1,1,1,0
8673,INFP,'I am very conflicted right now when it comes ...,1,1,0,0


In [6]:
print ("Introversion (I) /  Extroversion (E):\t", data['IE'].value_counts()[0], " / ", data['IE'].value_counts()[1])
print ("Intuition (N) / Sensing (S):\t\t", data['NS'].value_counts()[0], " / ", data['NS'].value_counts()[1])
print ("Thinking (T) / Feeling (F):\t\t", data['TF'].value_counts()[0], " / ", data['TF'].value_counts()[1])
print ("Judging (J) / Perceiving (P):\t\t", data['JP'].value_counts()[0], " / ", data['JP'].value_counts()[1])

Introversion (I) /  Extroversion (E):	 1999  /  6676
Intuition (N) / Sensing (S):		 1197  /  7478
Thinking (T) / Feeling (F):		 4694  /  3981
Judging (J) / Perceiving (P):		 5241  /  3434


In [7]:
# Splitting the MBTI personality into 4 letters and binarizing it

b_Pers = {'I':0, 'E':1, 'N':0, 'S':1, 'F':0, 'T':1, 'J':0, 'P':1}
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    return [b_Pers[l] for l in personality]

#To show result output for personality prediction
def translate_back(personality):
    # transform binary vector to mbti personality
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s

list_personality_bin = np.array([translate_personality(p) for p in data.type])
print("Binarize MBTI list: \n%s" % list_personality_bin)

Binarize MBTI list: 
[[0 0 0 0]
 [1 0 1 1]
 [0 0 1 1]
 ...
 [0 0 1 1]
 [0 0 0 1]
 [0 0 0 1]]


In [8]:
list_personality=[]
for row in data.iterrows():
    type_labelized = translate_personality(row[1].type) #or use lab_encoder.transform([row[1].type])[0]
    list_personality.append(type_labelized)
list_personality = np.array(list_personality)
list_personality.shape

(8675, 4)

In [9]:
personality_type = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) / Sensing (S)", 
                   "FT: Feeling (F) / Thinking (T)", "JP: Judging (J) / Perceiving (P)"  ]

for l in range(len(personality_type)):
    print(personality_type[l])

IE: Introversion (I) / Extroversion (E)
NS: Intuition (N) / Sensing (S)
FT: Feeling (F) / Thinking (T)
JP: Judging (J) / Perceiving (P)


In [17]:
X_tfidf.shape

(8675, 77959)

#### data = raw data

In [19]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
X = svd.fit_transform(X_tfidf)



In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# XGBoost Model create
for l in range(len(personality_type)):
    
    Y = list_personality[:,l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

    # fit model on training data
    param = {}

    param['n_estimators'] = 700 # after tunning
    param['max_depth'] = 2
    param['learning_rate'] = 0.12 # after tunning

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))

IE: Introversion (I) / Extroversion (E) Accuracy: 77.65%
NS: Intuition (N) / Sensing (S) Accuracy: 85.47%
FT: Feeling (F) / Thinking (T) Accuracy: 75.69%
JP: Judging (J) / Perceiving (P) Accuracy: 65.91%


#### data = SMOTE

In [45]:
# XGBoost Model create
for l in range(len(personality_type)):
    
    Y = list_personality[:,l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

    # SMOTE
    from imblearn.over_sampling import SMOTE
    X_re, y_re = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # fit model on training data
    param = {}

    param['n_estimators'] = 1000 # after tunning
    param['max_depth'] = 2
    param['learning_rate'] = 0.2 # after tunning

    model = XGBClassifier(**param)
    model.fit(X_re, y_re)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))

IE: Introversion (I) / Extroversion (E) Accuracy: 72.20%
NS: Intuition (N) / Sensing (S) Accuracy: 79.88%
FT: Feeling (F) / Thinking (T) Accuracy: 75.83%
JP: Judging (J) / Perceiving (P) Accuracy: 64.41%


In [1]:

x = [1,2,3,4]
y = [1,2,2,4]
accuracy = accuracy_score(x, y)
accuracy

0.75

## Part 4 : Combine 4 models (remove_MBTI = True)

#### remove_MBTI = True, data = raw data

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(X, list_personality, test_size=0.33, random_state=42)


final_pred = np.zeros((X_test.shape[0], 4)) # 紀錄 4 個 model 分別預測病combine後的 MBTI 結果

# XGBoost Model create
for l in range(len(personality_type)):
    
    Y = y_train[:,l]


    # fit model on training data
    param = {}

    param['n_estimators'] = 700 # after tunning
    param['max_depth'] = 2
    param['learning_rate'] = 0.12 # after tunning

    model = XGBClassifier(**param)
    model.fit(X_train, y_train[:,l])

    # make predictions for test data
    y_pred = model.predict(X_test)

    # 將四個預測結果合併成最後 MBTI 結果
    for j in range(len(y_pred)):
        final_pred[j][l] = y_pred[j]


    # predictions = [round(value) for value in y_pred]
    # # evaluate predictions
    # accuracy = accuracy_score(y_test, predictions)
    
    print(personality_type[l])
    print(y_pred)

accuracy = accuracy_score(y_test, final_pred)
print(f'The final Accuracy is : {accuracy}')

IE: Introversion (I) / Extroversion (E)
[0 0 0 ... 0 0 0]
NS: Intuition (N) / Sensing (S)
[0 0 0 ... 0 0 0]
FT: Feeling (F) / Thinking (T)
[1 1 1 ... 0 0 0]
JP: Judging (J) / Perceiving (P)
[1 1 1 ... 0 1 1]
The final Accuracy is : 0.34299685644428923


#### remove_MBTI = True, data = SMOTE

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(X, list_personality, test_size=0.33, random_state=42)


final_pred = np.zeros((X_test.shape[0], 4)) # 紀錄 4 個 model 分別預測病combine後的 MBTI 結果

# XGBoost Model create
for l in range(len(personality_type)):
    
    Y = y_train[:,l]


    # SMOTE
    from imblearn.over_sampling import SMOTE
    X_re, y_re = SMOTE(random_state=42).fit_resample(X_train, Y)
    # fit model on training data
    param = {}

    param['n_estimators'] = 1000 # after tunning
    param['max_depth'] = 2
    param['learning_rate'] = 0.2 # after tunning

    model = XGBClassifier(**param)
    model.fit(X_re, y_re)

    # make predictions for test data
    y_pred = model.predict(X_test)

    # 將四個預測結果合併成最後 MBTI 結果
    for j in range(len(y_pred)):
        final_pred[j][l] = y_pred[j]


    # predictions = [round(value) for value in y_pred]
    # # evaluate predictions
    # accuracy = accuracy_score(y_test, predictions)
    
    print(personality_type[l])
    print(y_pred)

accuracy = accuracy_score(y_test, final_pred)
print(f'The final Accuracy is : {accuracy}')

IE: Introversion (I) / Extroversion (E)
[0 0 0 ... 0 1 0]
NS: Intuition (N) / Sensing (S)
[0 0 0 ... 0 0 0]
FT: Feeling (F) / Thinking (T)
[1 1 1 ... 0 0 0]
JP: Judging (J) / Perceiving (P)
[1 1 1 ... 0 0 1]
The final Accuracy is : 0.2888578414250786
