# Text Preprocessing Notes

In [None]:
# Data Analysis
import re
import numpy as np
import pandas as pd

# Text Processing 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Other
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# NLTK resource
import nltk
# nltk.download('punkt')  # for word_tokenize
# nltk.download('wordnet')  # for WordNetLemmatizer
# nltk.download('omw-1.4')
# nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kanko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kanko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kanko/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kanko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the Kaggle MBTI dataset from a relative path and show its (rows, columns) shape
data = pd.read_csv('../../data/Kaggle_MBTI.csv')
data.shape

(8675, 2)

In [None]:
# Copy data for explaining text preprocessing
# (the columns added in later cells go onto this copy, not onto `data`)
data_copy = data.copy()

### Cleaning
- 簡單地將分隔符、超連結、含撇號的縮寫(例如 you've、there's)以及符號替換為空格(此步驟不會合併多餘的空格)

In [None]:
def getCleanPost(text):
	'''
	Blank out noise in a raw post string.

	Input: str
	Output: str
		The text with every match of the patterns below replaced by one space.
		Runs of spaces are NOT collapsed and letter case is kept.

	The patterns are applied in order, so hyperlinks and contractions are
	removed as whole units before the final pass strips remaining symbols.
	'''
	noise_patterns = (
		r'\|\|\|',            # Post separator
		r'http\S+',           # Hyperlink
		r"[A-Za-z]+\'+\w+",   # Word containing an apostrophe (e.g. you've, there's)
		'[^0-9a-zA-Z]',       # Anything that is not an ASCII letter or digit
	)
	cleaned = text
	for pattern in noise_patterns:
		cleaned = re.sub(pattern, ' ', cleaned)
	return cleaned

#### Explanation

In [None]:
'''
Example of getCleanPost.
input: Top 520 words in data.posts[0]
output: getCleanPost(input)
'''
# NOTE: the slice below takes the first 520 characters of the string, not 520 words
origi_sentence = data.posts[0][0:520]
clean_sentence = getCleanPost(origi_sentence)

# \033[96m and \033[94m are ANSI color escapes (bright cyan / bright blue); no reset code is printed
print('\033[96mBefore cleaning:\n',origi_sentence,'\n')

print('\033[94mAfter cleaning:\n',clean_sentence)

[96mBefore cleaning:
 'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend pos 

[94mAfter cleaning:
      enfp and intj moments     sportscenter not top ten plays     pranks What has been the most life changing experience in your life         On repeat for most of today  May the PerC Experience immerse you  The last thing my INFJ friend pos


#### Apply

In [None]:
# Apply getCleanPost to all training data
tqdm.pandas()  # Progress bar (makes progress_apply available on pandas objects)
# Store the cleaned string of every row in a new 'posts_clean' column
data_copy['posts_clean'] = data_copy['posts'].progress_apply(getCleanPost)
data_copy

100%|██████████| 8675/8675 [00:07<00:00, 1126.97it/s]


Unnamed: 0,type,posts,preprocessed,posts_clean
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[enfp, intj, moment, sportscent, top, ten, pla...",enfp and intj moments sportscenter no...
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, alarm, sex, bore, posit, of...",finding the lack of me in these posts very ...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, abso...",Good one Of course to which I say...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,...",Dear INTP I enjoyed our conversation the o...
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log...",fired another silly misconception That ...
...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,"[ixfp, alway, think, cat, fi, dom, reason, esp...",IxFP just because I always think of cats as...
8671,ENFP,'So...if this thread already exists someplace ...,"[thread, alreadi, exist, someplac, el, heck, d...",So if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...,"[mani, question, thing, would, take, purpl, pi...",So many questions when i do these things I ...
8673,INFP,'I am very conflicted right now when it comes ...,"[conflict, right, come, want, child, honest, m...",I am very conflicted right now when it comes ...


### Tokenization & Remove stop words
- 轉小寫
- 切詞
- 移除 Stop words

#### getCleanToken()

In [None]:
# Stop word list
# Notebook-level list read by getCleanToken and by the explanation cell below
stop_words = stopwords.words('english')
print('Stop words\n',stop_words)

Stop words
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [None]:
def getCleanToken(text):
	'''
	Clean a raw post string, lowercase it, tokenize it and drop stop words.

	Input: str
	Output: list
		Lowercased tokens that are not in the notebook-level `stop_words` list.

	The cleaning here is similar to getCleanPost but not identical:
	hyperlinks are deleted (replaced by '' rather than ' '), words with an
	apostrophe are not removed as a unit (the apostrophe just becomes a
	space), and runs of spaces are collapsed.
	'''
	# Cleaning
	text = re.sub(r'\|\|\|', r' ', text)  # Split by separator
	text = re.sub(r'http\S+', r'', text)  # Delete hyperlink
	text = re.sub('[^0-9a-zA-Z]',' ', text)  # Keep only numbers and alphabets
	text = re.sub(' +', ' ', text)  # Collapse repeated spaces
	# Add "Tokenization" and remove stopword
	text = text.lower()  # The stop word list is lowercase, so lowercase before filtering
	tokens = word_tokenize(text)
	# Build the lookup set once per call: testing each token against the list is a linear scan
	stop_set = set(stop_words)
	return [w for w in tokens if w not in stop_set]

#### Explanation

In [None]:
'''
Example of the added part in getCleanToken.
Referred to the paragraph # Add "Tokenization" and remove stopword
input: getCleanPost(user #1228)
output: getCleanToken(user #1228)
'''
# NOTE(review): this walkthrough starts from getCleanPost output and never lowercases it,
# whereas getCleanToken lowercases before filtering; the stop word comparison below is
# case-sensitive, so the counts printed here need not match getCleanToken's result.
clean_post = getCleanPost(data_copy.posts[1228])

# Tokenization
tokens = word_tokenize(clean_post)
print(f'Original: {len(tokens)} tokens\n')

# Stop words
filtered_tokens = [w for w in tokens if not w in stop_words]
print(f'After removing stop words: {len(filtered_tokens)} tokens\n')

# Check removed words
# Set difference: distinct tokens that appear before filtering but not after
print(f'Removed words: {list(set(tokens).difference(set(filtered_tokens)))}')

Original: 829 tokens

After removing stop words: 491 tokens

Removed words: ['just', 'do', 'of', 'and', 'so', 'he', 'them', 'while', 'when', 'as', 'in', 'not', 'at', 'with', 'each', 'it', 'some', 'other', 'on', 'only', 'yours', 'a', 'how', 'have', 'will', 'your', 'few', 'or', 'about', 'is', 'which', 'me', 'o', 'any', 'why', 'once', 'here', 'up', 'because', 'this', 'they', 'did', 'him', 'am', 'all', 'has', 'to', 'who', 'very', 'more', 'what', 'are', 'were', 'an', 'you', 'too', 'his', 'no', 'be', 'then', 'we', 'out', 'where', 'can', 'that', 'if', 'their', 'from', 'for', 'my', 'her', 'now', 'the', 'those']


#### Apply

In [None]:
# Apply getCleanToken to all training data
tqdm.pandas()  # Progress bar
# Runs on the raw 'posts' column (not on 'posts_clean'); each row becomes a list of tokens
data_copy['tokens_clean'] = data_copy['posts'].progress_apply(getCleanToken)

100%|██████████| 8675/8675 [00:55<00:00, 157.02it/s]


In [None]:
# Display the frame, now including the 'tokens_clean' column
data_copy

Unnamed: 0,type,posts,preprocessed,posts_clean,tokens_clean
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[enfp, intj, moment, sportscent, top, ten, pla...",enfp and intj moments sportscenter no...,"[enfp, intj, moments, sportscenter, top, ten, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, alarm, sex, bore, posit, of...",finding the lack of me in these posts very ...,"[finding, lack, posts, alarming, sex, boring, ..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, abso...",Good one Of course to which I say...,"[good, one, course, say, know, blessing, curse..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,...",Dear INTP I enjoyed our conversation the o...,"[dear, intp, enjoyed, conversation, day, esote..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log...",fired another silly misconception That ...,"[fired, another, silly, misconception, approac..."
...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,"[ixfp, alway, think, cat, fi, dom, reason, esp...",IxFP just because I always think of cats as...,"[ixfp, always, think, cats, fi, doms, reason, ..."
8671,ENFP,'So...if this thread already exists someplace ...,"[thread, alreadi, exist, someplac, el, heck, d...",So if this thread already exists someplace ...,"[thread, already, exists, someplace, else, hec..."
8672,INTP,'So many questions when i do these things. I ...,"[mani, question, thing, would, take, purpl, pi...",So many questions when i do these things I ...,"[many, questions, things, would, take, purple,..."
8673,INFP,'I am very conflicted right now when it comes ...,"[conflict, right, come, want, child, honest, m...",I am very conflicted right now when it comes ...,"[conflicted, right, comes, wanting, children, ..."


#### Compare

In [None]:
# Statistics
data_copy['Words count after getCleanPost'] = data_copy['posts_clean'].apply(lambda n: len(n.split()))
data_copy['Words count after getCleanToken'] = data_copy['tokens_clean'].str.len()
data_copy

Unnamed: 0,type,posts,preprocessed,posts_clean,tokens_clean,Words count after getCleanPost,Words count after getCleanToken
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[enfp, intj, moment, sportscent, top, ten, pla...",enfp and intj moments sportscenter no...,"[enfp, intj, moments, sportscenter, top, ten, ...",570,318
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, alarm, sex, bore, posit, of...",finding the lack of me in these posts very ...,"[finding, lack, posts, alarming, sex, boring, ...",1166,572
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, abso...",Good one Of course to which I say...,"[good, one, course, say, know, blessing, curse...",841,445
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,...",Dear INTP I enjoyed our conversation the o...,"[dear, intp, enjoyed, conversation, day, esote...",1068,542
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log...",fired another silly misconception That ...,"[fired, another, silly, misconception, approac...",987,484
...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,"[ixfp, alway, think, cat, fi, dom, reason, esp...",IxFP just because I always think of cats as...,"[ixfp, always, think, cats, fi, doms, reason, ...",798,406
8671,ENFP,'So...if this thread already exists someplace ...,"[thread, alreadi, exist, someplac, el, heck, d...",So if this thread already exists someplace ...,"[thread, already, exists, someplace, else, hec...",1329,630
8672,INTP,'So many questions when i do these things. I ...,"[mani, question, thing, would, take, purpl, pi...",So many questions when i do these things I ...,"[many, questions, things, would, take, purple,...",978,503
8673,INFP,'I am very conflicted right now when it comes ...,"[conflict, right, come, want, child, honest, m...",I am very conflicted right now when it comes ...,"[conflicted, right, comes, wanting, children, ...",1684,778


In [None]:
# Summary statistics; the output below covers the two word-count columns
data_copy.describe()

Unnamed: 0,Words count after getCleanPost,Words count after getCleanToken
count,8675.0,8675.0
mean,1231.20196,614.928646
std,306.412055,142.514306
min,4.0,3.0
25%,1055.5,536.0
50%,1283.0,640.0
75%,1456.0,719.0
max,1880.0,927.0


### 2.3: Stemming and Lemmatization
- 比較 PorterStemmer 與 SnowballStemmer 的結果
- 用 WordNetLemmatizer 進行 Lemmatization

#### Preprocessor()

In [None]:
def Preprocessor(text:str, stemmer: str='Snowball', remove_mbti: bool=False) -> list:
	'''
	Clean, tokenize, stem and lemmatize a raw post string.

	Input: str
	Output: list
		Preprocessed tokens
	stemmer: str
		Can be 'Snowball' or 'Porter' (case-insensitive). Default is Snowball.
	remove_mbti: bool
		Remove MBTI keywords like INTJ, ENFP, etc. Default is False.(Keep MBTI keywords.)
	Raises: ValueError
		If stemmer is neither 'Snowball' nor 'Porter'.
	'''
	# Validate first so a bad argument fails before any text processing is done
	stemmer_name = stemmer.lower() if isinstance(stemmer, str) else None
	if stemmer_name not in ('snowball', 'porter'):
		raise ValueError("Please check passed argument: stemmer must be 'Snowball' or 'Porter'")
	# Cleaning
	text = re.sub(r'\|\|\|', ' ', text)  # Split by separator
	text = re.sub(r'http\S+', ' ', text)  # Replace hyperlink
	text = re.sub(r"[A-Za-z]+\'+\w+", ' ', text)  # Handling apostrophe (e.g. you've, there's)
	text = re.sub('[^0-9a-zA-Z]',' ', text)  # Keep only numbers and alphabets (remove special characters)
	text = text.lower()
	if remove_mbti:
		# The pattern has no word boundaries, so a type code is also removed from inside a longer word
		text = re.sub('intj|intp|entj|entp|infp|enfj|enfp|istj|isfj|estj|esfj|istp|isfp|estp|esfp|infj', '', text)
	# Tokenization
	tokens = word_tokenize(text)
	# Load the stop word list once per call instead of once per token
	stop_set = set(stopwords.words('english'))
	filtered_tokens = [w for w in tokens if w not in stop_set]  # Remove stopwords
	# Stemming
	if stemmer_name == 'porter':
		stemmer_ = PorterStemmer()
	else:
		stemmer_ = SnowballStemmer("english")
	stemmed = [stemmer_.stem(t) for t in filtered_tokens]
	# Lemmatizing (applied to the stemmed tokens)
	lemma = WordNetLemmatizer()
	return [lemma.lemmatize(t) for t in stemmed]

#### Explanation

In [None]:
'''
Example of the added part in Preprocessor.
Referred to the paragraph: # Add "Stemming" and "Lemmatization"
input: getCleanToken(user #1228)
output: Preprocessor(user #1228)
'''
# Start from the stop-word-filtered tokens of one user's posts
clean_token = getCleanToken(data_copy.posts[1228])
# Initiate
stemmer_ps = PorterStemmer()
stemmer_ss = SnowballStemmer("english") 
lemma = WordNetLemmatizer()
# Stemming
# Each list keeps the order and length of clean_token (one output per input token)
stemmed_ps = [stemmer_ps.stem(t) for t in clean_token]
stemmed_ss = [stemmer_ss.stem(t) for t in clean_token]
# Lemmatizing
# The lemmatizer runs on the stemmed tokens, not on the original words
lemmatized_ps = [lemma.lemmatize(t) for t in stemmed_ps]
lemmatized_ss = [lemma.lemmatize(t) for t in stemmed_ss]

#### Compare different Stemmer

In [None]:
# Compare different 'Stemmer' and 'Lemmatizer'
df_stle = pd.DataFrame(
          list(zip(clean_token, stemmed_ps, stemmed_ss, lemmatized_ps, lemmatized_ss)),
          columns =['Original(clean_token)', 'PorterStemmer', 'SnowballStemmer', 'Lemma with PorterStemmer', 'Lemma with SnowballStemmer']) 
df_stle.head(10)

Unnamed: 0,Original(clean_token),PorterStemmer,SnowballStemmer,Lemma with PorterStemmer,Lemma with SnowballStemmer
0,mandarin,mandarin,mandarin,mandarin,mandarin
1,speakers,speaker,speaker,speaker,speaker
2,receive,receiv,receiv,receiv,receiv
3,education,educ,educ,educ,educ
4,canada,canada,canada,canada,canada
5,since,sinc,sinc,sinc,sinc
6,13,13,13,13,13
7,thanks,thank,thank,thank,thank
8,bellisaurius,bellisauriu,bellisaurius,bellisauriu,bellisaurius
9,appreciate,appreci,appreci,appreci,appreci


In [None]:
# Keep only the rows where the two stemmers produced different stems
diff_result = df_stle.query('PorterStemmer != SnowballStemmer')
# The backslash continuations are inside the string literal, so the leading
# spaces of each continued line are part of the printed message
print(f'The PorterStemmer and SnowballStemmer has\
  {diff_result.shape[0]} / {df_stle.shape[0]}\
  different tokens in user #1228\'s posts.')
diff_result

The PorterStemmer and SnowballStemmer has  15 / 444  different tokens in user #1228's posts.


Unnamed: 0,Original(clean_token),PorterStemmer,SnowballStemmer,Lemma with PorterStemmer,Lemma with SnowballStemmer
8,bellisaurius,bellisauriu,bellisaurius,bellisauriu,bellisaurius
10,kindly,kindli,kind,kindli,kind
41,yes,ye,yes,ye,yes
46,yes,ye,yes,ye,yes
157,yes,ye,yes,ye,yes
161,saurus,sauru,saurus,sauru,saurus
291,dos,do,dos,do,do
304,pros,pro,pros,pro,pro
318,exactly,exactli,exact,exactli,exact
382,dos,do,dos,do,do


### 2.4: 各步驟的比較

In [None]:
# Show the frame without the two word-count columns.
# Dropping by name keeps this cell correct regardless of how many columns the
# loaded CSV has; drop() returns a new frame, so data_copy itself is unchanged.
data_copy.drop(columns=['Words count after getCleanPost', 'Words count after getCleanToken'])

Unnamed: 0,type,posts,preprocessed,posts_clean,tokens_clean
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[enfp, intj, moment, sportscent, top, ten, pla...",enfp and intj moments sportscenter no...,"[enfp, intj, moments, sportscenter, top, ten, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, alarm, sex, bore, posit, of...",finding the lack of me in these posts very ...,"[finding, lack, posts, alarming, sex, boring, ..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, abso...",Good one Of course to which I say...,"[good, one, course, say, know, blessing, curse..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,...",Dear INTP I enjoyed our conversation the o...,"[dear, intp, enjoyed, conversation, day, esote..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log...",fired another silly misconception That ...,"[fired, another, silly, misconception, approac..."
...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,"[ixfp, alway, think, cat, fi, dom, reason, esp...",IxFP just because I always think of cats as...,"[ixfp, always, think, cats, fi, doms, reason, ..."
8671,ENFP,'So...if this thread already exists someplace ...,"[thread, alreadi, exist, someplac, el, heck, d...",So if this thread already exists someplace ...,"[thread, already, exists, someplace, else, hec..."
8672,INTP,'So many questions when i do these things. I ...,"[mani, question, thing, would, take, purpl, pi...",So many questions when i do these things I ...,"[many, questions, things, would, take, purple,..."
8673,INFP,'I am very conflicted right now when it comes ...,"[conflict, right, come, want, child, honest, m...",I am very conflicted right now when it comes ...,"[conflicted, right, comes, wanting, children, ..."


#### Random example

In [None]:
# The slice takes the first 800 characters of the raw post string (not 800 words)
print(f'Input (first 800 characters):\n{data_copy.posts[0][:800]}...')

Input (800 words):
'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times ...


In [None]:
# Run the full pipeline on the same post with the default arguments (Snowball stemmer, MBTI keywords kept)
print(f'Output:\n{Preprocessor(data_copy.posts[0])}')

Output:
['enfp', 'intj', 'moment', 'sportscent', 'top', 'ten', 'play', 'prank', 'life', 'chang', 'experi', 'life', 'repeat', 'today', 'may', 'perc', 'experi', 'immers', 'last', 'thing', 'infj', 'friend', 'post', 'facebook', 'commit', 'suicid', 'next', 'day', 'rest', 'peac', 'hello', 'enfj7', 'sorri', 'hear', 'distress', 'natur', 'relationship', 'perfect', 'time', 'everi', 'moment', 'exist', 'tri', 'figur', 'hard', 'time', 'time', 'growth', '84389', '84390', 'welcom', 'stuff', 'game', 'set', 'match', 'prozac', 'wellbrutin', 'least', 'thirti', 'minut', 'move', 'leg', 'mean', 'move', 'sit', 'desk', 'chair', 'weed', 'moder', 'mayb', 'tri', 'edibl', 'healthier', 'altern', 'basic', 'come', 'three', 'item', 'determin', 'type', 'whichev', 'type', 'want', 'would', 'like', 'use', 'given', 'type', 'cognit', 'function', 'whatnot', 'left', 'thing', 'moder', 'sim', 'inde', 'video', 'game', 'good', 'one', 'note', 'good', 'one', 'somewhat', 'subject', 'complet', 'promot', 'death', 'given', 'sim', 'dea