# EDA and Text Preprocessing

## Import module

In [60]:
# Basic
import re
import numpy as np
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Machine Learning
from sklearn.model_selection import train_test_split
# Text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
# Other
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

## 1: EDA

In [61]:
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data.shape

(8675, 2)

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [63]:
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [64]:
# Show some posts of the first user
user1post = [post.split('|||') for post in data.head(1).posts.values]  # list
user1post_random = pd.DataFrame(
	data=np.array(user1post).transpose(),
	columns=['post']).sample(10)
user1post_random.style.set_table_styles([
	{'selector': 'th', 'props': [('text-align', 'left')]},
	{'selector': 'td', 'props': [('text-align', 'left')]}
	], overwrite=False)

Unnamed: 0,post
44,http://www.youtube.com/watch?v=w8IgImn57aQ
27,http://www.youtube.com/watch?v=4V2uYORhQOk
11,"Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative..."
6,The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~ http://vimeo.com/22842206
37,http://www.youtube.com/watch?v=ZRCEq_JFeFM
41,Not all artists are artists because they draw. It's the idea that counts in forming something of your own... like a signature.
33,http://www.youtube.com/watch?v=IRcrv41hgz4
28,http://www.youtube.com/watch?v=SlVmgFQQ0TI
45,"Banned for being too much of a thundering, grumbling kind of storm... yep."
38,http://discovermagazine.com/2012/jul-aug/20-things-you-didnt-know-about-deserts/desert.jpg


In [65]:
# There are many records contains URL
search_URL = data.loc[data['posts'].str.contains("www", case=True)]
search_URL

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8667,ENTP,'I think generally people experience post trau...
8669,INFJ,'I'm not sure about a method for picking out I...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8672,INTP,'So many questions when i do these things. I ...


In [66]:
color = px.colors.sequential.Sunset_r
df_count = data['type'].value_counts()
df_count = df_count.rename_axis('type').reset_index(name='counts')
fig = px.bar(df_count, x='type', y='counts',
color='type', color_discrete_sequence=color,
title='Type count',
width=1000,
height=600)
fig.show()

In [67]:
# Stratify split to ensure equal distribution of data
train_data, test_data = train_test_split(data,
	test_size=0.2,
	random_state=42,
	stratify=data.type)

In [68]:
train_data

Unnamed: 0,type,posts
1228,INFP,'We are mandarin speakers. He receive educati...
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ..."
6756,ENFJ,'That's the only one I haven't gotten to read ...
1662,INFP,'I used to think that maturity was burning bri...
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...
...,...,...
7292,INFP,Haven't posted here in a while. Here was my at...
1086,INFP,"'Ok, I'll go first. I'm a 29 year old INFP mal..."
7435,ENTJ,"'I have dated a few INFJs, including my curren..."
1843,INTP,'People who are unable to replace social norms...


In [69]:
df_train_count = train_data['type'].value_counts().rename_axis('type').reset_index(name='counts')
df_test_count = test_data['type'].value_counts().rename_axis('type').reset_index(name='counts')

In [70]:
df_train_count

Unnamed: 0,type,counts
0,INFP,1465
1,INFJ,1176
2,INTP,1043
3,INTJ,873
4,ENTP,548
5,ENFP,540
6,ISTP,270
7,ISFP,217
8,ENTJ,185
9,ISTJ,164


In [71]:
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=df_train_count['type'], values=df_train_count['counts'], name='Train'),
              1, 1)
fig.add_trace(go.Pie(labels=df_test_count['type'], values=df_test_count['counts'], name='Test'),
              1, 2)

fig.update_traces(hole=.4, hovertemplate='Type: %{label}<br>Count: %{value}', marker_colors=color)
fig.update_layout(
    title_text = "Train_Test Split",
    annotations = [dict(text='Train', x=0.2, y=0.5, font_size=18, showarrow=False),
                 dict(text='Test', x=0.795, y=0.5, font_size=18, showarrow=False)])
fig.show()

In [72]:
# fig = px.pie(train_data, names='type', title='Train data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()

In [73]:
# fig = px.pie(test_data, names='type', title='Test data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()

## 2: Text Preprocessing

Todo:
- 要考慮：目前有去除冒號跟井字號，但有些可能是 emoji 或 hashtag （例如：":happy:"、"#mood")
- 目前是把超連結替換成 URL，也可以直接拔掉
- 還沒做 stop words 包含 MBTI 的
- 目前有把 porter stemmer 當參數之一，看要不要刪掉，也可以跟 SnowballStemmer 一起訓練，看結果有沒有差

### 2.1: Cleaning
- 簡單的去除分隔符、超連結、符號、多餘的空格

#### getCleanPost()

In [74]:
def getCleanPost(text):
	text = re.sub(r'\|\|\|', r' ', text)  # Remove splitter
	text = re.sub(r'http\S+', r'URL', text)  # Replace hyperlinks with "URL"
	text = re.sub('[^0-9a-zA-Z]',' ', text)  # Keep only words
	text = re.sub(' +', ' ', text)  # Remove redundant space
	return text

#### Function explanation

In [75]:
'''
Example of getCleanPost.
input: Top 520 words in data.posts[0]
output: getCleanPost(input)
'''
origi_sentence = data.posts[0][0:520]
clean_sentence = getCleanPost(origi_sentence)

print('\033[96mBefore cleaning:\n',origi_sentence,'\n')

print('\033[94mAfter cleaning:\n',clean_sentence)

[96mBefore cleaning:
 'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend pos 

[94mAfter cleaning:
  URL URL enfp and intj moments URL sportscenter not top ten plays URL pranks What has been the most life changing experience in your life URL URL On repeat for most of today May the PerC Experience immerse you The last thing my INFJ friend pos


#### Apply

In [76]:
# Apply getCleanPost to all training data
train_data_copy = train_data.copy()
tqdm.pandas()  # Progress bar
train_data_copy['posts_clean'] = train_data_copy['posts'].progress_apply(getCleanPost)
train_data_copy

  0%|          | 0/6940 [00:00<?, ?it/s]

Unnamed: 0,type,posts,posts_clean
1228,INFP,'We are mandarin speakers. He receive educati...,We are mandarin speakers He receive education...
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",Nope Not now not ever I m too busy with work ...
6756,ENFJ,'That's the only one I haven't gotten to read ...,That s the only one I haven t gotten to read ...
1662,INFP,'I used to think that maturity was burning bri...,I used to think that maturity was burning bri...
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...,I get typed as both a 4w5 and 5w6 as well but...
...,...,...,...
7292,INFP,Haven't posted here in a while. Here was my at...,Haven t posted here in a while Here was my att...
1086,INFP,"'Ok, I'll go first. I'm a 29 year old INFP mal...",Ok I ll go first I m a 29 year old INFP male ...
7435,ENTJ,"'I have dated a few INFJs, including my curren...",I have dated a few INFJs including my current...
1843,INTP,'People who are unable to replace social norms...,People who are unable to replace social norms...


### 2.2: Tokenization & Remove stop words
- 轉小寫
- 切詞
- 移除 Stop words

#### getCleanToken()

In [77]:
# Stop word list
stop_words = stopwords.words('english')
print('Stop words\n',stop_words)

Stop words
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [78]:
def getCleanToken(text):
	# getCleanPost
	text = re.sub(r'\|\|\|', r' ', text)
	text = re.sub(r'http\S+', r'URL', text)
	text = re.sub('[^0-9a-zA-Z]',' ', text)
	text = re.sub(' +', ' ', text)
	# Add "Tokenization" and remove stopword
	text = text.lower()
	tokens = word_tokenize(text)
	filtered_tokens = [w for w in tokens if not w in stop_words]
	return filtered_tokens

#### Function explanation

In [79]:
'''
Example of the added part in getCleanToken.
Referred to the paragraph # Add "Tokenization" and remove stopword
input: getCleanPost(user #1228)
output: getCleanToken(user #1228)
'''
clean_post = getCleanPost(train_data.posts[1228])

# Tokenization
tokens = word_tokenize(clean_post)
print(f'Original: {len(tokens)} tokens\n')

# Stop words
filtered_tokens = [w for w in tokens if not w in stop_words]
print(f'After removing stop words: {len(filtered_tokens)} tokens\n')

# Check removed words
print(f'Removed words: {list(set(tokens).difference(set(filtered_tokens)))}')

Original: 879 tokens

After removing stop words: 508 tokens

Removed words: ['t', 'for', 'who', 'no', 'were', 'which', 'with', 'her', 'few', 'his', 'very', 'he', 's', 'me', 'when', 'those', 'at', 'where', 'what', 'they', 'have', 'on', 'some', 'too', 'do', 'all', 'in', 'not', 'it', 'them', 'their', 'each', 'any', 'we', 'm', 'will', 'your', 'now', 'from', 'why', 'the', 'once', 'has', 'can', 'are', 'how', 'just', 'to', 'don', 'while', 'out', 'that', 'you', 'so', 'yours', 'am', 'o', 'up', 'this', 'other', 'll', 'him', 'a', 'only', 'here', 'as', 'and', 'or', 'of', 'did', 'more', 'an', 'then', 'is', 'if', 'about', 'won', 'because', 'my', 'be']


#### Apply

In [80]:
# Apply getCleanToken to all training data
tqdm.pandas()  # Progress bar
train_data_copy['tokens_clean'] = train_data_copy['posts'].progress_apply(getCleanToken)

  0%|          | 0/6940 [00:00<?, ?it/s]

In [81]:
train_data_copy

Unnamed: 0,type,posts,posts_clean,tokens_clean
1228,INFP,'We are mandarin speakers. He receive educati...,We are mandarin speakers He receive education...,"[mandarin, speakers, receive, education, canad..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",Nope Not now not ever I m too busy with work ...,"[nope, ever, busy, work, causes, adrenaline, r..."
6756,ENFJ,'That's the only one I haven't gotten to read ...,That s the only one I haven t gotten to read ...,"[one, gotten, read, yet, might, pick, one, boo..."
1662,INFP,'I used to think that maturity was burning bri...,I used to think that maturity was burning bri...,"[used, think, maturity, burning, bridges, with..."
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...,I get typed as both a 4w5 and 5w6 as well but...,"[get, typed, 4w5, 5w6, well, like, consider, 4..."
...,...,...,...,...
7292,INFP,Haven't posted here in a while. Here was my at...,Haven t posted here in a while Here was my att...,"[posted, attire, best, man, buddies, wedding, ..."
1086,INFP,"'Ok, I'll go first. I'm a 29 year old INFP mal...",Ok I ll go first I m a 29 year old INFP male ...,"[ok, go, first, 29, year, old, infp, male, int..."
7435,ENTJ,"'I have dated a few INFJs, including my curren...",I have dated a few INFJs including my current...,"[dated, infjs, including, current, partner, 6,..."
1843,INTP,'People who are unable to replace social norms...,People who are unable to replace social norms...,"[people, unable, replace, social, norms, ratio..."


#### Compare

In [82]:
# Statistics
train_data_copy['Words count after getCleanPost'] = train_data_copy['posts_clean'].apply(lambda n: len(n.split()))
train_data_copy['Words count after getCleanToken'] = train_data_copy['tokens_clean'].str.len()
train_data_copy

Unnamed: 0,type,posts,posts_clean,tokens_clean,Words count after getCleanPost,Words count after getCleanToken
1228,INFP,'We are mandarin speakers. He receive educati...,We are mandarin speakers He receive education...,"[mandarin, speakers, receive, education, canad...",879,444
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",Nope Not now not ever I m too busy with work ...,"[nope, ever, busy, work, causes, adrenaline, r...",1299,648
6756,ENFJ,'That's the only one I haven't gotten to read ...,That s the only one I haven t gotten to read ...,"[one, gotten, read, yet, might, pick, one, boo...",1273,571
1662,INFP,'I used to think that maturity was burning bri...,I used to think that maturity was burning bri...,"[used, think, maturity, burning, bridges, with...",1479,678
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...,I get typed as both a 4w5 and 5w6 as well but...,"[get, typed, 4w5, 5w6, well, like, consider, 4...",1142,522
...,...,...,...,...,...,...
7292,INFP,Haven't posted here in a while. Here was my at...,Haven t posted here in a while Here was my att...,"[posted, attire, best, man, buddies, wedding, ...",653,331
1086,INFP,"'Ok, I'll go first. I'm a 29 year old INFP mal...",Ok I ll go first I m a 29 year old INFP male ...,"[ok, go, first, 29, year, old, infp, male, int...",1086,535
7435,ENTJ,"'I have dated a few INFJs, including my curren...",I have dated a few INFJs including my current...,"[dated, infjs, including, current, partner, 6,...",1367,683
1843,INTP,'People who are unable to replace social norms...,People who are unable to replace social norms...,"[people, unable, replace, social, norms, ratio...",720,332


In [83]:
train_data_copy.describe()

Unnamed: 0,Words count after getCleanPost,Words count after getCleanToken
count,6940.0,6940.0
mean,1320.436888,618.429251
std,325.409982,140.09772
min,5.0,4.0
25%,1132.0,539.0
50%,1374.0,643.0
75%,1561.0,721.0
max,1998.0,927.0


### 2.3: Stemming and Lemmatization
- 比較 PorterStemmer 與 SnowballStemmer 的結果
- 用 WordNetLemmatizer 進行 Lemmatization

Preprocessor()

In [84]:
def Preprocessor(text, stemmer='Snowball'):
	# getCleanToken
	text = re.sub(r'\|\|\|', r' ', text)
	text = re.sub(r'http\S+', r'URL', text)
	text = re.sub('[^0-9a-zA-Z]',' ', text)
	text = re.sub(' +', ' ', text)
	text = text.lower()
	tokens = word_tokenize(text)
	filtered_tokens = [w for w in tokens if not w in stop_words]

	# Add "Stemming" and "Lemmatization"
	stemmer = SnowballStemmer("english")  # Initiate
	lemma = WordNetLemmatizer()  # Initiate
	stemmed = [stemmer.stem(t) for t in filtered_tokens]  # Stemming	
	lemmatized = [lemma.lemmatize(t) for t in stemmed]  # Lemmatization

	if stemmer == 'Porter':
		stemmer = PorterStemmer()
		stemmed = [stemmer_ps.stem(t) for t in filtered_tokens]
		lemmatized = [lemma.lemmatize(t) for t in stemmed]

	return lemmatized

#### Function explanation

In [85]:
'''
Example of the added part in Preprocessor.
Referred to the paragraph: # Add "Stemming" and "Lemmatization"
input: getCleanToken(user #1228)
output: Preprocessor(user #1228)
'''
clean_token = getCleanToken(train_data.posts[1228])
# Initiate
stemmer_ps = PorterStemmer()
stemmer_ss = SnowballStemmer("english") 
lemma = WordNetLemmatizer()
# Stemming
stemmed_ps = [stemmer_ps.stem(t) for t in clean_token]
stemmed_ss = [stemmer_ss.stem(t) for t in clean_token]
# Lemmatizing
lemmatized_ps = [lemma.lemmatize(t) for t in stemmed_ps]
lemmatized_ss = [lemma.lemmatize(t) for t in stemmed_ss]

#### Compare different Stemmer

In [86]:
# Compare different Stemmer and Lemmatizer, which 'stle' stands for.
df_stle = pd.DataFrame(
          list(zip(clean_token, stemmed_ps, stemmed_ss, lemmatized_ps, lemmatized_ss)),
          columns =['Original(clean_token)', 'PorterStemmer', 'SnowballStemmer', 'Lemma with PorterStemmer', 'Lemma with SnowballStemmer']) 
df_stle.head(10)

Unnamed: 0,Original(clean_token),PorterStemmer,SnowballStemmer,Lemma with PorterStemmer,Lemma with SnowballStemmer
0,mandarin,mandarin,mandarin,mandarin,mandarin
1,speakers,speaker,speaker,speaker,speaker
2,receive,receiv,receiv,receiv,receiv
3,education,educ,educ,educ,educ
4,canada,canada,canada,canada,canada
5,since,sinc,sinc,sinc,sinc
6,13,13,13,13,13
7,thanks,thank,thank,thank,thank
8,bellisaurius,bellisauriu,bellisaurius,bellisauriu,bellisaurius
9,appreciate,appreci,appreci,appreci,appreci


In [87]:
diff_result = df_stle.query('PorterStemmer != SnowballStemmer')
print(f'The PorterStemmer and SnowballStemmer has\
  {diff_result.shape[0]} / {df_stle.shape[0]}\
  different tokens in user #1228\'s posts.')
diff_result

The PorterStemmer and SnowballStemmer has  15 / 444  different tokens in user #1228's posts.


Unnamed: 0,Original(clean_token),PorterStemmer,SnowballStemmer,Lemma with PorterStemmer,Lemma with SnowballStemmer
8,bellisaurius,bellisauriu,bellisaurius,bellisauriu,bellisaurius
10,kindly,kindli,kind,kindli,kind
41,yes,ye,yes,ye,yes
46,yes,ye,yes,ye,yes
157,yes,ye,yes,ye,yes
161,saurus,sauru,saurus,sauru,saurus
291,dos,do,dos,do,do
304,pros,pro,pros,pro,pro
318,exactly,exactli,exact,exactli,exact
382,dos,do,dos,do,do


#### Apply

In [88]:
# Apply Preprocessor to all training data
tqdm.pandas()  # Progress bar
train_data_copy['preprocessed'] = train_data_copy['posts'].progress_apply(Preprocessor)

  0%|          | 0/6940 [00:00<?, ?it/s]

### 2.4: Result

#### Evolution

In [89]:
train_data_copy.drop(train_data_copy.columns[[4,5]],axis = 1)

Unnamed: 0,type,posts,posts_clean,tokens_clean,preprocessed
1228,INFP,'We are mandarin speakers. He receive educati...,We are mandarin speakers He receive education...,"[mandarin, speakers, receive, education, canad...","[mandarin, speaker, receiv, educ, canada, sinc..."
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",Nope Not now not ever I m too busy with work ...,"[nope, ever, busy, work, causes, adrenaline, r...","[nope, ever, busi, work, caus, adrenalin, rush..."
6756,ENFJ,'That's the only one I haven't gotten to read ...,That s the only one I haven t gotten to read ...,"[one, gotten, read, yet, might, pick, one, boo...","[one, gotten, read, yet, might, pick, one, boo..."
1662,INFP,'I used to think that maturity was burning bri...,I used to think that maturity was burning bri...,"[used, think, maturity, burning, bridges, with...","[use, think, matur, burn, bridg, without, seco..."
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...,I get typed as both a 4w5 and 5w6 as well but...,"[get, typed, 4w5, 5w6, well, like, consider, 4...","[get, type, 4w5, 5w6, well, like, consid, 4w5,..."
...,...,...,...,...,...
7292,INFP,Haven't posted here in a while. Here was my at...,Haven t posted here in a while Here was my att...,"[posted, attire, best, man, buddies, wedding, ...","[post, attir, best, man, buddi, wed, 698410, u..."
1086,INFP,"'Ok, I'll go first. I'm a 29 year old INFP mal...",Ok I ll go first I m a 29 year old INFP male ...,"[ok, go, first, 29, year, old, infp, male, int...","[ok, go, first, 29, year, old, infp, male, int..."
7435,ENTJ,"'I have dated a few INFJs, including my curren...",I have dated a few INFJs including my current...,"[dated, infjs, including, current, partner, 6,...","[date, infj, includ, current, partner, 6, year..."
1843,INTP,'People who are unable to replace social norms...,People who are unable to replace social norms...,"[people, unable, replace, social, norms, ratio...","[peopl, unabl, replac, social, norm, ration, e..."


#### Random example

In [131]:
print(f'Input (800 words):\n{train_data.posts[0][:800]}...')

Input (800 words):
'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times ...


In [121]:
print(f'Output:\n{Preprocessor(train_data.posts[0])}')

Output:
['url', 'url', 'enfp', 'intj', 'moment', 'url', 'sportscent', 'top', 'ten', 'play', 'url', 'prank', 'life', 'chang', 'experi', 'life', 'url', 'url', 'repeat', 'today', 'may', 'perc', 'experi', 'immers', 'last', 'thing', 'infj', 'friend', 'post', 'facebook', 'commit', 'suicid', 'next', 'day', 'rest', 'peac', 'url', 'hello', 'enfj7', 'sorri', 'hear', 'distress', 'natur', 'relationship', 'perfect', 'time', 'everi', 'moment', 'exist', 'tri', 'figur', 'hard', 'time', 'time', 'growth', '84389', '84390', 'url', 'url', 'welcom', 'stuff', 'url', 'game', 'set', 'match', 'prozac', 'wellbrutin', 'least', 'thirti', 'minut', 'move', 'leg', 'mean', 'move', 'sit', 'desk', 'chair', 'weed', 'moder', 'mayb', 'tri', 'edibl', 'healthier', 'altern', 'basic', 'come', 'three', 'item', 'determin', 'type', 'whichev', 'type', 'want', 'would', 'like', 'use', 'given', 'type', 'cognit', 'function', 'whatnot', 'left', 'thing', 'moder', 'sim', 'inde', 'video', 'game', 'good', 'one', 'note', 'good', 'one', 'so