## Explore SpaCy 

### Import packages and data 

In [7]:
# regular package 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import os
import string 

# nlp specific 
from collections import Counter
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [8]:
# import data 
df_punc = pd.read_csv('../../clean_data/with_punc.csv', index_col=0)

  mask |= (ar1 == a)


In [25]:
df_sample = df_punc.sample(n=1000)

In [10]:
df_sample

Unnamed: 0,MBTI,comments
1533251,INFP,"Also, this is availble on jailbroken iPhones a..."
489172,INFP,Damn spices eating all my jeans!
1434585,INFP,When my friend first showed me a picture of th...
979964,INFP,Lovely
50650,ENFJ,I have impressed the guy who taught me not to ...
...,...,...
1265,ESTP,Asian American History: Movement and Dislocati...
122703,ENTJ,I'm not saying that all media is bad all of th...
1101969,INTP,Carthage. Spam cities and pick up that +2 Scie...
61720,ESTJ,Redditors


In [3]:
# check missing values
df_punc.isna().sum()

# good there is no null value

MBTI        0
comments    0
dtype: int64

In [4]:
X = df_punc.comments
y = df_punc.MBTI

In [5]:
# split the data 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


### Text Processing in SpaCy

In [11]:
import spacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_sm')

# Parse the text with spaCy. This runs the entire pipeline.

# function for cleaning text 
def token_filter(token):
    return not (token.is_punct | token.is_space | token.is_stop | len(token.text) <= 2 | token.like_email | token.like_url)

In [131]:
# function cleaning 
def clean_text(docs): 
    filtered_tokens = []
    for doc in nlp.pipe(docs):
        tokens = [token.lemma_ for token in doc if token_filter(token) ]
        # [tok.lemma_.lower().strip(this removes the -- in -pron-) if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
        tokens = [word.lower() for word in tokens]
        tokens = [word for word in tokens if word != '-pron-']
        tokens = [re.sub(r"http\S+", "", word) for word in tokens]
        filtered_tokens.append(tokens)
    return filtered_tokens

In [51]:
from tqdm.notebook import trange, tqdm
from tqdm import tqdm_notebook
from time import sleep
tqdm.pandas()

  from pandas import Panel


In [61]:
# Further function cleaning 
def clean_text(docs): 
    filtered_tokens = []
    
    for doc in nlp.pipe(docs):
        tokens = [token.lemma_ for token in doc if token_filter(token) ]
        # [tok.lemma_.lower().strip(this removes the -- in -pron-) if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
        tokens = [word.lower() for word in tokens]
        tokens = [word for word in tokens if word != '-pron-']
        #tokens = [re.sub(r"http\S+", "", word) for word in tokens]
        tokens = [re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'url', word) for word in tokens]
        tokens = [re.sub("[^a-zA-Z]", " ", word) for word in tokens] # keep only words
        tokens = [re.sub(' +', ' ', word) for word in tokens] # remove space > 1
        filtered_tokens.append(tokens)
    return filtered_tokens

In [60]:
df_sample['text_lemma'] = clean_text(df_sample['comments'])
df_sample['text_lemma'] = df_sample['text_lemma'].apply(lambda x: ' '.join(x))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [27]:
df_sample
#pd.set_option('display.max_colwidth', -1)

Unnamed: 0,MBTI,comments,text_lemma
787840,ISTP,"It's not quite that simple. The ""test taking"" environment is completely different to day-to-day stuff. Dead silence, no referencing/quick checks which I do constantly (to reaffirm my knowledge), the stress of trying to get a good grade, being tested on the spot, having a time limit, etc. It impairs (at least my) ability to remember things. But just get me out of the blue when I'm completely relaxed and there's no stress or anything? Sure, I could easily tell you. Also, the questions on exams are usually different than how you'd usually use the knowledge as well as different from how you learned it. Also, some people are just shit at thoroughly making sure that the knowledge is down on paper. Sometimes you forget to mention something that the other person would simply just follow up on and ask about, which you'd then respond. On a test, that doesn't happen, you just get points knocked off.",be not quite that simple the test take environment be completely different to day to day stuff dead silence no referencing quick check which do constantly to reaffirm knowledge the stress of try to get good grade be test on the spot have time limit etc impair at least ability to remember thing but just get out of the blue when be completely relaxed and there be no stress or anything sure could easily tell also the question on exam be usually different than how would usually use the knowledge as well as different from how learn also some people be just shit at thoroughly make sure that the knowledge be down on paper sometimes forget to mention something that the other person would simply just follow up on and ask about which would then respond on test that do not happen just get point knock off
46144,ISTJ,"For those who haven't been in touch with the game for long or very closely (including the creator of this video I assume), in the past we have had multiple news agencies blow up at Arma 3 for being an ""ISIS training simulator"" because of a mod that added ISIS units. Don't let this happen again. Please remove this for the sake of the game.",for those who have not be in touch with the game for long or very closely include the creator of this video assume in the past have have multiple news agency blow up at arma for be an isis training simulator because of mod that add isis unit do not let this happen again please remove this for the sake of the game
321942,ENTP,"The fat toothless kid from *Stranger Things*. He seems to be the only one of the kids who has a grasp of the bigger picture. He starts off as more of a sidekick until the other two kids start to lose it, then he steps from out of the shadows, smacks them around until they see sense, and gets everything back on track.",the fat toothless kid from stranger things seem to be the only one of the kid who have grasp of the big picture start off as more of sidekick until the other two kid start to lose then step from out of the shadow smack around until see sense and get everything back on track
123281,ENFP,"I wasn't allowed to wear nail polish, get natural-colored highlights, wear makeup (even to cover a zit), or have any piercing besides one normal-sized love at my school.",be not allow to wear nail polish get natural color highlight wear makeup even to cover zit or have any piercing besides one normal sized love at school
263516,ISFJ,"She's been off the grid for what...like a month? Two months? No one in the fandom knows where she is or what she's doing right now. It doesn't surprise me that she didn't go and I'm not sure why everyone is so upset about it. I feel like if she did go, it would have been all over the news (because the media is obsessed with her), then people would be saying ""Ugh, why is Taylor Swift making the women's march all about her?"" Girl can't win. Not to mention the fact that she has all kinds of psychos who stalk her and doesn't go anywhere without security...all that plus the paparazzi would have created a huge spectacle and detracted from the importance of the march. Beyonce uses feminism to promote her brand way more than Taylor does, and no one's salty about her not going. I'll never understand why people have such a stick up their ass about Taylor.",be be off the grid for what like month two month no one in the fandom know where be or what be do right now do not surprise that do not go and be not sure why everyone be so upset about feel like if did go would have be all over the news because the medium be obsess with then people would be say ugh why be taylor swift make the woman s march all about girl can not win not to mention the fact that have all kind of psychos who stalk and do not go anywhere without security all that plus the paparazzi would have create huge spectacle and detract from the importance of the march beyonce use feminism to promote brand way more than taylor do and no one s salty about not go will never understand why people have such stick up ass about taylor
...,...,...,...
689063,ISTP,"This isn't about respect this is about ending suffering, hatred has never in all of human history conquered hate and until we decide to tackle the problems at their roots instead of dehumanizing people hate will never end.",this be not about respect this be about end suffering hatred have never in all of human history conquer hate and until decide to tackle the problem at root instead of dehumanize people hate will never end
288663,ENTP,"I can help you out personally :) I know a lot about python and have even made a bot in python (/u/dogetipchecker). PM me if you are interested, and I can recommend some books and also answer specific questions about python. I can give you my skype number, and we can chat with skype too if you are OK with that.",can help out personally know lot about python and have even make bot in python dogetipchecker if be interested and can recommend some book and also answer specific question about python can give skype number and can chat with skype too if be with that
996000,INFP,TIL that the fires of Hell are blue! O_O,til that the fire of hell be blue o o
20057,ISTP,Doesn't spoofing require kernel? Hype?,do not spoof require kernel hype


In [135]:
df_punc['comments_lemma'] = clean_text(df_punc['comments'])
df_punc['comments_lemma'] = df_punc['comments_lemma'].apply(lambda x: ' '.join(x))

KeyboardInterrupt: 

In [133]:
df_sample

Unnamed: 0,MBTI,comments,text_lemma
162062,ISTP,NO THAT'S THE COMPETITION FOR BORING RUGBY.,no that be the competition for boring rugby
1551436,INFP,Search for it.,search for
54899,ESFJ,I quickly made a room in GroupMe if you wanted...,quickly make room in groupme if want to give t...
1960,ESTP,"He never said that, god damn your such a strawman",never say that god damn such strawman
933076,INFJ,"""Great, I'll remember that,"" I smile before si...",great will remember that smile before sip beer...
...,...,...,...
15416,ESTJ,"&gt; ENTJ for Gilgamesh, not ESTJ and definite...",entj for gilgamesh not estj and definitely not...
759747,INFP,People who lack empathy usually also lack shame,people who lack empathy usually also lack shame
1209225,ENTJ,"ESFJ mom and Entp dad. Sisters are ENFP, INTP,...",esfj mom and entp dad sister be enfp intp isfj...
380907,ENTP,"i haven't updated my flair, I graduated about ...",have not update flair graduate about month ago...


In [134]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(df_sample['comments'].iloc[98])

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

about 15 months ago 40 59 DATE
3 days 94 100 DATE
