# Basic NLP

### Reading data

In [1]:
import pandas as pd
df = pd.read_csv("./data/mbti-type.zip", compression="zip")
print(df.shape)
df.head()

(8675, 2)


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


### Preprocessing the text

In [2]:
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import TweetTokenizer
from nltk import download as nltk_download
import string

nltk_download("stopwords")

tokenizer = TweetTokenizer() # using TweetTokenizer even though these aren't tweets per se
stopwords = set(nltk_stopwords.words('english'))

punc_stripper = str.maketrans('', '', string.punctuation)

def preprocess(text):
    # lowercase
    text = text.lower()
    
    # strip punctuation
    text = text.translate(punc_stripper)
    
    # tokenize
    tokenized = tokenizer.tokenize(text)
    
    # remove stopwords
    unstopped = [t for t in tokenized if t not in stopwords]
    
    return unstopped

test_text = "This is Some test text!"
print(preprocess(test_text))

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['test', 'text']


In [3]:
df['preprocessed'] = df.posts.apply(preprocess)

### N-Gram analysis

In [4]:
from nltk.util import ngrams as nltk_ngrams

def ngram_extract(tokens, n):
    ngrams = list(nltk_ngrams(tokens, n))
    ngram_phrases = [" ".join(x) for x in ngrams]
    return ngram_phrases
    
test_ngrams = ["my", "name", "is", "Carl"]
ngram_extract(test_ngrams, 3)

['my name is', 'name is Carl']

In [5]:
    
df['unigrams'], df['bigrams'], df['trigrams'] = [df.preprocessed.apply(lambda x: ngram_extract(x, n + 1)) for n in range(3)]
df.head()

Unnamed: 0,type,posts,preprocessed,unigrams,bigrams,trigrams
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[httpwwwyoutubecomwatchvqsxhcwe, 3krwhttp41med...","[httpwwwyoutubecomwatchvqsxhcwe, 3krwhttp41med...",[httpwwwyoutubecomwatchvqsxhcwe 3krwhttp41medi...,[httpwwwyoutubecomwatchvqsxhcwe 3krwhttp41medi...
1,ENTP,'I'm finding the lack of me in these posts ver...,"[im, finding, lack, posts, alarmingsex, boring...","[im, finding, lack, posts, alarmingsex, boring...","[im finding, finding lack, lack posts, posts a...","[im finding lack, finding lack posts, lack pos..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good one, one httpswwwyoutubecomwatchvfhigbol...",[good one httpswwwyoutubecomwatchvfhigbolffgwo...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoyed, conversation, day, esote...","[dear, intp, enjoyed, conversation, day, esote...","[dear intp, intp enjoyed, enjoyed conversation...","[dear intp enjoyed, intp enjoyed conversation,..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"[youre, firedthats, another, silly, misconcept...","[youre, firedthats, another, silly, misconcept...","[youre firedthats, firedthats another, another...","[youre firedthats another, firedthats another ..."


In [6]:
rows = list()
for row in df[['type', 'unigrams']].iterrows():
    r = row[1]
    for word in r.unigrams:
        rows.append((r.type, word))
unigrams = pd.DataFrame(rows, columns=["type", "unigrams"])
unigrams.head()

Unnamed: 0,type,unigrams
0,INFJ,httpwwwyoutubecomwatchvqsxhcwe
1,INFJ,3krwhttp41mediatumblrcomtumblrlfouy03pma1qa1ro...
2,INFJ,intj
3,INFJ,moments
4,INFJ,httpswwwyoutubecomwatchviz


### A less stupid way to extract n-grams

In [7]:
from collections import Counter
import pprint
pp = pprint.PrettyPrinter()

types = df['type'].unique()
ns = [1,2,3]

n_gram_freqs_by_type = {}
for type in types:
    n_gram_freqs = {}
    for n in ns:
        n_gram_freqs[n] = Counter()
    n_gram_freqs_by_type[type] = n_gram_freqs
pp.pprint(n_gram_freqs_by_type)

{'ENFJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ENFP': {1: Counter(), 2: Counter(), 3: Counter()},
 'ENTJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ENTP': {1: Counter(), 2: Counter(), 3: Counter()},
 'ESFJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ESFP': {1: Counter(), 2: Counter(), 3: Counter()},
 'ESTJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ESTP': {1: Counter(), 2: Counter(), 3: Counter()},
 'INFJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'INFP': {1: Counter(), 2: Counter(), 3: Counter()},
 'INTJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'INTP': {1: Counter(), 2: Counter(), 3: Counter()},
 'ISFJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ISFP': {1: Counter(), 2: Counter(), 3: Counter()},
 'ISTJ': {1: Counter(), 2: Counter(), 3: Counter()},
 'ISTP': {1: Counter(), 2: Counter(), 3: Counter()}}


In [8]:
from nltk.util import ngrams
from tqdm import tqdm_notebook as tqdm

for row_id, row in tqdm(df[["type", "preprocessed"]].iterrows()):
    #get ngrams for all values of n defined previously
    for n in ns:
        grams = ngrams(row.preprocessed, n)
        for gram in grams:
            n_gram_freqs_by_type[row.type][n][gram] += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Display the extracted n-grams per personality type

In [9]:
for type in types:
    print(type)
    for n in ns:
        print("\tMost common %d-grams:" % n)
        for entry, count in n_gram_freqs_by_type[type][n].most_common(5):
            print("\t\t" + " ".join(entry) + "\t" + str(count))

INFJ
	Most common 1-grams:
		im	11573
		like	11477
		think	8594
		dont	8449
		people	7950
	Most common 2-grams:
		dont know	1272
		feel like	1118
		dont think	877
		im sure	540
		dont want	424
	Most common 3-grams:
		sent iphone using	137
		feel like im	96
		im pretty sure	80
		know youre infj	71
		dont really know	56
ENTP
	Most common 1-grams:
		like	5133
		im	4857
		dont	3855
		think	3835
		people	3379
	Most common 2-grams:
		dont know	490
		dont think	398
		feel like	310
		im sure	209
		dont really	191
	Most common 3-grams:
		im pretty sure	69
		sent iphone using	48
		know youre entp	42
		dont feel like	27
		nucky nucky nucky	25
INTP
	Most common 1-grams:
		like	9325
		im	8698
		dont	7704
		think	6924
		people	6498
	Most common 2-grams:
		dont know	1012
		dont think	730
		feel like	594
		im sure	409
		dont like	387
	Most common 3-grams:
		im pretty sure	102
		feel like im	62
		dont really know	60
		dont even know	55
		know youre intp	51
INTJ
	Most common 1-grams:
		like	7509
		im	66