# NLTK research questions

In [12]:
# Jupyter
import nltk
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Colab
# Mirror the imports of the Jupyter cell so that later cells (pos_tag, stopwords, ...)
# work when only this cell has been run.
import nltk
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this notebook relies on (a hosted runtime may start without it).
# The first five ids are the ones this cell always downloaded. The rest are the corpora
# read by `from nltk.book import *` plus the tagset help data used by
# nltk.help.upenn_tagset -- NOTE(review): confirm the ids against your NLTK version.
for resource_id in (
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'punkt',
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'treebank',
    'tagsets',
):
    nltk.download(resource_id)

In [2]:
# First import the books: the star-import loads text1..text9 and sent1..sent9 into the
# notebook namespace (see the banner printed below).
# NOTE(review): FreqDist, used further down, is not imported anywhere else in view --
# presumably it also arrives through this star-import; confirm.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [6]:
# POS: tags parts of speech. Why do you think this is useful?
# Note: pos_tag(text3) tags every token of text3 first; the slice then keeps only the
# first 10 (token, tag) pairs, so this cell does more work than its short output suggests.

pos_tag(text3)[:10]

[('In', 'IN'),
 ('the', 'DT'),
 ('beginning', 'NN'),
 ('God', 'NNP'),
 ('created', 'VBD'),
 ('the', 'DT'),
 ('heaven', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('earth', 'NN')]

In [7]:
# For help in figuring out tags: print the definition and example words for one tag,
# here 'IN' (the tag assigned to the first token in the previous output).

nltk.help.upenn_tagset('IN')

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...


### personals corpus

In [15]:
# Count how often each token occurs in text8 (the Personals Corpus), without any cleaning.
FreqDist(text8)

FreqDist({',': 539, '.': 353, '/': 110, 'for': 99, 'and': 74, 'to': 74, 'lady': 68, '-': 66, 'seeks': 60, 'a': 52, ...})

In [16]:
# The 20 most frequent raw tokens with their counts; punctuation and function words dominate.
FreqDist(text8).most_common(20)

[(',', 539),
 ('.', 353),
 ('/', 110),
 ('for', 99),
 ('and', 74),
 ('to', 74),
 ('lady', 68),
 ('-', 66),
 ('seeks', 60),
 ('a', 52),
 ('with', 44),
 ('S', 36),
 ('ship', 33),
 ('&', 30),
 ('relationship', 29),
 ('fun', 28),
 ('in', 27),
 ('slim', 27),
 ('build', 27),
 ('o', 26)]

In [17]:
# Looks like our text needs some cleaning. Let's remove stopwords and punctuation.

# Keep only purely alphabetic tokens (this drops punctuation and anything containing
# digits or symbols) and lower-case them so 'Lady' and 'lady' are counted together.
p_no_punct_lower = [word.lower() for word in text8 if word.isalpha()]

# `stops` stays a list, exactly as before, in case other cells use it.
stops = stopwords.words('english')

# Testing membership in a list rescans it for every token; a set makes each lookup
# constant-time without changing which tokens are kept.
stop_set = set(stops)

# Cleaned tokens with English stopwords removed; original token order is preserved.
p_no_stops = [item for item in p_no_punct_lower if item not in stop_set]

In [18]:
# Top 20 again, now on the cleaned tokens: content words such as 'lady' and 'seeks' lead.
# 'lady' rises from 68 to 88, presumably because lower-casing merged capitalised variants.
FreqDist(p_no_stops).most_common(20)

[('lady', 88),
 ('seeks', 72),
 ('male', 42),
 ('looking', 34),
 ('ship', 33),
 ('slim', 33),
 ('fun', 31),
 ('attractive', 29),
 ('relationship', 29),
 ('build', 27),
 ('good', 26),
 ('seeking', 25),
 ('non', 25),
 ('smoker', 23),
 ('n', 23),
 ('guy', 22),
 ('honest', 22),
 ('movies', 22),
 ('age', 21),
 ('married', 21)]

In [19]:
# Alias only: `personals` now refers to the same list object as `p_no_stops` (no copy is made).
personals = p_no_stops

In [21]:
# Wrap the cleaned tokens in an nltk.Text so the exploration helpers used in the
# following cells (similar, common_contexts, collocations, count, generate) are available.
# Built straight from p_no_stops, so re-running this cell gives the same result instead
# of wrapping an already-wrapped Text.
personals = nltk.Text(p_no_stops)

In [22]:
# Print words that occur in contexts similar to 'lady'. Stopwords and punctuation were
# already removed, so the "context" is made of neighbouring content words.
personals.similar('lady')

seeks fem female looking women someone woman short


In [23]:
# Same query for 'man'; far fewer words come back than for 'lady'.
personals.similar('man')

male young


In [26]:
# Print the contexts surrounding 'lady', shown as before_after word pairs.
personals.common_contexts(['lady'])

looking_non asian_sought seeks_age seeks_casual married_discreet
seeks_r single_discreet seeking_uniform mum_fship affectionate_sought
attractive_european attractive_children meet_friendship caring_likes
honest_friendship figured_plus looking_similar classy_wants
medium_friendship loving_friendship


In [28]:
# Print collocations: word pairs that frequently occur together in the cleaned text.
personals.collocations()

non smoker; would like; like meet; age open; social drinker; medium
build; quiet nights; long term; sense humour; med build; easy going;
nights home; poss rship; smoker social; financially secure; fship
poss; fun times; weekends away; single dad; similar interests


In [29]:
# Number of occurrences of 'lady' in the cleaned tokens; agrees with the FreqDist count of 88.
personals.count('lady')

88

In [30]:
# Ask for 100 tokens of generated pseudo-text based on the cleaned corpus.
# The text shows up twice in the output because generate() prints it and also returns it
# as a string, which the notebook then displays as the cell's value.
# NOTE(review): presumably an n-gram language model is built on first use (see the
# "Building ngram index..." message) -- confirm against your NLTK version.
personals.generate(100)

nts home self supporting happy nat fun loving lady friendship
relationship kids ok eastern suburbs yo male blond outgoing genuine
friendly seeking female bet w yo easygoing friendly year old working
tall guy would like meet sexy honest reliable looking find friend
asian guy prof well presented athletic ties n seeks female early fun
times possible relationship little magic looking lady non smoker
social drinker seeking female bet w yo easygoing friendly year old
lady relationship clare barossa region northern suburbs tall
attractive male ties earth easy going gsoh honest reliable looking
find friend asian guy prof well presented wltm


Building ngram index...


'nts home self supporting happy nat fun loving lady friendship\nrelationship kids ok eastern suburbs yo male blond outgoing genuine\nfriendly seeking female bet w yo easygoing friendly year old working\ntall guy would like meet sexy honest reliable looking find friend\nasian guy prof well presented athletic ties n seeks female early fun\ntimes possible relationship little magic looking lady non smoker\nsocial drinker seeking female bet w yo easygoing friendly year old\nlady relationship clare barossa region northern suburbs tall\nattractive male ties earth easy going gsoh honest reliable looking\nfind friend asian guy prof well presented wltm'