In [57]:
%pylab inline
%precision 6

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


'%.6f'

In [58]:
import pandas as pd
import sklearn as skl
import nltk

In [59]:
pd.options.display.max_colwidth=100
np.set_printoptions(linewidth=140,edgeitems=10)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
rcParams['figure.figsize'] = (8.0, 5.0)

# Wordnet

In [60]:
from nltk.corpus import wordnet as wn

## We can extract meanings of particular word

In [61]:
wn.synsets('dish')

[Synset('dish.n.01'),
 Synset('dish.n.02'),
 Synset('dish.n.03'),
 Synset('smasher.n.02'),
 Synset('dish.n.05'),
 Synset('cup_of_tea.n.01'),
 Synset('serve.v.06'),
 Synset('dish.v.02')]

## and explore each particular meaning

In [62]:
wn.synset('dish.n.01').lemma_names()

['dish']

In [63]:
wn.synset('dish.n.01').definition()

'a piece of dishware normally used as a container for holding or serving food'

In [64]:
wn.synset('dish.n.01').examples()

['we gave them a set of dishes for a wedding present']

In [65]:
wn.synset('dish.n.02').definition()

'a particular item of prepared food'

In [66]:
wn.synset('dish.n.02').examples()

['she prepared a special dish for dinner']

## Semantic relationships

Applications:
    * search: by searching car return pages, containing lumousine, jeep, etc.
    * document classification: if word votes for particular class, then similar vote should be given to synonims and words with related 
      meaning

### Hyponims - subtypes of given object

In [67]:
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()

In [68]:
types_of_motorcar

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [69]:
wn.synset('ambulance.n.01').lemma_names()

['ambulance']

In [70]:
wn.synset('sports_car.n.01').lemma_names()

['sports_car', 'sport_car']

In [71]:
wn.synset('electric.n.01').lemma_names()

['electric', 'electric_automobile', 'electric_car']

In [72]:
sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())

['Model_T',
 'S.U.V.',
 'SUV',
 'Stanley_Steamer',
 'ambulance',
 'beach_waggon',
 'beach_wagon',
 'bus',
 'cab',
 'compact',
 'compact_car',
 'convertible',
 'coupe',
 'cruiser',
 'electric',
 'electric_automobile',
 'electric_car',
 'estate_car',
 'gas_guzzler',
 'hack',
 'hardtop',
 'hatchback',
 'heap',
 'horseless_carriage',
 'hot-rod',
 'hot_rod',
 'jalopy',
 'jeep',
 'landrover',
 'limo',
 'limousine',
 'loaner',
 'minicar',
 'minivan',
 'pace_car',
 'patrol_car',
 'phaeton',
 'police_car',
 'police_cruiser',
 'prowl_car',
 'race_car',
 'racer',
 'racing_car',
 'roadster',
 'runabout',
 'saloon',
 'secondhand_car',
 'sedan',
 'sport_car',
 'sport_utility',
 'sport_utility_vehicle',
 'sports_car',
 'squad_car',
 'station_waggon',
 'station_wagon',
 'stock_car',
 'subcompact',
 'subcompact_car',
 'taxi',
 'taxicab',
 'tourer',
 'touring_car',
 'two-seater',
 'used-car',
 'waggon',
 'wagon']

### Hypernyms - more general concepts

In [73]:
motorcar = wn.synset('car.n.01')
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

In [74]:
wn.synset('motor_vehicle.n.01').hypernyms()

[Synset('self-propelled_vehicle.n.01')]

In [75]:
wn.synset('self-propelled_vehicle.n.01').definition()

'a wheeled vehicle that carries in itself a means of propulsion'

In [76]:
wn.synset('self-propelled_vehicle.n.01').hypernyms()

[Synset('wheeled_vehicle.n.01')]

In [77]:
wn.synset('wheeled_vehicle.n.01').hypernyms()

[Synset('container.n.01'), Synset('vehicle.n.01')]

In [78]:
wn.synset('container.n.01').definition()

'any object that can be used to hold things (especially a large metal boxlike object of standardized dimensions that can be loaded from one form of transport to another)'

### Holonyms - things a particular word is part of 

In [85]:
wn.synset('animal.n.01').member_holonyms()

[Synset('animalia.n.01')]

In [81]:
wn.synset('car.n.01').substance_holonyms()

[]

### Meronyms - things a particular word contains

#### Substance meronims

In [86]:
wn.synset('water.n.01').substance_meronyms()

[Synset('hydrogen.n.01'), Synset('oxygen.n.01')]

In [87]:
wn.synset('tree.n.01').substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [90]:
wn.synset('house.n.01').substance_meronyms()

[]

In [88]:
wn.synset('heartwood.n.01').definition()

'the older inactive central wood of a tree or woody plant; usually darker and denser than the surrounding sapwood'

In [89]:
wn.synset('sapwood.n.01').definition()

'newly formed outer wood lying between the cambium and the heartwood of a tree or woody plant; usually light colored; active in water conduction'

#### Part meronims

In [None]:
wn.synset('house.n.01').definition()

In [91]:
wn.synset('house.n.01').part_meronyms()

[Synset('library.n.01'),
 Synset('loft.n.02'),
 Synset('porch.n.01'),
 Synset('study.n.05')]

In [92]:
wn.synset('tree.n.01').definition()

'a tall perennial woody plant having a main trunk and branches forming a distinct elevated crown; includes both gymnosperms and angiosperms'

In [93]:
wn.synset('tree.n.01').part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

### Entailments - acts that a given verb is composed of

In [94]:
wn.synset('eat.v.01').entailments()

[Synset('chew.v.01'), Synset('swallow.v.01')]

In [95]:
wn.synset('breathe.v.01').entailments()

[Synset('exhale.v.01'), Synset('inhale.v.02')]

### Find out more relations

In [96]:
dir(wn.synset('harmony.n.02'))

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_hypernyms',
 '_definition',
 '_examples',
 '_frame_ids',
 '_hypernyms',
 '_instance_hypernyms',
 '_iter_hypernym_lists',
 '_lemma_names',
 '_lemma_pointers',
 '_lemmas',
 '_lexname',
 '_max_depth',
 '_min_depth',
 '_name',
 '_needs_root',
 '_offset',
 '_pointers',
 '_pos',
 '_related',
 '_shortest_hypernym_paths',
 '_wordnet_corpus_reader',
 'also_sees',
 'attributes',
 'causes',
 'closure',
 'common_hypernyms',
 'definition',
 'entailments',
 'examples',
 'frame_ids',
 'hypernym_distances',
 'hypernym_paths',
 'hypernyms',
 'hyponyms',
 'instance_hypernyms',
 'instance_hyponyms',
 'jcn_similarity',
 'lch_si

### Measure semantic similarity between words by finding closest common hypernym

In [97]:
syn1 = wn.synset('jeep.n.01')
syn2 = wn.synset('ambulance.n.01')

syn1.lowest_common_hypernyms(syn2)

[Synset('car.n.01')]

In [98]:
syn1 = wn.synset('jeep.n.01')
syn2 = wn.synset('train.n.01')

syn1.lowest_common_hypernyms(syn2)

[Synset('conveyance.n.03')]

In [99]:
wn.synset('conveyance.n.03').definition()

'something that serves as a means of transportation'

In [100]:
syn1 = wn.synset('jeep.n.01')
syn2 = wn.synset('cat.n.01')

syn1.lowest_common_hypernyms(syn2)

[Synset('whole.n.02')]

In [103]:
wn.synset('whole.n.02').definition()

'an assemblage of parts that is regarded as a single entity'

In [104]:
wn.synset('whole.n.02').hypernyms()

[Synset('object.n.01')]

In [109]:
wn.synset('entity.n.01').hypernyms()

[]

## How abstract a word is?

In [105]:
wn.synset('conveyance.n.03').min_depth()

6

In [106]:
wn.synset('whole.n.02').min_depth()

3

In [110]:
wn.synset('car.n.01').min_depth()

10

In [111]:
syn1 = wn.synset('jeep.n.01')
syn2 = wn.synset('cat.n.01')
syn1.path_similarity(syn2)

0.052632

In [112]:
syn1 = wn.synset('jeep.n.01')
syn2 = wn.synset('ambulance.n.01')
syn1.path_similarity(syn2)

0.333333

# Reading external sources

In [113]:
import nltk, re, pprint
from nltk import word_tokenize

In [114]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')

In [115]:
type(raw)

str

In [116]:
len(raw)

1176896

In [117]:
raw[:75]

'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'

In [118]:
tokens = word_tokenize(raw)

In [119]:
type(tokens)

list

In [120]:
len(tokens)

254352

In [121]:
tokens[:130]

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by',
 'Fyodor',
 'Dostoevsky',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.',
 'You',
 'may',
 'copy',
 'it',
 ',',
 'give',
 'it',
 'away',
 'or',
 're-use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'Project',
 'Gutenberg',
 'License',
 'included',
 'with',
 'this',
 'eBook',
 'or',
 'online',
 'at',
 'www.gutenberg.org',
 'Title',
 ':',
 'Crime',
 'and',
 'Punishment',
 'Author',
 ':',
 'Fyodor',
 'Dostoevsky',
 'Release',
 'Date',
 ':',
 'March',
 '28',
 ',',
 '2006',
 '[',
 'EBook',
 '#',
 '2554',
 ']',
 '[',
 'Last',
 'updated',
 ':',
 'November',
 '15',
 ',',
 '2011',
 ']',
 'Language',
 ':',
 'English',
 'Character',
 'set',
 'encoding',
 ':',
 'ASCII',
 '***',
 'START',
 'OF',
 'THIS',
 'PROJECT',
 'GUTENBERG',
 'EBOOK',
 'CRIME',
 'AND',
 'PUNISHMEN

In [122]:
text = nltk.Text(tokens)

In [123]:
type(text)

nltk.text.Text

In [124]:
text[1024:1062]

['CHAPTER',
 'I',
 'On',
 'an',
 'exceptionally',
 'hot',
 'evening',
 'early',
 'in',
 'July',
 'a',
 'young',
 'man',
 'came',
 'out',
 'of',
 'the',
 'garret',
 'in',
 'which',
 'he',
 'lodged',
 'in',
 'S.',
 'Place',
 'and',
 'walked',
 'slowly',
 ',',
 'as',
 'though',
 'in',
 'hesitation',
 ',',
 'towards',
 'K.',
 'bridge',
 '.']

In [125]:
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; Nikodim Fomitch; young man; Ilya Petrovitch; n't know;
Project Gutenberg; Dmitri Prokofitch; Andrey Semyonovitch; Hay Market


In [126]:
pure_text = [w.lower() for w in text if w.isalpha()]

In [127]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(pure_text)

In [128]:
finder.apply_freq_filter(10)

In [129]:
finder.nbest(bigram_measures.pmi, 30)

[('o', 'u'),
 ('united', 'states'),
 ('arkady', 'ivanovitch'),
 ('literary', 'archive'),
 ('andrey', 'semyonovitch'),
 ('nikodim', 'fomitch'),
 ('archive', 'foundation'),
 ('hay', 'market'),
 ('dmitri', 'prokofitch'),
 ('gutenberg', 'literary'),
 ('electronic', 'works'),
 ('honoured', 'sir'),
 ('sofya', 'semyonovna'),
 ('marfa', 'petrovna'),
 ('police', 'station'),
 ('project', 'gutenberg'),
 ('fresh', 'air'),
 ('rodion', 'romanovitch'),
 ('avdotya', 'romanovna'),
 ('twenty', 'copecks'),
 ('pulcheria', 'alexandrovna'),
 ('project', 'electronic'),
 ('several', 'times'),
 ('stopped', 'short'),
 ('police', 'office'),
 ('thank', 'god'),
 ('great', 'deal'),
 ('ten', 'minutes'),
 ('five', 'minutes'),
 ('wide', 'open')]

In [130]:
finder.nbest(bigram_measures.student_t, 30)

[('in', 'the'),
 ('i', 'am'),
 ('he', 'had'),
 ('he', 'was'),
 ('on', 'the'),
 ('of', 'the'),
 ('it', 'was'),
 ('at', 'the'),
 ('you', 'are'),
 ('do', 'you'),
 ('to', 'be'),
 ('did', 'not'),
 ('at', 'once'),
 ('as', 'though'),
 ('katerina', 'ivanovna'),
 ('that', 'he'),
 ('with', 'a'),
 ('there', 'was'),
 ('in', 'a'),
 ('i', 'have'),
 ('to', 'the'),
 ('had', 'been'),
 ('out', 'of'),
 ('from', 'the'),
 ('pyotr', 'petrovitch'),
 ('could', 'not'),
 ('you', 'know'),
 ('the', 'door'),
 ('she', 'was'),
 ('the', 'same')]

In [131]:
finder.nbest(bigram_measures.chi_sq, 30)

[('avdotya', 'romanovna'),
 ('nikodim', 'fomitch'),
 ('o', 'u'),
 ('pulcheria', 'alexandrovna'),
 ('marfa', 'petrovna'),
 ('sofya', 'semyonovna'),
 ('rodion', 'romanovitch'),
 ('andrey', 'semyonovitch'),
 ('katerina', 'ivanovna'),
 ('united', 'states'),
 ('literary', 'archive'),
 ('hay', 'market'),
 ('arkady', 'ivanovitch'),
 ('pyotr', 'petrovitch'),
 ('dmitri', 'prokofitch'),
 ('archive', 'foundation'),
 ('project', 'gutenberg'),
 ('electronic', 'works'),
 ('gutenberg', 'literary'),
 ('great', 'deal'),
 ('honoured', 'sir'),
 ('police', 'station'),
 ('old', 'woman'),
 ('amalia', 'ivanovna'),
 ('project', 'electronic'),
 ('ilya', 'petrovitch'),
 ('fresh', 'air'),
 ('porfiry', 'petrovitch'),
 ('good', 'heavens'),
 ('sat', 'down')]

In [132]:
finder.nbest(bigram_measures.likelihood_ratio, 30)

[('i', 'am'),
 ('katerina', 'ivanovna'),
 ('pyotr', 'petrovitch'),
 ('pulcheria', 'alexandrovna'),
 ('avdotya', 'romanovna'),
 ('he', 'had'),
 ('at', 'once'),
 ('in', 'the'),
 ('as', 'though'),
 ('did', 'not'),
 ('you', 'are'),
 ('rodion', 'romanovitch'),
 ('on', 'the'),
 ('marfa', 'petrovna'),
 ('do', 'you'),
 ('he', 'was'),
 ('of', 'course'),
 ('sofya', 'semyonovna'),
 ('it', 'was'),
 ('the', 'same'),
 ('had', 'been'),
 ('old', 'woman'),
 ('could', 'not'),
 ('the', 'door'),
 ('to', 'be'),
 ('at', 'the'),
 ('there', 'was'),
 ('of', 'the'),
 ('porfiry', 'petrovitch'),
 ('you', 'know')]

## HTML reading

In [141]:
url = "http://www.space.com/16159-first-man-in-space.html"
html = request.urlopen(url).read().decode('utf8')
html[:60]

'<!DOCTYPE html>\n<html>\n<head itemscope itemtype="http://sche'

In [144]:
soup = BeautifulSoup(html, 'lxml')
map(lambda x: x.extract(), soup('script'))
print(soup.text)








Yuri Gagarin: First Man in Space








            var image_server = 'http://www.space.com';
        



        var jqueryUrl = 'http://www.space.com/resources/js/jquery.min.js?efd4e28';
    






















    var om_info = { 'env' : 'production', 'site' :'space', 'suite' : 'ttrglobalproduction', "display_name": "Space" };



<!--
/* You may give each page an identifying name, server, and channel on
the next lines. */
            s.pageName = "sdc:spaceflight:reference:16159";
                s.channel = "spaceflight";
                s.prop2 = "sdc";
                s.prop4 = "reference";
                s.prop5 = "yuri gagarin: first man in space";
                s.prop6 = "sdc-16159";
    
    
 /* Hierarchy Variables */
s.hier1="";
s.eVar15=s.prop15;
s.eVar16=s.prop16;
s.eVar17=s.prop17;
s.eVar18=s.prop18;

/************* DO NOT ALTER ANYTHING BELOW THIS LINE ! **************/
var s_code=s.t();if(s_code)document.write(s_code)//-->
<!--
if(navigator.appVersion.ind

In [134]:
from bs4 import BeautifulSoup
raw = BeautifulSoup(html,'lxml').get_text()
tokens = word_tokenize(raw)
tokens

['Yuri',
 'Gagarin',
 ':',
 'First',
 'Man',
 'in',
 'Space',
 'var',
 'image_server',
 '=',
 "'http",
 ':',
 '//www.space.com',
 "'",
 ';',
 'var',
 'jqueryUrl',
 '=',
 "'http",
 ':',
 '//www.space.com/resources/js/jquery.min.js',
 '?',
 'efd4e28',
 "'",
 ';',
 'var',
 'om_info',
 '=',
 '{',
 "'env",
 "'",
 ':',
 "'production",
 "'",
 ',',
 "'site",
 "'",
 ':',
 "'space",
 "'",
 ',',
 "'suite",
 "'",
 ':',
 "'ttrglobalproduction",
 "'",
 ',',
 '``',
 'display_name',
 "''",
 ':',
 '``',
 'Space',
 "''",
 '}',
 ';',
 '<',
 '!',
 '--',
 '/*',
 'You',
 'may',
 'give',
 'each',
 'page',
 'an',
 'identifying',
 'name',
 ',',
 'server',
 ',',
 'and',
 'channel',
 'on',
 'the',
 'next',
 'lines',
 '.',
 '*/',
 's.pageName',
 '=',
 '``',
 'sdc',
 ':',
 'spaceflight',
 ':',
 'reference:16159',
 "''",
 ';',
 's.channel',
 '=',
 '``',
 'spaceflight',
 "''",
 ';',
 's.prop2',
 '=',
 '``',
 'sdc',
 "''",
 ';',
 's.prop4',
 '=',
 '``',
 'reference',
 "''",
 ';',
 's.prop5',
 '=',
 '``',
 'yuri',
 'g

In [135]:
raw[:4000]

'\n\n\n\n\n\nYuri Gagarin: First Man in Space\n\n\n\n\n\n\n\n\n            var image_server = \'http://www.space.com\';\n        \n\n\n\n        var jqueryUrl = \'http://www.space.com/resources/js/jquery.min.js?efd4e28\';\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    var om_info = { \'env\' : \'production\', \'site\' :\'space\', \'suite\' : \'ttrglobalproduction\', "display_name": "Space" };\n\n\n\n<!--\n/* You may give each page an identifying name, server, and channel on\nthe next lines. */\n            s.pageName = "sdc:spaceflight:reference:16159";\n                s.channel = "spaceflight";\n                s.prop2 = "sdc";\n                s.prop4 = "reference";\n                s.prop5 = "yuri gagarin: first man in space";\n                s.prop6 = "sdc-16159";\n    \n    \n /* Hierarchy Variables */\ns.hier1="";\ns.eVar15=s.prop15;\ns.eVar16=s.prop16;\ns.eVar17=s.prop17;\ns.eVar18=s.prop18;\n\n/************* DO NOT ALTER ANYTHING BELOW THIS LINE ! **************/\nvar

In [136]:
raw.find('The United States')

4235

In [137]:
raw.rfind('June 18, 1983.')

10264

In [138]:
raw = raw[4235:10264]

In [139]:
tokens = word_tokenize(raw)
tokens

['The',
 'United',
 'States',
 'and',
 'the',
 'Soviet',
 'Union',
 'vigorously',
 'competed',
 'to',
 'push',
 'the',
 'boundaries',
 'of',
 'mankind',
 "'s",
 'exploration',
 'of',
 'space',
 '.',
 'The',
 'Russians',
 'scored',
 'a',
 'victory',
 'when',
 'they',
 'launched',
 'a',
 'small',
 'craft',
 'carrying',
 'cosmonaut',
 'Yuri',
 'Gagarin',
 'to',
 'new',
 'heights',
 '.',
 'His',
 '108-minute',
 'flight',
 'gave',
 'him',
 'a',
 'permanent',
 'place',
 'in',
 'the',
 'history',
 'books',
 'as',
 'the',
 'first',
 'man',
 'in',
 'space',
 '.',
 'The',
 'race',
 'to',
 'the',
 'stars',
 'With',
 'the',
 '1957',
 'launch',
 'of',
 'Sputnik',
 '1',
 ',',
 'the',
 'world',
 "'s",
 'first',
 'manmade',
 'satellite',
 ',',
 'the',
 'Russians',
 'took',
 'an',
 'early',
 'lead',
 'in',
 'the',
 'space',
 'race',
 '.',
 'The',
 'next',
 'step',
 'was',
 'to',
 'send',
 'a',
 'human',
 'off',
 'the',
 'planet',
 '.',
 'The',
 'American',
 'plan',
 'to',
 'send',
 'a',
 'man',
 'into'

In [140]:
text = nltk.Text(tokens)
text.concordance('space')

Displaying 17 of 17 matches:
                                     space . The Russians scored a victory when
he history books as the first man in space . The race to the stars With the 195
e Russians took an early lead in the space race . The next step was to send a h
The American plan to send a man into space by 1961 created a deadline that the 
take a living , breathing human into space . [ INFOGRAPHIC : How the First Huma
rs of Human Spaceflight ] First in ( space ) flight On April 12 , 1961 , at 9:0
rolov , chief designer of the Soviet space program , disregarded protocol and g
leave Earth 's orbit and travel into space . [ Milestones in Human Spaceflight 
were hesitant to allow him back into space . He continued to make test flights 
es in accidents . First Americans in space Alan Shepard flew in space on May 5 
ricans in space Alan Shepard flew in space on May 5 , 1961 , the first American
an Shepard was the first American in space and the second person in space , lau
an in space

In [145]:
import feedparser
log = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")

In [146]:
log['feed']['title']

'Language Log'

In [147]:
len(log.entries)

13

In [148]:
post = log.entries[2]
post.title

'Of precious swords and Old Sinitic reconstructions, part 3'

In [149]:
content = post.content[0].value
content[:70]

'<p>Previous posts in the series:</p>\n<p style="padding-left: 30px;">"<'

In [150]:
len(post.content)

1

In [151]:
for e in log.entries:
    print(len(e.content))

1
1
1
1
1
1
1
1
1
1
1
1
1


Libraries for reading pdf, doc and other common formats also exist

### Reading files

In [155]:
!dir

 ’®¬ ў гбва®©бвўҐ D ­Ґ Ё¬ҐҐв ¬ҐвЄЁ.
 ‘ҐаЁ©­л© ­®¬Ґа в®¬ : 34C9-0349

 ‘®¤Ґа¦Ё¬®Ґ Ї ЇЄЁ D:\Projects\Natural language processing

17.03.2016  18:51    <DIR>          .
17.03.2016  18:51    <DIR>          ..
17.03.2016  17:29    <DIR>          .ipynb_checkpoints
09.03.2016  14:06            28я281 Collocations.ipynb
09.03.2016  15:05                54 demo.txt
09.03.2016  20:24               162 lexicon.csv
03.03.2016  16:44        22я083я072 mystem.exe
25.02.2016  19:33           994я549 NLTK-intro.ipynb
17.03.2016  17:20            40я747 Part of speech tagging.ipynb
17.03.2016  18:09             7я792 Part of speech tagging.zip
03.03.2016  17:03    <DIR>          Paustovskiy_Corpus
17.03.2016  18:51           140я480 WordNet, stemming, lemmatization, tips&tricks.ipynb
17.03.2016  18:10             4я601 WordNet, stemming, lemmatization, tips&tricks.zip
03.03.2016  16:53            49я509 Ї ¤Ґ¦Ё.gif
              10 д ©«®ў     23я349я247 Ў ©в
               4 Ї Ї®Є  269я805я809я664 Ў ©в

In [153]:
import os  # browse files in path
os.listdir('.')

['.ipynb_checkpoints',
 'Collocations.ipynb',
 'demo.txt',
 'lexicon.csv',
 'mystem.exe',
 'NLTK-intro.ipynb',
 'Part of speech tagging.ipynb',
 'Part of speech tagging.zip',
 'Paustovskiy_Corpus',
 'WordNet, stemming, lemmatization, tips&tricks.ipynb',
 'WordNet, stemming, lemmatization, tips&tricks.zip',
 'падежи.gif']

In [156]:
f = open('demo.txt')
raw = f.read()

In [157]:
f.close()

In [158]:
raw

'Some sample contents \nfor demonstrative \npurposes.\n'

In [159]:
s = input("Enter some text: ")
print("You typed", len(word_tokenize(s)), "words.")

Enter some text: hi
You typed 1 words.


# Operations on strings

In [None]:
'some'+'string'

In [None]:
'text'*3

In [None]:
print('text')

In [160]:
print('some'+'text')

sometext


In [161]:
print('some','text')

some text


In [162]:
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
fdist.most_common(5)

[('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]

In [163]:
'hottest' in 'the hottest summer'

True

In [164]:
'this is the hottest day of the summer'.find('the')

8

In [165]:
'this is the hottest day of the summer'.rfind('the')

27

In [166]:
' '.join(['bunch','of','words'])

'bunch of words'

In [167]:
'-'.join(['bunch','of','words'])

'bunch-of-words'

In [168]:
'bunch of words'.split(' ')

['bunch', 'of', 'words']

In [169]:
'line\nnew line\none more line'.splitlines()

['line', 'new line', 'one more line']

In [170]:
'Moscow State University'.lower()

'moscow state university'

In [171]:
'  some line beginning and ending with whitespace characters   \r\n'.strip()

'some line beginning and ending with whitespace characters'

## Encodings

<b>Encode</b>: arbitrary encoding->unicode

<b>Decode</b>: unicode->arbitrary encoding

In [172]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

f = open(path, encoding='latin2')
for line in f:
    line = line.strip()
    print(line)

Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


In [173]:
nltk.corpus.treebank.words()

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...]

In [174]:
n=3
cv_word_pairs = [(w[i:i+j], w) for w in ['carrot']
                         for i in range(len(w)-n+1) for j in range(1,n+1)]
cv_word_pairs

[('c', 'carrot'),
 ('ca', 'carrot'),
 ('car', 'carrot'),
 ('a', 'carrot'),
 ('ar', 'carrot'),
 ('arr', 'carrot'),
 ('r', 'carrot'),
 ('rr', 'carrot'),
 ('rro', 'carrot'),
 ('r', 'carrot'),
 ('ro', 'carrot'),
 ('rot', 'carrot')]

In [175]:
n=3
cv_word_pairs = nltk.Index([(w[i:i+j], w) for w in english_vocab
                           for i in range(len(w)-n+1) for j in range(1,n+1)])

NameError: name 'english_vocab' is not defined

In [176]:
cv_word_pairs['xen']

TypeError: list indices must be integers or slices, not str

## findall() on nltk.Text object

<> selects a token

In [177]:
from nltk.corpus import gutenberg

In [178]:
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

In [179]:
moby[:5]

['[', 'Moby', 'Dick', 'by', 'Herman']

In [180]:
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [181]:
moby.findall(r"<a> (<.*ous>) <man>")

nervous; dangerous; pious; furious


In [184]:
from nltk.corpus import nps_chat
chat = nltk.Text(nps_chat.words())

In [185]:
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [186]:
nltk.app.nemo()  # tool for tuning regular expressions

In [187]:
nltk.re_show('\W\w{4}\W', ' some extravagant news here') # highlight matched regular expressions in the string

{ some }extravagant{ news }here


In [188]:
#find word and hypernyms
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


demands and other factors - false positive!

## Stemming

In [189]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """growing popularities of the advertizement"""
tokens = word_tokenize(raw)
[stem(t) for t in tokens]

['grow', 'popularit', 'of', 'the', 'advertize']

In [190]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [192]:
raw="Разные интересные новости"
tokens = word_tokenize(raw)
tokens

['Разные', 'интересные', 'новости']

In [194]:
raw="Ignoring these factors could lead to many troubles"
tokens = word_tokenize(raw)
tokens

['Ignoring', 'these', 'factors', 'could', 'lead', 'to', 'many', 'troubles']

In [195]:
[porter.stem(t) for t in tokens]

['Ignor', 'these', 'factor', 'could', 'lead', 'to', 'mani', 'troubl']

In [196]:
[lancaster.stem(t) for t in tokens]

['ign', 'thes', 'fact', 'could', 'lead', 'to', 'many', 'troubl']

In [197]:
def find_word(word,raw_text):
    tokens = word_tokenize(raw_text)
    stemmed_tokens = [porter.stem(token) for token in tokens]
    stemmed_word = porter.stem(word)
    inds = [i for i,stemmed_token in enumerate(stemmed_tokens) if stemmed_token==stemmed_word]
    matching_tokens = [tokens[ind] for ind in inds]
    return matching_tokens

In [198]:
print( find_word('boys', 'Many boys play in the garden and one boy is playing indoors') )  

['boys', 'boy']


In [199]:
print( find_word('played', 'Many boys play in the garden and one boy is playing indoors') )  

['play', 'playing']


## Lemmatizer

In [200]:
lemmatizer = nltk.WordNetLemmatizer()

In [201]:
raw="women have gone to the fields"
tokens = word_tokenize(raw)

[porter.stem(t) for t in tokens]

['women', 'have', 'gone', 'to', 'the', 'field']

In [202]:
raw="women have gone to the fields"
tokens = word_tokenize(raw)

[lemmatizer.lemmatize(t) for t in tokens]

['woman', 'have', 'gone', 'to', 'the', 'field']

## Segmenting text into words and sentences

In [203]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)

In [204]:
S='I went home. Then I read a book about F.B.I. B.H.Obama is the current president of the U.S.'
nltk.sent_tokenize(S)

['I went home.',
 'Then I read a book about F.B.I.',
 'B.H.Obama is the current president of the U.S.']

# Python tips and tricks

In [205]:
print(nltk.metrics.__file__) # show file corresponding to particular module

C:\Development\Anaconda3\lib\site-packages\nltk\translate\metrics.py


In [208]:
sin??

## Debugging

In [209]:
def fun_with_error():
    a=1
    b=0
    return a/b

In [210]:
import pdb
pdb.run('fun_with_error()')

> <string>(1)<module>()
(Pdb) l
[EOF]
(Pdb) u
> c:\development\anaconda3\lib\bdb.py(431)run()
-> exec(cmd, globals, locals)
(Pdb) u
*** Oldest frame
(Pdb) u
*** Oldest frame
(Pdb) q


In [211]:
from pdb import set_trace as bp

def fun_with_error():
    bp()  # put breakpoint here
    a=1
    b=0
    return a/b

fun_with_error()

> <ipython-input-211-d6cf5207c297>(5)fun_with_error()
-> a=1
(Pdb) l
  1  	from pdb import set_trace as bp
  2  	
  3  	def fun_with_error():
  4  	    bp()  # put breakpoint here
  5  ->	    a=1
  6  	    b=0
  7  	    return a/b
  8  	
  9  	fun_with_error()
[EOF]
(Pdb) n
> <ipython-input-211-d6cf5207c297>(6)fun_with_error()
-> b=0
(Pdb) n
> <ipython-input-211-d6cf5207c297>(7)fun_with_error()
-> return a/b
(Pdb) a
(Pdb) p a
1
(Pdb) p b
0
(Pdb) a/b
(Pdb) p a/b
*** ZeroDivisionError: division by zero
(Pdb) n
ZeroDivisionError: division by zero
> <ipython-input-211-d6cf5207c297>(7)fun_with_error()
-> return a/b
(Pdb) q


BdbQuit: 

In [None]:
1+1

In [212]:
def fun_with_error():
    a=1
    b=0
    return a/b

fun_with_error()

ZeroDivisionError: division by zero

In [213]:
%debug

> [1;32m<ipython-input-212-18b7d942f28c>[0m(4)[0;36mfun_with_error[1;34m()[0m
[1;32m      3 [1;33m    [0mb[0m[1;33m=[0m[1;36m0[0m[1;33m[0m[0m
[0m[1;32m----> 4 [1;33m    [1;32mreturn[0m [0ma[0m[1;33m/[0m[0mb[0m[1;33m[0m[0m
[0m[1;32m      5 [1;33m[1;33m[0m[0m
[0m
ipdb> l
[0;32m      1 [0m[1;32mdef[0m [0mfun_with_error[0m[1;33m([0m[1;33m)[0m[1;33m:[0m[1;33m[0m[0m
[0;32m      2 [0m    [0ma[0m[1;33m=[0m[1;36m1[0m[1;33m[0m[0m
[0;32m      3 [0m    [0mb[0m[1;33m=[0m[1;36m0[0m[1;33m[0m[0m
[1;32m----> 4 [1;33m    [1;32mreturn[0m [0ma[0m[1;33m/[0m[0mb[0m[1;33m[0m[0m
[0m[0;32m      5 [0m[1;33m[0m[0m
[0;32m      6 [0m[0mfun_with_error[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m

ipdb> q


List of debugging commands [here](http://web.stanford.edu/class/physics91si/2013/handouts/Pdb_Commands.pdf)

In [214]:
for l in open("lexicon.csv", "r"):
    print(l,end='')

sleep, sli:p, v.i, a condition of body and mind ...
walk, wo:k, v.intr, progress by lifting and setting down each foot ...
wake, weik, intrans, cease to sleep


In [215]:
import csv
input_file = open("lexicon.csv", "r") 
for row in csv.reader(input_file):
    print(row)

['sleep', ' sli:p', ' v.i', ' a condition of body and mind ...']
['walk', ' wo:k', ' v.intr', ' progress by lifting and setting down each foot ...']
['wake', ' weik', ' intrans', ' cease to sleep']


In [None]:
csv.reader?