In [1]:
import spacy
import pickle
from spacy import displacy

In [2]:
tweets_month_file_name = 'data/tweets-by-month.data'
tweets_window_file_name = 'data/tweets-by-window.data'

def _load_pickle( file_name ):
    """Return the object stored in the pickle file at `file_name`.

    SECURITY: pickle.load() can execute arbitrary code embedded in the file,
    so only point this at data files that were produced by a trusted source.
    """
    # read the data as binary data stream
    with open( file_name, 'rb' ) as filehandle:
        return pickle.load( filehandle )

# NOTE(review): `tweets_by_year` is filled from the "by-window" file; the name is
# kept unchanged so existing references keep working — confirm which is intended.
tweets_by_year = _load_pickle( tweets_window_file_name )
tweets_by_month = _load_pickle( tweets_month_file_name )

# Quick sanity check: number of entries in each loaded collection.
print( len( tweets_by_year ) )
print( len( tweets_by_month ) )

101
112


In [3]:
# 'en' is a shortcut link that only exists on spaCy v2 installs where it was
# created; when it cannot be resolved spacy.load() raises OSError, so fall back
# to the explicit package name of the small English pipeline.
try:
    nlp = spacy.load( 'en' )
except OSError:
    nlp = spacy.load( 'en_core_web_sm' )

### Linguistic annotations

In [4]:
doc = u'Apple is looking at buying U.K. startup for $1 billion'

def print_pos( doc ):
    """Print one line per token: text, coarse part-of-speech tag, dependency label.

    `doc` is passed to the module-level `nlp` callable and the result is
    iterated token by token. Nothing is returned.
    """
    for tok in nlp( doc ):
        print( tok.text, tok.pos_, tok.dep_ )
        
print_pos( doc )        

Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [5]:
# Take the final element of the unpickled month collection.
# NOTE(review): presumably the most recent month — confirm the collection is in chronological order.
last_month = tweets_by_month[ -1 ]

In [6]:
# Show the last item of last_month[ 0 ] next to the last item of last_month[ 1 ];
# in the recorded output these print as a timestamp followed by the tweet text.
print( last_month[ 0 ][ -1 ], last_month[ 1 ][ -1 ] )

2018-08-28 09:57:51 NASDAQ has just gone above 8000 for the first time in history!


In [7]:
# Use the last item of last_month[ 1 ] (the tweet text in the recorded output) as the
# running example, and print its per-token part-of-speech and dependency labels.
tweet = last_month[ 1 ][ -1 ]
print_pos( tweet )

NASDAQ PROPN nsubj
has VERB aux
just ADV advmod
gone VERB ROOT
above ADP prep
8000 NUM pobj
for ADP prep
the DET det
first ADJ amod
time NOUN pobj
in ADP prep
history NOUN pobj
! PUNCT punct


### Tokenization

In [8]:
def print_tokens( doc ):
    """Print the text of every token, one per line.

    `doc` is passed to the module-level `nlp` callable; the tokens it yields
    are printed in order. Nothing is returned.
    """
    token_texts = ( tok.text for tok in nlp( doc ) )
    for token_text in token_texts:
        print( token_text )
        
tweet = last_month[ 1 ][ -1 ]
print_tokens( tweet )        

NASDAQ
has
just
gone
above
8000
for
the
first
time
in
history
!


### PoS + Dependencies

In [9]:
def print_pos_and_dependencies( doc ):
    """Print a space-separated row of annotations for every token.

    Columns, in order: text, lemma_, pos_, tag_, dep_, shape_, is_alpha,
    is_stop. `doc` is passed to the module-level `nlp` callable. Nothing is
    returned.
    """
    columns = ( 'text', 'lemma_', 'pos_', 'tag_', 'dep_', 'shape_', 'is_alpha', 'is_stop' )

    for tok in nlp( doc ):
        # Look the attributes up by name so the column order lives in one place.
        print( *( getattr( tok, column ) for column in columns ) )
        
print_pos_and_dependencies( tweet )        

NASDAQ nasdaq PROPN NNP nsubj XXXX True False
has have VERB VBZ aux xxx True True
just just ADV RB advmod xxxx True True
gone go VERB VBN ROOT xxxx True False
above above ADP IN prep xxxx True True
8000 8000 NUM CD pobj dddd False False
for for ADP IN prep xxx True True
the the DET DT det xxx True True
first first ADJ JJ amod xxxx True True
time time NOUN NN pobj xxxx True False
in in ADP IN prep xx True True
history history NOUN NN pobj xxxx True False
! ! PUNCT . punct ! False False


### Named Entities

In [10]:
def print_named_entities( doc ):
    """Print every named entity found in the text, one per line.

    Each line holds the entity text, its start and end character offsets and
    its label. `doc` is passed to the module-level `nlp` callable and the
    `ents` of the result are listed. Nothing is returned.
    """
    entities = nlp( doc ).ents

    for span in entities:
        print( span.text, span.start_char, span.end_char, span.label_ )
    
print_named_entities( tweet )    

NASDAQ 0 6 ORG
8000 27 31 CARDINAL
first 40 45 ORDINAL


### Visualize Parse

In [11]:
# from: https://stackoverflow.com/questions/25698448/how-to-embed-html-into-ipython-output
# `display` and `HTML` come from the public IPython.display module; importing
# them through IPython.core.display is deprecated.
from IPython.display import display, HTML

# jupyter=False makes render() hand back the markup as a string so it can be
# wrapped in HTML() here; spaCy versions that auto-detect a notebook would
# otherwise display the parse themselves and return None.
# page=True is the boolean form of the truthy "true" string used before.
dep_markup = displacy.render( nlp( tweet ), style='dep', page=True, jupyter=False )
display( HTML( dep_markup ) )

### Word Vectors and Similarity

In [16]:
tokens = u'dog cat banana'

def print_similiary( tokens ):
    """Print the pairwise similarity of every token with every token.

    `tokens` is passed to the module-level `nlp` callable. For each ordered
    pair of resulting tokens (including a token with itself) one line is
    printed: first text, second text, `first.similarity( second )`.
    Nothing is returned.

    NOTE(review): if `nlp` is a small pipeline without real word vectors the
    scores are presumably based on context tensors rather than word vectors —
    confirm which model is loaded before interpreting the numbers.
    """
    parsed = nlp( tokens )

    for token1 in parsed:
        for token2 in parsed:
            print( token1.text, token2.text, token1.similarity( token2 ) )

# Correctly spelled alias; the original (misspelled) name stays for existing callers.
print_similarity = print_similiary
            
print_similiary( tokens )            

dog dog 1.0
dog cat 0.53906965
dog banana 0.28761008
cat dog 0.53906965
cat cat 1.0
cat banana 0.48752162
banana dog 0.28761008
banana cat 0.48752162
banana banana 1.0


In [17]:
print_similiary( "rocks mountains" )

rocks rocks 1.0
rocks mountains 0.4934799
mountains rocks 0.4934799
mountains mountains 1.0


In [19]:
print_similiary( "politician human" )

politician politician 1.0
politician human 0.27011883
human politician 0.27011883
human human 1.0
