# Texts as vectors




In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

import string

In [2]:
# Very customizable:
CountVectorizer??

In [3]:
corpus = [
 'This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?',
 '@user This one is a tweet #meta ;)' 
]

In [4]:
vectorizer = CountVectorizer()

In [5]:
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'meta' 'one' 'second' 'the' 'third' 'this'
 'tweet' 'user']


In [6]:
print(X.toarray())

[[0 1 1 1 0 0 0 1 0 1 0 0]
 [0 2 0 1 0 0 1 1 0 1 0 0]
 [1 0 0 1 0 1 0 1 1 1 0 0]
 [0 1 1 1 0 0 0 1 0 1 0 0]
 [0 0 0 1 1 1 0 0 0 1 1 1]]


### Customizing the Vectorizer

In [7]:
import nltk

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
tokenize_funct = tokenizer.tokenize
nltk.download('stopwords')
word_blacklist = stopwords.words('english') + list(string.punctuation)
vectorizer_tweet = CountVectorizer(tokenizer=tokenize_funct, stop_words=word_blacklist)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
X1 = vectorizer_tweet.fit_transform(corpus)
print(vectorizer_tweet.get_feature_names())

['#meta', ';)', 'document', 'first', 'one', 'second', 'third', 'tweet']




In [9]:
print(X1.toarray())

[[0 0 1 1 0 0 0 0]
 [0 0 2 0 0 1 0 0]
 [0 0 0 0 1 0 1 0]
 [0 0 1 1 0 0 0 0]
 [1 1 0 0 1 0 0 1]]


In [10]:
vectorizer_tweet.transform([corpus[0]]).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0]])

In [11]:
vectorizer_tweet.get_feature_names_out()

array(['#meta', ';)', 'document', 'first', 'one', 'second', 'third',
       'tweet'], dtype=object)

In [12]:
vectorizer_tweet.fit_transform([corpus[1]]).toarray()

array([[2, 1]])

In [13]:
vectorizer_tweet.fit(corpus)

  "The parameter 'token_pattern' will not be used"


CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7ff3b30104d0>>)

In [14]:
vectorizer_tweet.get_feature_names_out()

array(['#meta', ';)', 'document', 'first', 'one', 'second', 'third',
       'tweet'], dtype=object)

In [15]:
vectorizer_tweet.transform([corpus[1]]).toarray()

array([[0, 0, 2, 0, 0, 1, 0, 0]])

In [16]:
vectorizer2 = CountVectorizer(analyzer='char', ngram_range=(3, 5))
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names_out())

[' #m' ' #me' ' #met' ' ;)' ' a ' ' a t' ' a tw' ' do' ' doc' ' docu'
 ' fi' ' fir' ' firs' ' is' ' is ' ' is a' ' is t' ' on' ' one' ' one '
 ' one.' ' se' ' sec' ' seco' ' th' ' the' ' the ' ' thi' ' thir' ' this'
 ' tw' ' twe' ' twee' '#me' '#met' '#meta' '@us' '@use' '@user' 'a ;'
 'a ;)' 'a t' 'a tw' 'a twe' 'and' 'and ' 'and t' 'con' 'cond' 'cond '
 'cum' 'cume' 'cumen' 'd d' 'd do' 'd doc' 'd o' 'd on' 'd one' 'd t'
 'd th' 'd thi' 'doc' 'docu' 'docum' 'e f' 'e fi' 'e fir' 'e i' 'e is'
 'e is ' 'e s' 'e se' 'e sec' 'e t' 'e th' 'e thi' 'eco' 'econ' 'econd'
 'eet' 'eet ' 'eet #' 'ent' 'ent ' 'ent i' 'ent.' 'ent?' 'er ' 'er t'
 'er th' 'et ' 'et #' 'et #m' 'eta' 'eta ' 'eta ;' 'fir' 'firs' 'first'
 'he ' 'he f' 'he fi' 'he s' 'he se' 'he t' 'he th' 'hir' 'hird' 'hird '
 'his' 'his ' 'his d' 'his i' 'his o' 'his t' 'ird' 'ird ' 'ird o' 'irs'
 'irst' 'irst ' 'is ' 'is a' 'is a ' 'is d' 'is do' 'is i' 'is is' 'is o'
 'is on' 'is t' 'is th' 'men' 'ment' 'ment ' 'ment.' 'ment?' 'met' '

In [17]:
X2.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]])

In [18]:
vectorizer_tfidf = TfidfVectorizer(stop_words=word_blacklist)
X3 = vectorizer_tfidf.fit_transform(corpus)
print(vectorizer_tfidf.get_feature_names_out())

['document' 'first' 'meta' 'one' 'second' 'third' 'tweet' 'user']


In [19]:
X3.toarray()

array([[0.63871058, 0.76944707, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.80130969, 0.        , 0.        , 0.        , 0.59824977,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.77828292, 0.        , 0.        ],
       [0.63871058, 0.76944707, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.52335825, 0.42224214, 0.        ,
        0.        , 0.52335825, 0.52335825]])

Freq vs tf-idf

In [20]:
corpus2 = [
    "this is the best example in the world",
    "the cat sits on the mat",
    "the bee and the bird"
]

In [21]:
vectorizerf = CountVectorizer()
Xf = vectorizerf.fit_transform(corpus2)
print(vectorizerf.get_feature_names_out())

['and' 'bee' 'best' 'bird' 'cat' 'example' 'in' 'is' 'mat' 'on' 'sits'
 'the' 'this' 'world']


In [22]:
Xf.toarray()

array([[0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 2, 0, 0],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]])

In [23]:
vectorizertfidf = TfidfVectorizer()
Xtfidf = vectorizertfidf.fit_transform(corpus2)
print(vectorizertfidf.get_feature_names_out())

['and' 'bee' 'best' 'bird' 'cat' 'example' 'in' 'is' 'mat' 'on' 'sits'
 'the' 'this' 'world']


In [24]:
Xtfidf.toarray()

array([[0.        , 0.        , 0.36772387, 0.        , 0.        ,
        0.36772387, 0.36772387, 0.36772387, 0.        , 0.        ,
        0.        , 0.43436728, 0.36772387, 0.36772387],
       [0.        , 0.        , 0.        , 0.        , 0.4305185 ,
        0.        , 0.        , 0.        , 0.4305185 , 0.4305185 ,
        0.4305185 , 0.50854232, 0.        , 0.        ],
       [0.4769856 , 0.4769856 , 0.        , 0.4769856 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.56343076, 0.        , 0.        ]])

## Computing similarities¶

### A naive solution for semantic text similarity / mini search example

In [25]:
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
vectorizer_tfidf = TfidfVectorizer(stop_words=word_blacklist)
X3 = vectorizer_tfidf.fit_transform(corpus)

In [27]:
# query = 'looking for the first document'
query = 'first open the document'
Xquery = vectorizer_tfidf.transform([query])
# Xquery = vectorizer2.transform([query]) # can we use this further on to compute similarities?
print(Xquery.toarray())

[[0.63871058 0.76944707 0.         0.         0.         0.
  0.         0.        ]]


In [28]:
for i, d1 in enumerate(X3):
# for i, d1 in enumerate(X2): # does this work?
    print("Doc %d - query similarity: %f ('%s')" % (i, cosine_similarity(d1,Xquery), corpus[i]))

Doc 0 - query similarity: 1.000000 ('This is the first document.')
Doc 1 - query similarity: 0.511805 ('This document is the second document.')
Doc 2 - query similarity: 0.000000 ('And this is the third one.')
Doc 3 - query similarity: 1.000000 ('Is this the first document?')
Doc 4 - query similarity: 0.000000 ('@user This one is a tweet #meta ;)')


In [29]:
from scipy.spatial.distance import cdist
import numpy as np

dist = cdist(X3.toarray(), Xquery.toarray(), metric='cosine')
dist

array([[0.        ],
       [0.48819503],
       [1.        ],
       [0.        ],
       [1.        ]])

In [30]:
Xquery.toarray().shape

(1, 8)

In [31]:
import numpy as np
similarity_rank = np.argsort(dist, axis=0)
similarity_rank

array([[0],
       [3],
       [1],
       [2],
       [4]])

In [32]:
print("Searching for '%s'..." % query)
print("Ranked results:")
for i in similarity_rank.flatten():
    print("\t", corpus[i], dist[i])

Searching for 'first open the document'...
Ranked results:
	 This is the first document. [0.]
	 Is this the first document? [0.]
	 This document is the second document. [0.48819503]
	 And this is the third one. [1.]
	 @user This one is a tweet #meta ;) [1.]


More sophisticated solutions (later):

- word and sentence embeddings
  - doc2vec
  - Universal Sentence Encoder, BERT/GPT (hugging face)

# Exercises

### Stylistic similarity - a measure

Find two pieces of news on the internet, or download some texts from a NLTK corpus (e.g. 'shakespeare') and measure their similarity in terms of common word usage as done above - but this time, only consider the stopwords (and remove content words). Try to use a custom vectorizer.

For example, in the text "This is the first document" would be converted to the following list of tokens: 
``["this", "is", "the"]``

Is the resulted similarity higher for texts belonging to the same author?

In [33]:
nltk.download('shakespeare')

[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!


True

In [34]:
!python -m nltk.downloader

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [35]:
"""
 In this section, we will analyze two sections of text from two distinct 
 Shakespeare's plays to highlight how stopwords can serve with the task of 
 authorship identification.
"""
from nltk.corpus import shakespeare
shakespeare.fileids()

play1, play2 = shakespeare.xml('macbeth.xml'), shakespeare.xml('othello.xml')

In [36]:
for act in play1:
  print('%s: %s' % (act.tag, ' '.join(list(act.itertext()))))

TITLE: The Tragedy of Macbeth
PERSONAE: 
 Dramatis Personae 

 DUNCAN, king of Scotland. 

 
 MALCOLM 
 DONALBAIN 
 his sons. 
 


 
 MACBETH 
 BANQUO 
 generals of the king's army. 
 


 
 MACDUFF 
 LENNOX 
 ROSS 
 MENTEITH 
 ANGUS 
 CAITHNESS 
 noblemen of Scotland. 
 

 FLEANCE, son to Banquo. 
 SIWARD, Earl of Northumberland, general of the English forces. 
 YOUNG SIWARD, his son. 
 SEYTON, an officer attending on Macbeth. 
 Boy, son to Macduff.  
 An English Doctor.  
 A Scotch Doctor.  
 A Soldier. 
 A Porter. 
 An Old Man. 
 LADY MACBETH 
 LADY MACDUFF 
 Gentlewoman attending on Lady Macbeth.  
 HECATE 
 Three Witches. 
 Apparitions. 
 Lords, Gentlemen, Officers, Soldiers, Murderers, Attendants, and Messengers.  

SCNDESCR: SCENE  Scotland: England.
PLAYSUBT: MACBETH
ACT: ACT I 

 SCENE I.  A desert place. 
 Thunder and lightning. Enter three Witches 

 
 First Witch 
 When shall we three meet again 
 In thunder, lightning, or in rain? 
 

 
 Second Witch 
 When the hurlyburly's

In [37]:
for act in play2:
  print('%s: %s' % (act.tag, ' '.join(list(act.itertext()))))

TITLE: The Tragedy of Othello, the Moor of Venice
PERSONAE: 
 Dramatis Personae 

 DUKE OF VENICE 
 BRABANTIO, a senator. 
 Other Senators. 
 GRATIANO, brother to Brabantio. 
 LODOVICO, kinsman to Brabantio. 
 OTHELLO, a noble Moor in the service of the Venetian state. 
 CASSIO, his lieutenant. 
 IAGO, his ancient. 
 RODERIGO, a Venetian gentleman. 
 MONTANO, Othello's predecessor in the government of Cyprus. 
 Clown, servant to Othello.  
 DESDEMONA, daughter to Brabantio and wife to Othello. 
 EMILIA, wife to Iago. 
 BIANCA, mistress to Cassio. 
 Sailor, Messenger, Herald, Officers, Gentlemen, Musicians, and Attendants. 

SCNDESCR: SCENE  Venice: a Sea-port in Cyprus.
PLAYSUBT: OTHELLO
ACT: ACT I 

 SCENE I.  Venice. A street. 
 Enter RODERIGO and IAGO 

 
 RODERIGO 
 Tush! never tell me; I take it much unkindly 
 That thou, Iago, who hast had my purse 
 As if the strings were thine, shouldst know of this. 
 

 
 IAGO 
 'Sblood, but you will not hear me: 
 If ever I did dream of such

In [38]:
text1 = ' '.join(list(play1[4].itertext()))
print('First text: ', text1)

text2 = ' '.join(list(play2[4].itertext()))
print('Second text: ', text2)

First text:  ACT I 

 SCENE I.  A desert place. 
 Thunder and lightning. Enter three Witches 

 
 First Witch 
 When shall we three meet again 
 In thunder, lightning, or in rain? 
 

 
 Second Witch 
 When the hurlyburly's done, 
 When the battle's lost and won. 
 

 
 Third Witch 
 That will be ere the set of sun. 
 

 
 First Witch 
 Where the place? 
 

 
 Second Witch 
 Upon the heath. 
 

 
 Third Witch 
 There to meet with Macbeth. 
 

 
 First Witch 
 I come, Graymalkin! 
 

 
 Second Witch 
 Paddock calls. 
 

 
 Third Witch 
 Anon. 
 

 
 ALL 
 Fair is foul, and foul is fair: 
 Hover through the fog and filthy air. 
 


 Exeunt 
 

 SCENE II.  A camp near Forres. 
 Alarum within. Enter DUNCAN, MALCOLM, DONALBAIN,
LENNOX, with Attendants, meeting a bleeding Sergeant 

 
 DUNCAN 
 What bloody man is that? He can report, 
 As seemeth by his plight, of the revolt 
 The newest state. 
 

 
 MALCOLM 
 This is the sergeant 
 Who like a good and hardy soldier fought 
 'Gainst my capt

In [39]:
import nltk
nltk.download('punkt')

tokenized_text1 = nltk.tokenize.word_tokenize(text1, language='english')
tokenized_text2 = nltk.tokenize.word_tokenize(text2, language='english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
english_stopwords = stopwords.words('english')
english_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [41]:
import unicodedata

def normalize_tokens(tokenized_text, stopwords=[]):
  # Lowercasing
  tokens = [t.lower() for t in tokenized_text]
  # Remove punctuation
  tokens = [t for t in tokens if t not in string.punctuation]
  # Remove stopwords
  tokens = [t for t in tokens if t in stopwords]
  # Normalize
  tokens = [unicodedata.normalize('NFKD', t).encode('ascii', 'ignore').decode('utf-8', 'ignore') for t in tokens]

  return tokens

normalized_text1 = ' '.join(normalize_tokens(tokenized_text1, english_stopwords))
normalized_text2 = ' '.join(normalize_tokens(tokenized_text2, english_stopwords))

In [42]:
vectorizer_tfidf = TfidfVectorizer(strip_accents='ascii', stop_words=None)

vectorized_text1 = vectorizer_tfidf.fit_transform([normalized_text1])
vectorized_text2 = vectorizer_tfidf.transform([normalized_text2])

print('Cosine similarity: ', cosine_similarity(vectorized_text1.toarray(), vectorized_text2.toarray()).flatten()[0])

Cosine similarity:  0.950820578746952
