In [3]:
import nltk
import nltk.data
nltk.download('punkt')
from textblob import TextBlob
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
#nlp = spacy.load("en_core_web_lg")

#tokenizzazione

#text to tokenize
text = "This is a tokenize test"
from nltk.tokenize import word_tokenize
word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['This', 'is', 'a', 'tokenize', 'test']

In [4]:
#tokenizzazione con TextBlob
TextBlob(text).words

WordList(['This', 'is', 'a', 'tokenize', 'test'])

In [6]:
text = "S&P and NASDAQ are the two most popular indices in US"

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
text_tokens = word_tokenize(text)
tokens_without_sw= [word for word in text_tokens if not word in stop_words]

print(tokens_without_sw)

['S', '&', 'P', 'NASDAQ', 'two', 'popular', 'indices', 'US']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
text = "It's a Stemming testing"

parsed_text = word_tokenize(text)

# Initialize stemmer.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Stem each word.
[(word, stemmer.stem(word)) for i, word in enumerate(parsed_text)
 if word.lower() != stemmer.stem(parsed_text[i])]

[('Stemming', 'stem'), ('testing', 'test')]

In [9]:
text = "This world has a lot of faces "

from textblob import Word
nltk.download('wordnet')
parsed_data= TextBlob(text).words
[(word, word.lemmatize()) for i, word in enumerate(parsed_data)
 if word != parsed_data[i].lemmatize()]

[nltk_data] Downloading package wordnet to /root/nltk_data...


[('has', 'ha'), ('faces', 'face')]

In [12]:
text = 'Google is looking at buying U.K. startup for $1 billion'
nltk.download('averaged_perceptron_tagger')
TextBlob(text).tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Google', 'NNP'),
 ('is', 'VBZ'),
 ('looking', 'VBG'),
 ('at', 'IN'),
 ('buying', 'VBG'),
 ('U.K.', 'NNP'),
 ('startup', 'NN'),
 ('for', 'IN'),
 ('1', 'CD'),
 ('billion', 'CD')]

In [13]:
text = 'Google is looking at buying U.K. startup for $1 billion'

for entity in nlp(text).ents:
    print("Entity: ", entity.text)

Entity:  Google
Entity:  U.K.
Entity:  $1 billion


In [14]:
from spacy import displacy 
displacy.render(nlp(text), style="ent", jupyter = True)

In [16]:
text = 'Google is looking at buying U.K. startup for $1 billion'
doc = nlp(text)
import pandas as pd
pd.DataFrame([[t.text, t.is_stop, t.lemma_, t.pos_]
              for t in doc],
             columns=['Token', 'is_stop_word', 'lemma', 'POS'])

Unnamed: 0,Token,is_stop_word,lemma,POS
0,Google,False,Google,PROPN
1,is,True,be,AUX
2,looking,False,look,VERB
3,at,True,at,ADP
4,buying,False,buy,VERB
5,U.K.,False,U.K.,PROPN
6,startup,False,startup,NOUN
7,for,True,for,ADP
8,$,False,$,SYM
9,1,False,1,NUM


In [17]:
sentences = [
'The stock price of google jumps on the earning data today',
'Google plunge on China Data!'
]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
print( vectorizer.fit_transform(sentences).todense() )
print( vectorizer.vocabulary_ )

[[0 1 1 1 1 1 1 0 1 1 2 1]
 [1 1 0 1 0 0 1 1 0 0 0 0]]
{'the': 10, 'stock': 9, 'price': 8, 'of': 5, 'google': 3, 'jumps': 4, 'on': 6, 'earning': 2, 'data': 1, 'today': 11, 'plunge': 7, 'china': 0}


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
TFIDF = vectorizer.fit_transform(sentences)
print(vectorizer.get_feature_names_out()[-10:])
print(TFIDF.shape)
print(TFIDF.toarray())

['china' 'data' 'earning' 'google' 'jumps' 'plunge' 'price' 'stock'
 'today']
(2, 9)
[[0.         0.29017021 0.4078241  0.29017021 0.4078241  0.
  0.4078241  0.4078241  0.4078241 ]
 [0.57615236 0.40993715 0.         0.40993715 0.         0.57615236
  0.         0.         0.        ]]


In [20]:
doc = nlp("Apple orange cats dogs")
print("Vector representation of the sentence for first 10 features: \n",\
doc.vector[0:10])

Vector representation of the sentence for first 10 features: 
 [-0.43863457 -0.3626267  -0.20616335  1.0866606   0.44007337 -0.03703639
  0.7782439   0.55959123  0.21888082 -0.40239647]


In [31]:
from gensim.models import Word2Vec

sentences = [
['The','stock','price', 'of', 'Google', 'increases'],
['Google','plunge',' on','China',' Data!']]

# train model
model = Word2Vec(sentences, min_count=1)

# summarize the loaded model
words = list(model.wv.key_to_index)
print(words)
print(model.wv.get_vector('Google')[1:5])

['Google', ' Data!', 'China', ' on', 'plunge', 'increases', 'of', 'price', 'stock', 'The']
[ 0.00023643  0.00510335  0.00900927 -0.00930295]


In [32]:
sentences = [
'The stock price of google jumps on the earning data today',
'Google plunge on China Data!']
sentiment = (1, 0)
data = pd.DataFrame({'Sentence':sentences,
        'sentiment':sentiment})

# feature extraction
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(data['Sentence'])
X_train_vectorized = vect.transform(data['Sentence'])

# Running naive bayes model
from sklearn.naive_bayes import MultinomialNB
clfrNB = MultinomialNB(alpha=0.1)
clfrNB.fit(X_train_vectorized, data['sentiment'])

#Testing the model
preds = clfrNB.predict(vect.transform(['Apple price plunge',\
 'Amazon price jumps']))
preds

array([0, 1])

In [33]:
sentences = [
'The stock price of google jumps on the earning data today',
'Google plunge on China Data!'
]

#Getting the bag of words
from sklearn.decomposition import LatentDirichletAllocation
vect=CountVectorizer(ngram_range=(1, 1),stop_words='english')
sentences_vec=vect.fit_transform(sentences)

#Running LDA on the bag of words.
from sklearn.feature_extraction.text import CountVectorizer
lda=LatentDirichletAllocation(n_components=3)
lda.fit_transform(sentences_vec)

array([[0.04311114, 0.91377772, 0.04311114],
       [0.06869319, 0.86261362, 0.06869319]])