In [None]:
# IMPORT SPACY

# We use spaCy's default English pipeline, which is loaded by its package name “en_core_web_sm”.

In [None]:
import spacy
# Name of the spaCy pipeline package to load; it must already be installed.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [None]:
# WORD TOKENIZE

# Tokenize the text into words, i.e. break each sentence into its individual tokens.

In [None]:
# Sample passage that the rest of the notebook analyses.
text = """Most of the outlay will be at home. No surprise there, either. While Samsung has expanded overseas, South Korea is still host to most of its factories and research engineers. """
# Run the pipeline once; the cells below reuse the resulting `doc`.
doc = nlp(text)
print(doc)

# One string per token, punctuation marks included.
words = [tok.text for tok in doc]

print(words)

Most of the outlay will be at home. No surprise there, either. While Samsung has expanded overseas, South Korea is still host to most of its factories and research engineers. 
['Most', 'of', 'the', 'outlay', 'will', 'be', 'at', 'home', '.', 'No', 'surprise', 'there', ',', 'either', '.', 'While', 'Samsung', 'has', 'expanded', 'overseas', ',', 'South', 'Korea', 'is', 'still', 'host', 'to', 'most', 'of', 'its', 'factories', 'and', 'research', 'engineers', '.']


In [None]:
# SENTENCE TOKENIZE

# Tokenize into sentences: if the text contains more than one sentence, split it into a list of sentences.

In [None]:
# Materialize the sentence iterator into a list and display it as the cell result.
sentences = list(doc.sents)
sentences

[Most of the outlay will be at home.,
 No surprise there, either.,
 While Samsung has expanded overseas, South Korea is still host to most of its factories and research engineers.]

In [None]:
# STOP WORDS REMOVAL
# Remove stop words (is, the, a, etc.) and punctuation using spaCy's
# per-token flags, since they carry little information on their own.

In [None]:
# Keep only the tokens that are neither stop words nor punctuation.
# The boolean flags are tested directly rather than compared against True.
# NOTE: this rebinds `words`; the word-frequency cell further down counts
# this filtered list, not the full token list built earlier.
words = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct)
]
print(words)

['outlay', 'home', 'surprise', 'Samsung', 'expanded', 'overseas', 'South', 'Korea', 'host', 'factories', 'research', 'engineers']


In [None]:
# Lemma

# Lemmatize the text to reduce each word to its base (dictionary) form, e.g. factories -> factory, has -> have.

In [None]:
# Print every token next to the lemma the pipeline assigned to it.
for token in doc:
    print(f"{token} --> {token.lemma_}")

Most --> Most
of --> of
the --> the
outlay --> outlay
will --> will
be --> be
at --> at
home --> home
. --> .
No --> no
surprise --> surprise
there --> there
, --> ,
either --> either
. --> .
While --> while
Samsung --> Samsung
has --> have
expanded --> expand
overseas --> overseas
, --> ,
South --> South
Korea --> Korea
is --> be
still --> still
host --> host
to --> to
most --> most
of --> of
its --> its
factories --> factory
and --> and
research --> research
engineers --> engineer
. --> .


In [None]:
# Get word frequency
# Count word occurrences with collections.Counter. Word frequency helps us gauge how important a word is in the document
# by showing how many times it is used.

In [None]:
from collections import Counter

In [None]:
# Tally how often each entry of `words` occurs.
word_freq = Counter(words)
# Take the five highest counts as (word, count) pairs and print them.
common_words = word_freq.most_common(5)
print(common_words)

[('outlay', 1), ('home', 1), ('surprise', 1), ('Samsung', 1), ('expanded', 1)]


In [None]:
# POS tags

# Part-of-speech (POS) tags tell us the grammatical category of each word, e.g. whether it is a noun, adjective, etc.

In [None]:
# Print every token next to its coarse part-of-speech tag.
for w in doc:
    print(f"{w} --> {w.pos_}")

Most --> ADJ
of --> ADP
the --> DET
outlay --> NOUN
will --> AUX
be --> AUX
at --> ADP
home --> NOUN
. --> PUNCT
No --> DET
surprise --> NOUN
there --> ADV
, --> PUNCT
either --> ADV
. --> PUNCT
While --> SCONJ
Samsung --> PROPN
has --> AUX
expanded --> VERB
overseas --> ADV
, --> PUNCT
South --> PROPN
Korea --> PROPN
is --> AUX
still --> ADV
host --> NOUN
to --> ADP
most --> ADJ
of --> ADP
its --> PRON
factories --> NOUN
and --> CCONJ
research --> NOUN
engineers --> NOUN
. --> PUNCT
