1. Extract a sample document and apply the following document preprocessing methods:
tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the documents by calculating Term Frequency and Inverse
Document Frequency (TF-IDF).

In [3]:
import nltk
import spacy

## 1. Tokenization

In [82]:
from nltk.tokenize import word_tokenize

# Sample sentence shared by the NLTK and spaCy cells below.
example_text = (
    'Mr. Peter just came from running and he was '
    'eating apple which is better for health'
)

# Split the sentence into word-level tokens with NLTK.
nltk_tokens = word_tokenize(example_text)
print('Tokens:', nltk_tokens, sep='\n')

Tokens:
['Mr.', 'Peter', 'just', 'came', 'from', 'running', 'and', 'he', 'was', 'eating', 'apple', 'which', 'is', 'better', 'for', 'health']


In [83]:
# One-time setup: uncomment and run to download the 'en_core_web_sm' pipeline
# that spacy.load() expects in the next cell.
# !python -m spacy download en_core_web_sm

In [84]:
# Load the small English pipeline and run it over the sample sentence;
# the resulting Doc is reused by the later spaCy cells.
model = spacy.load('en_core_web_sm')
spacy_tokens = model(example_text)

# Show each token wrapped in single quotes, all on one line.
for token in spacy_tokens:
    print(f"'{token.text}'", end=' ')

'Mr.' 'Peter' 'just' 'came' 'from' 'running' 'and' 'he' 'was' 'eating' 'apple' 'which' 'is' 'better' 'for' 'health' 

## 2. POS tagging

In [85]:
from nltk import pos_tag

# Tag every NLTK token with its part of speech and print the (token, tag) pairs.
print('Pos Tags:', pos_tag(nltk_tokens), sep='\n')

Pos Tags:
[('Mr.', 'NNP'), ('Peter', 'NNP'), ('just', 'RB'), ('came', 'VBD'), ('from', 'IN'), ('running', 'VBG'), ('and', 'CC'), ('he', 'PRP'), ('was', 'VBD'), ('eating', 'VBG'), ('apple', 'NN'), ('which', 'WDT'), ('is', 'VBZ'), ('better', 'RBR'), ('for', 'IN'), ('health', 'NN')]


In [86]:
# One line per token: surface text followed by spaCy's coarse POS label.
for token in spacy_tokens:
    print(token.text, token.pos_, sep=' --> ')

Mr. --> PROPN
Peter --> PROPN
just --> ADV
came --> VERB
from --> ADP
running --> VERB
and --> CCONJ
he --> PRON
was --> AUX
eating --> VERB
apple --> NOUN
which --> PRON
is --> AUX
better --> ADJ
for --> ADP
health --> NOUN


## 3. Stop Words Removal

In [87]:
from nltk.corpus import stopwords

# NLTK's English stop-word list is all lower-case.
stop_words = stopwords.words('english')
# Set copy for constant-time membership tests (the list form is kept for
# any code that still expects `stop_words` to be a list).
stop_word_set = set(stop_words)

# Compare case-insensitively so capitalised stop words (e.g. a sentence-initial
# "The") are removed too, matching the behaviour of spaCy's `token.is_stop`.
# The surviving tokens keep their original casing.
filtered_words = [word for word in nltk_tokens if word.lower() not in stop_word_set]
filtered_words

['Mr.', 'Peter', 'came', 'running', 'eating', 'apple', 'better', 'health']

In [88]:
# Print only the tokens spaCy does not flag as stop words, space-separated.
for token in spacy_tokens:
    if token.is_stop:
        continue
    print(token.text, end=' ')

Mr. Peter came running eating apple better health 

## 4. Stemming

In [94]:
from nltk.stem import PorterStemmer

# Reduce every NLTK token to its Porter stem and show the result as one sentence.
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, nltk_tokens))
print(' '.join(stemmed_words))

mr. peter just came from run and he wa eat appl which is better for health


## 5. Lemmatization

In [99]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every token with the same POS hint 'a' (adjective in WordNet),
# regardless of the token's actual part of speech.
lemmatized_words = []
for word in nltk_tokens:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='a'))
print(' '.join(lemmatized_words))

Mr. Peter just came from running and he was eating apple which is good for health


In [100]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Same tokens again, this time with the POS hint 'v' (verb in WordNet)
# applied to every token.
lemmatized_words = []
for word in nltk_tokens:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))
print(' '.join(lemmatized_words))

Mr. Peter just come from run and he be eat apple which be better for health


In [102]:
# spaCy assigns a lemma per token during the pipeline run; print them on one line.
for token in spacy_tokens:
    print(f'{token.lemma_} ', end='')

Mr. Peter just come from run and he be eat apple which be well for health 

## 6. TF-IDF

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()
corpus = ['There is difference between apple fruit and apple company',
          'An apple everyday keeps the doctor away']

# Fit on the corpus and convert the sparse result to a dense
# (n_documents, n_vocabulary_terms) array so rows can be indexed directly.
tfidf_weights = tfidfvec.fit_transform(corpus).toarray()

# Tokenize with the vectorizer's own analyzer (same lower-casing and token
# pattern used to build `vocabulary_`). Tokenizing with a different tokenizer
# can yield punctuation or one-character tokens that the vectorizer dropped,
# which would raise KeyError on the vocabulary lookup below.
analyze = tfidfvec.build_analyzer()

for i, doc in enumerate(corpus):
    print(f'Doc {i}:')
    # Terms are listed in document order, so a repeated term is printed once
    # per occurrence with the same weight.
    for word in analyze(doc):
        idx = tfidfvec.vocabulary_[word]
        print(word,'-->',tfidf_weights[i][idx])

Doc 0:
there --> 0.3328717774571481
is --> 0.3328717774571481
difference --> 0.3328717774571481
between --> 0.3328717774571481
apple --> 0.4736820224661604
fruit --> 0.3328717774571481
and --> 0.3328717774571481
apple --> 0.4736820224661604
company --> 0.3328717774571481
Doc 1:
an --> 0.3920440146223274
apple --> 0.2789425453258252
everyday --> 0.3920440146223274
keeps --> 0.3920440146223274
the --> 0.3920440146223274
doctor --> 0.3920440146223274
away --> 0.3920440146223274


In [122]:
# Dense TF-IDF vector for the second document (corpus[1]); tfidf_weights was
# converted with .toarray() when it was built.
# NOTE(review): the saved output below shows a sparse matrix and this cell's
# execution count precedes the TF-IDF cell's — looks stale; re-run to refresh.
tfidf_weights[1]

<1x16 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

## The same preprocessing using only spaCy

In [1]:
import spacy

In [2]:
# One-time setup: uncomment and run to download the 'en_core_web_sm' pipeline
# that spacy.load() expects below.
# !python -m spacy download en_core_web_sm

In [None]:
# Sample sentence processed by every cell in this section.
example_text = (
    'Mr. Peter just came from running and he was '
    'eating apple which is better for health'
)

# Load the small English pipeline and run it once; the cells below only read
# attributes from the resulting tokens.
model = spacy.load('en_core_web_sm')
tokens = model(example_text)

### tokenization

In [7]:
# Print the text of each token, space-separated on one line.
for token in tokens:
    print(token.text, end=' ')

Mr. Peter just came from running and he was eating apple which is better for health 

### pos tagging

In [8]:
# One line per token: its text and the coarse POS label spaCy assigned.
for token in tokens:
    print(f'{token.text} --> {token.pos_}')

Mr. --> PROPN
Peter --> PROPN
just --> ADV
came --> VERB
from --> ADP
running --> VERB
and --> CCONJ
he --> PRON
was --> AUX
eating --> VERB
apple --> NOUN
which --> PRON
is --> AUX
better --> ADJ
for --> ADP
health --> NOUN


### stop words removal

In [12]:
# Skip tokens flagged as stop words and print the rest on one line.
for token in tokens:
    if token.is_stop:
        continue
    print(token.text, end=' ')

Mr. Peter came running eating apple better health 

### lemmatization

In [14]:
# Print the lemma spaCy assigned to each token, space-separated.
for token in tokens:
    print(f'{token.lemma_} ', end='')

Mr. Peter just come from run and he be eat apple which be well for health 