In [None]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Toy three-document corpus used throughout the notebook.
doc1, doc2, doc3 = (
    "Text fg Analytics is boring boring boring!!",
    "Analytics is interesting",
    "We want interesting sports analytics",
)

In [None]:
def clean_txt(sent):
    """Lower-case, tokenize and filter a sentence; return the kept tokens as one string.

    A token is dropped when it is an English stopword, a single punctuation
    character, or at most two characters long.

    Parameters
    ----------
    sent : str
        Raw sentence or document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none survive).

    Notes
    -----
    Uses NLTK's ``word_tokenize`` and ``stopwords`` corpus, so the matching NLTK
    data packages must already be available (e.g. installed via ``nltk.download``).
    """
    tokens = word_tokenize(sent.lower())
    # A set gives O(1) membership tests; a list would be scanned once per token.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    final_word = [term for term in tokens
                  if len(term) > 2 and term not in stop_updated]
    return " ".join(final_word)

In [None]:
# Clean the first document; the bare name on the last line displays the result.
doc1_clean = clean_txt(doc1)
doc1_clean

In [None]:
# Clean the second document and display it.
doc2_clean = clean_txt(doc2)
doc2_clean

In [None]:
# Clean the third document and display it.
doc3_clean = clean_txt(doc3)
doc3_clean

In [None]:
# One row per cleaned document, all held in a single "text" column.
doc = pd.DataFrame({"text": [doc1_clean, doc2_clean, doc3_clean]})
doc

In [None]:
#CountVectorizer of sklearn gives us the DTM
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Instantiating CountVectorizer
# ngram_range=(2,2): only bigrams (two consecutive tokens) become features.
count_vect = CountVectorizer(ngram_range =(2,2))

In [None]:
#Invoking fit_transform on the CountVectorizer object
# Learns the vocabulary from the cleaned "text" column and returns the
# document-term matrix in one step; the bare X displays its summary.
X = count_vect.fit_transform(doc['text'])
X

In [None]:
# Show the document-term matrix as a dense array (rows = documents).
X.toarray()

In [None]:
# Same counts via todense(), the alternative dense conversion.
X.todense()

- fit -> extract the distinct terms to form a bag of words (BOW)
- transform -> count the occurrences of every term from the BOW in each document

In [None]:
#Obtaining the BOW's
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps `bow` a plain Python list of strings.
bow = count_vect.get_feature_names_out().tolist()
bow

In [None]:
#Get the position of every word in the BOW, as a dictionary
# vocabulary_ maps each term to its column index in the document-term matrix.
print(count_vect.vocabulary_)

In [None]:
#To get the position of a particular word in BOW
# Here the key is the bigram "boring boring", since count_vect was built with ngram_range=(2,2).
count_vect.vocabulary_['boring boring']

In [None]:
#Printing the document term matrix can be done using toarray() or todense()
#methods
# toarray() variant.
X.toarray()

In [None]:
# todense() variant of the same document-term matrix.
X.todense()

In [None]:
# Label the document-term matrix columns with the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
DTM = pd.DataFrame(X.toarray(),columns = count_vect.get_feature_names_out())
DTM

In [None]:
# Same DTM, this time reusing the `bow` list as the column labels.
DTM = pd.DataFrame(X.toarray(),columns = bow)
DTM

In [None]:
#Term document matrix or TDM is the transpose of the DTM,
#which is used in finding similarity between words
# After .T the rows are terms and the columns are documents.
TDM = DTM.T
TDM

Converting newly seen documents with the already-fitted vectorizer (same vocabulary, same DTM columns)

In [None]:
# Two unseen documents; note they are passed as raw text, not through clean_txt.
ndoc = ["awesome paper is not always boring boring","is paper really text analytics"]

In [None]:
# transform() only (no fit): the new documents are encoded against the existing vocabulary.
count_vect.transform(ndoc)

In [None]:
# DTM of the unseen documents, with columns labelled by the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
DTMnd = pd.DataFrame(count_vect.transform(ndoc).toarray(),columns = count_vect.get_feature_names_out())
DTMnd

## ngram analysis
- unigram -> a token consists of exactly one word
- bigram -> a token consists of exactly two words
- trigram -> a token consists of exactly three words

### The next immediate word(s) are grouped with the current word to form a bigram or a tri-gram
- For example, suppose we have the string "how are you doing"
- unigram :{"how", "are", "you", "doing"}
- bigram: {"how are", "are you", "you doing"}
- trigram:{"how are you", "are you doing"}

### Any combination of unigrams, bigrams and trigrams can be obtained using a CountVectorizer

- By default the vectorizers in sklearn work on unigrams, i.e. ngram_range is set to (1,1)
- To extract bigrams, set ngram_range = (2,2)
- To extract trigrams, set ngram_range = (3,3)
- To extract both unigrams and bigrams, set ngram_range = (1,2)

In [None]:
# A token is a bigram when both its minimum and maximum length are 2 words,
# so the bigram vectorizer needs ngram_range=(2,2); (1,1) would yield unigrams
# and contradict the *_bg names used for this vectorizer and its outputs.
count_vect_bg = CountVectorizer(ngram_range=(2,2))

In [None]:
# Fit count_vect_bg on the cleaned corpus and keep the resulting count matrix.
X_bg = count_vect_bg.fit_transform(doc['text'])

In [None]:
# Re-display the cleaned documents for comparison with the features below.
doc['text']

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. tolist() keeps the printed output a plain list.
print(count_vect_bg.get_feature_names_out().tolist())

In [None]:
# DTM for count_vect_bg, columns labelled with its vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
DTM_bg = pd.DataFrame(X_bg.toarray(),columns=count_vect_bg.get_feature_names_out())
DTM_bg

In [None]:
# Column sums: total count of each feature across all documents.
DTM_bg.sum()

In [None]:
# Unigram vectorizer capped at 4 features via max_features.
# NOTE(review): the "_ubg" suffix hints at unigrams+bigrams (ngram_range=(1,2)),
# but the code requests unigrams only — confirm which was intended.
count_vect_ubg = CountVectorizer(ngram_range=(1,1),max_features = 4)

In [None]:
# Fit the feature-capped vectorizer on the cleaned corpus.
X_ubg = count_vect_ubg.fit_transform(doc['text'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. tolist() keeps the printed output a plain list.
print(count_vect_ubg.get_feature_names_out().tolist())

In [None]:
# DTM restricted to the features kept by max_features.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
DTM_ubg = pd.DataFrame(X_ubg.toarray(),
                      columns=count_vect_ubg.get_feature_names_out())
DTM_ubg

 ### Setting max features
 
 1. Vectorizers take another argument, `max_features`, which lets you keep only the top n features ranked by frequency across the corpus
 2. Taking a column sum of your DTM gives the total number of occurrences of each word across all the documents

In [None]:
#Keep at most the 6 most frequent unigram features in our DTM; frequency is
#measured across all the documents in the corpus (the column sum of each word in the DTM)
# NOTE(review): this comment previously said "top 3" while the code uses
# max_features=6 — confirm whether max_features=3 was intended.
count_nfeatures = CountVectorizer(max_features=6,ngram_range=(1,1))

In [None]:
# Fit the capped vectorizer, then display the cleaned documents for reference.
X_nfeatures = count_nfeatures.fit_transform(doc['text'])
doc['text']

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. tolist() keeps the displayed value a plain list.
count_nfeatures.get_feature_names_out().tolist()

In [None]:
# DTM limited to the features selected by max_features.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
DTM_nfeatures = pd.DataFrame(X_nfeatures.toarray(),
                            columns=count_nfeatures.get_feature_names_out())
DTM_nfeatures

In [None]:
# Exercise: build the DTM for all the trigrams using CountVectorizer(ngram_range=(3,3))

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer over unigrams, capped at 6 features.
tfidf_vect = TfidfVectorizer(ngram_range=(1,1),max_features = 6)

In [None]:
# Fit TF-IDF on the cleaned corpus and display the matrix summary.
# NOTE: this rebinds X, which held the CountVectorizer matrix in earlier cells;
# re-running those earlier display cells after this one shows TF-IDF weights instead.
X = tfidf_vect.fit_transform(doc['text'])
X

In [None]:
# Cleaned documents, shown again for comparison with the TF-IDF table.
doc['text']

In [None]:
# TF-IDF weights as a labelled table (rows = documents, columns = terms).
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tfidf = pd.DataFrame(X.toarray(), columns=tfidf_vect.get_feature_names_out())
tfidf

In [None]:
# Square every TF-IDF weight element-wise.
tfidf_square = tfidf.pow(2)
tfidf_square

In [None]:
# Cleaned documents once more, to read alongside the squared weights.
doc['text']

In [None]:
# Row-wise sum of the squared weights. TfidfVectorizer's default norm is 'l2',
# so each document row is expected to sum to 1.
tfidf_square.sum(axis=1)

# Something Extra

1.) Spotting common non-dictionary words

In [None]:
# Load the NLTK `words` corpus as the reference dictionary and show its size.
from nltk.corpus import words
word_list = words.words()
len(word_list)

In [None]:
# Sample review tokens mixing dictionary words with non-dictionary strings ("lyf" appears twice).
revlst = ["this","is","update","good","lyf","life","bdfgbvd","zebra","lyf"]

In [None]:
# Build the lookup set once: testing membership against word_list itself (a list)
# would scan the entire vocabulary for every review token.
known_words = set(word_list)
# Keep the review tokens absent from the dictionary, preserving order and repeats.
ndwords = [word for word in revlst if word not in known_words]
ndwords

2.) Regular expressions - some examples

In [None]:
# Sample sentence in which "Ropar" occurs twice.
import re
sentence = "IIT Ropar is best Ropar"

In [None]:
# Swap every occurrence of "Ropar" in the sentence for "Roorkee".
newsent = re.sub(pattern="Ropar", repl="Roorkee", string=sentence)
newsent

In [None]:
sentence = "IIT Ropar., is !@ best Ropar?? #$ 12345"
# Each run of characters that are neither word characters nor whitespace
# collapses into a single space.
non_word_run = re.compile(r"[^\w\s]+")
newsent = non_word_run.sub(" ", sentence)
newsent