Data cleaning techniques for text: normalizing

In [129]:
import re
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, brown
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [62]:
# Raw sample strings with stray whitespace, periods and mixed casing
text_blocks = [
    "    this is text with whitespace. ",
    "   Periods and mUlItiplE cases. ",
    "   how can we Handle It?",
]

In [63]:
# strip leading/trailing whitespace from every entry of `text_blocks`
strip_whitespace_list = [block.strip() for block in text_blocks]

# remove periods from the stripped blocks
removed_and_stripped = [block.replace(".", "") for block in strip_whitespace_list]


# build capitalize function (note: it upper-cases every letter, not just the first)
def capitalize(string: str) -> str:
    """Return ``string`` converted to upper case."""
    uppercased = string.upper()
    return uppercased

# upper-case every cleaned block
cap_arr = list(map(capitalize, removed_and_stripped))

#replace function
def replace_with(string: str, letter: str) -> str:
    """Replace every ASCII letter in ``string`` with ``letter``.

    Args:
        string: Text to transform.
        letter: Replacement passed to ``re.sub`` for each matched letter.

    Returns:
        A copy of ``string`` in which each character in ``a-z`` / ``A-Z`` is
        replaced; digits, whitespace and punctuation are left untouched.
    """
    # The class must end at upper-case "Z": the range "A-z" would also match
    # the punctuation characters that sit between "Z" and "a" in ASCII.
    return re.sub(r"[a-zA-Z]", letter, string)

# substitute every letter of the upper-cased blocks with "Y"
rep_arr = [replace_with(text, "Y") for text in cap_arr]
rep_arr



['YYYY YY YYYY YYYY YYYYYYYYYY',
 'YYYYYYY YYY YYYYYYYYY YYYYY',
 'YYY YYY YY YYYYYY YY?']

In [64]:
# tokenize words from the first block
# (`word_tokenize` / `sent_tokenize` are imported by name in this notebook,
#  so they are called directly rather than through an `nltk.` prefix)
token_words = word_tokenize(text_blocks[0])

# join all blocks into a single string
sentences = "".join(text_blocks)
# tokenize sentences from the joined block
token_sentence = sent_tokenize(sentences)

In [65]:
# drop English stop words from the tokenized block
stop_words = stopwords.words('english')
[token for token in token_words if token not in stop_words]

['text', 'whitespace', '.']

In [66]:
# Porter stemmer: reduce each token to its stem
stemmer = PorterStemmer()
list(map(stemmer.stem, token_words))

['thi', 'is', 'text', 'with', 'whitespac', '.']

In [75]:
# part-of-speech tag the third upper-cased block
pos_data = cap_arr[2]
pos_tokens = word_tokenize(pos_data)
pos_tagged = pos_tag(pos_tokens)
pos_tagged

[('HOW', 'WRB'),
 ('CAN', 'MD'),
 ('WE', 'VB'),
 ('HANDLE', 'VB'),
 ('IT', 'NNP'),
 ('?', '.')]

In [78]:
#list tagger set
#nltk.help.upenn_tagset()

# keep only the words whose tag is in the wanted set
wanted_tags = ["WRB"]
[token for token, pos in pos_tagged if pos in wanted_tags]

['HOW']

In [96]:
# Convert those sentences into features for individual parts of speech
excerpt = ["This text could be from any source.",
          "It might be from a webppage, a tweet, or internal company docs.",
          "Or it could be a poem or equations or musical notation."]

# One sorted list of POS tags per sentence.
# `pos_tag` is imported by name in this notebook, so it is called directly
# rather than through an `nltk.` prefix.
tagged_excerpt = [
    sorted(tag for _, tag in pos_tag(word_tokenize(sentence)))
    for sentence in excerpt
]

tagged_excerpt

[['.', 'DT', 'DT', 'IN', 'MD', 'NN', 'NN', 'VB'],
 [',',
  ',',
  '.',
  'CC',
  'DT',
  'DT',
  'IN',
  'JJ',
  'MD',
  'NN',
  'NN',
  'NN',
  'NN',
  'PRP',
  'VB'],
 ['.', 'CC', 'CC', 'CC', 'DT', 'JJ', 'MD', 'NN', 'NN', 'NNS', 'PRP', 'VB']]

In [101]:
# turn the per-sentence tag lists into a binary feature matrix
multi_label_bin = MultiLabelBinarizer()
tag_features = multi_label_bin.fit_transform(tagged_excerpt)
# feature names learned during fitting
multi_label_bin.classes_

array([',', '.', 'CC', 'DT', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'PRP', 'VB'],
      dtype=object)

In [105]:
# Tagged sentences from the Brown corpus "news" category,
# used below to train the backoff n-gram tagger
brown_sentences = brown.tagged_sents(categories="news")
brown_sentences[0:2]

[[('The', 'AT'),
  ('Fulton', 'NP-TL'),
  ('County', 'NN-TL'),
  ('Grand', 'JJ-TL'),
  ('Jury', 'NN-TL'),
  ('said', 'VBD'),
  ('Friday', 'NR'),
  ('an', 'AT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NP$'),
  ('recent', 'JJ'),
  ('primary', 'NN'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'AT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'CS'),
  ('any', 'DTI'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'AT'),
  ('jury', 'NN'),
  ('further', 'RBR'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'NN'),
  ('presentments', 'NNS'),
  ('that', 'CS'),
  ('the', 'AT'),
  ('City', 'NN-TL'),
  ('Executive', 'JJ-TL'),
  ('Committee', 'NN-TL'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'HVD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'AT'),
  ('praise', 'NN'),
  ('and', 

In [111]:
# train and test set: the first `n_train` tagged sentences train the taggers,
# the remainder is held out for scoring
n_train = 4500
train = brown_sentences[:n_train]
test = brown_sentences[n_train:]

# creating backoff tagger
# First we take into account the previous two words using TrigramTagger;
# if two words are not present, we "back off" and take into account the tag of
# the previous one word using BigramTagger,
# and finally if that fails we only look at the word itself using UnigramTagger.
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# show accuracy on the held-out sentences.
# Newer NLTK releases provide `accuracy()` and deprecate `evaluate()`;
# use whichever this installation offers.
score_tagger = trigram.accuracy if hasattr(trigram, "accuracy") else trigram.evaluate
score_tagger(test)

0.8355054596688976

In [120]:
# Text as bag of words
numpy_text = np.array(["Economics is investing in resources and peoples ideas",
                      "Physics takes resources and ideas, to create marvels",
                      "Living in space is going to take economics and physics resources"])

count_vect = CountVectorizer()

# bag of words: fit the vocabulary and count terms per document
bow = count_vect.fit_transform(numpy_text)

# dense view of the counts (not rendered: a cell only displays its last expression)
bow.toarray()

# Feature names in column order. `get_feature_names()` was removed in
# scikit-learn 1.2, so read them from the fitted `vocabulary_`
# (term -> column index), which still yields a plain list.
sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)

['and',
 'create',
 'economics',
 'going',
 'ideas',
 'in',
 'investing',
 'is',
 'living',
 'marvels',
 'peoples',
 'physics',
 'resources',
 'space',
 'take',
 'takes',
 'to']

In [128]:
# CountVectorizer parameters customise the bag-of-words feature matrix:
# here unigrams and bigrams, English stop words, and a vocabulary
# restricted to the single term 'resources'
count_2gram = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    vocabulary=["resources"],
)
g2_bag = count_2gram.fit_transform(numpy_text)
g2_bag.toarray()

array([[1],
       [1],
       [1]])

In [134]:
# Goal: a bag of words in which each word is weighted by its importance to an observation.
# tf-idf compares the frequency of a word in one document with its frequency across all documents.
# Fit a TfidfVectorizer on the same three sentences and show the weights as a dense array.
features_feq = TfidfVectorizer()
feq_matrix = features_feq.fit_transform(numpy_text)
feq_matrix.toarray()

array([[0.26383484, 0.        , 0.33973539, 0.        , 0.33973539,
        0.33973539, 0.44671121, 0.33973539, 0.        , 0.        ,
        0.44671121, 0.        , 0.26383484, 0.        , 0.        ,
        0.        , 0.        ],
       [0.25339107, 0.42902838, 0.        , 0.        , 0.32628714,
        0.        , 0.        , 0.        , 0.        , 0.42902838,
        0.        , 0.32628714, 0.25339107, 0.        , 0.        ,
        0.42902838, 0.32628714],
       [0.21438498, 0.        , 0.2760597 , 0.36298532, 0.        ,
        0.2760597 , 0.        , 0.2760597 , 0.36298532, 0.        ,
        0.        , 0.2760597 , 0.21438498, 0.36298532, 0.36298532,
        0.        , 0.2760597 ]])

In [137]:
# show the fitted vocabulary: each term mapped to an integer index
features_feq.vocabulary_

{'economics': 2,
 'is': 7,
 'investing': 6,
 'in': 5,
 'resources': 12,
 'and': 0,
 'peoples': 10,
 'ideas': 4,
 'physics': 11,
 'takes': 15,
 'to': 16,
 'create': 1,
 'marvels': 9,
 'living': 8,
 'space': 13,
 'going': 3,
 'take': 14}