# I. Text Segmentation

Text Segmentation is the process of transforming text into meaningful units. These units can be words, sentences or different topics. 

In [5]:
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
# Download the "punkt" tokenizer package (the log below shows tokenizers/punkt.zip being unpacked).
nltk.download("punkt")

# Sample paragraph used by the sentence- and word-tokenization cells that follow.
text = (
    "CODE is founded by Mr. Bachem. "
    "Studying at CODE will be unlike any other higher education experience. "
    "Our intensive, interdisciplinary bachelor’s programs are designed to "
    "dramatically improve the way you work and to prepare you for the reality "
    "of tomorrow’s workplace."
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manishanker.talusani/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
# split it into sentences
# sent_tokenize returns a list of sentence strings; in the output below the
# period after "Mr." is not treated as a sentence boundary.
print(sent_tokenize(text))

['CODE is founded by Mr. Bachem.', 'Studying at CODE will be unlike any other higher education experience.', 'Our intensive, interdisciplinary bachelor’s programs are designed to dramatically improve the way you work and to prepare you for the reality of tomorrow’s workplace.']


In [7]:
# split into words
# Punctuation comes back as separate tokens, and the curly apostrophe in
# "bachelor’s" is split into its own token (see the output below).
print(word_tokenize(text))

['CODE', 'is', 'founded', 'by', 'Mr.', 'Bachem', '.', 'Studying', 'at', 'CODE', 'will', 'be', 'unlike', 'any', 'other', 'higher', 'education', 'experience', '.', 'Our', 'intensive', ',', 'interdisciplinary', 'bachelor', '’', 's', 'programs', 'are', 'designed', 'to', 'dramatically', 'improve', 'the', 'way', 'you', 'work', 'and', 'to', 'prepare', 'you', 'for', 'the', 'reality', 'of', 'tomorrow', '’', 's', 'workplace', '.']


In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Short sample string (the spelling inside the literal is left exactly as written).
text = "beneath the extraodrinary staircase..."

# Sentence-split the sample; the list is stored in `tokenize` and not displayed by this cell.
tokenize = sent_tokenize(text)


# II. Stop Words & Word Segmentation
Natural language also contains many words that carry little meaning on their own, which are referred to as "stop words". Since we don't want these words to extend our processing time or take up unnecessary space in our database, we will remove them. 

In [9]:
# Removing stop words from text


# Download the stop-word corpus (the log below shows corpora/stopwords.zip being unpacked).
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

# The sample paragraph again, this time as a triple-quoted string, so the value
# contains the line breaks written here.
text = """CODE is founded by Mr. Bachem. Studying at CODE will be unlike 
any other higher education experience. Our intensive, interdisciplinary 
bachelor’s programs are designed to dramatically improve the way you work 
and to prepare you for the reality of tomorrow’s workplace."""

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manishanker.talusani/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Break the paragraph into word and punctuation tokens.
tokens = word_tokenize(text)

# English stop words, held in a set for the membership checks in the filtering cell.
stop_words = set(stopwords.words("english"))

In [11]:
# filter the text for stop words
# Keep every token that is not in the stop-word set. The comparison is
# case-sensitive, which is why a capitalised token such as "Our" survives in
# the filtered output shown further down.
# (The list used to be built twice: once by a comprehension and then again by
# an explicit loop that discarded the first result. One pass is enough.)
filtered_sentence = [w for w in tokens if w not in stop_words]

In [12]:
# show just the tokenized text
# (unfiltered: every token produced by word_tokenize, stop words included)
print(tokens)

['CODE', 'is', 'founded', 'by', 'Mr.', 'Bachem', '.', 'Studying', 'at', 'CODE', 'will', 'be', 'unlike', 'any', 'other', 'higher', 'education', 'experience', '.', 'Our', 'intensive', ',', 'interdisciplinary', 'bachelor', '’', 's', 'programs', 'are', 'designed', 'to', 'dramatically', 'improve', 'the', 'way', 'you', 'work', 'and', 'to', 'prepare', 'you', 'for', 'the', 'reality', 'of', 'tomorrow', '’', 's', 'workplace', '.']


In [13]:
# show filtered tokenized text
# (stop words removed; punctuation tokens and capitalised words such as "Our" remain)
print(filtered_sentence)

['CODE', 'founded', 'Mr.', 'Bachem', '.', 'Studying', 'CODE', 'unlike', 'higher', 'education', 'experience', '.', 'Our', 'intensive', ',', 'interdisciplinary', 'bachelor', '’', 'programs', 'designed', 'dramatically', 'improve', 'way', 'work', 'prepare', 'reality', 'tomorrow', '’', 'workplace', '.']


# III. Stemming 

In [14]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Single Porter stemmer instance, reused by the stemming cells below.
ps = PorterStemmer()

#### Stemming single words:

In [15]:
# Three related word forms used to demonstrate the stemmer.
example_words = [
    "ride",
    "riding",
    "rider",
]

In [16]:
# Print the Porter stem of each example word, one per line.
for word in example_words:
    stem = ps.stem(word)
    print(stem)

ride
ride
rider


#### Stemming sentences:

In [17]:
# Two-line sample sentence for the stemming demo; the line break is part of the value.
new_text = (
    "CODE is a newly founded private university of applied sciences that is embedded into the vibrant \n"
    "network of Berlin's digital economy."
)

In [18]:
# Tokenize the sample text, then print the Porter stem of every token on its own line.
words = word_tokenize(new_text)

for token in words:
    print(ps.stem(token))

code
is
a
newli
found
privat
univers
of
appli
scienc
that
is
embed
into
the
vibrant
network
of
berlin
's
digit
economi
.


# IV. Parsing (Speech Tagging & Chunking)

## 1. Speech Tagging

Part-of-speech tagging (called "speech tagging" here) in NLTK is the process of labeling the words in a sentence as nouns, adjectives, verbs and more. 

Fortunately, NLTK provides us with a sentence tokenizer called the "PunktSentenceTokenizer", which is an unsupervised ML algorithm that can be trained on any text corpus you wish. 

In [19]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

In [20]:
# Resources for this section: the Gutenberg corpus and the averaged-perceptron tagger.
nltk.download("gutenberg")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import gutenberg

# Two Chesterton texts: `train` is used to train the sentence tokenizer,
# `test` is the text that gets tokenized and tagged.
train = gutenberg.raw("chesterton-brown.txt")
test = gutenberg.raw("chesterton-ball.txt")

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/manishanker.talusani/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/manishanker.talusani/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [21]:
# train tokenizer
# The training text (`train`) is passed straight to the constructor.
custom_sent_tokenizer = PunktSentenceTokenizer(train)
# tokenize chesterton ball
# `tokenized` is the sequence of sentences that the tagging cell below iterates over.
tokenized = custom_sent_tokenizer.tokenize(test)

In [22]:
def tag_text(sentences=None, limit=7):
    """Print part-of-speech tags for the first ``limit`` sentences.

    Each sentence is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; the resulting list of ``(word, tag)`` pairs is printed,
    one sentence per line.

    Args:
        sentences: Sequence of sentence strings to tag. When omitted, the
            notebook-level ``tokenized`` list is used, so ``tag_text()``
            behaves as it always did.
        limit: How many leading sentences to tag (default 7).

    Returns:
        None. The tagged sentences are printed.
    """
    if sentences is None:
        sentences = tokenized

    try:
        for sentence in sentences[:limit]:
            actual_words = nltk.word_tokenize(sentence)
            print(nltk.pos_tag(actual_words))
    except LookupError as e:
        # NLTK raises LookupError when a required resource (tokenizer or tagger
        # model) has not been downloaded; report it the way the cell always did.
        # Any other exception now propagates with its traceback instead of being
        # reduced to a one-line message.
        print(str(e))

# Tag and print the first seven sentences of the tokenized text.
tag_text()

[('[', 'IN'), ('The', 'DT'), ('Ball', 'NNP'), ('and', 'CC'), ('The', 'DT'), ('Cross', 'NNP'), ('by', 'IN'), ('G.K', 'NNP'), ('.', '.')]
[('Chesterton', 'NNP'), ('1909', 'CD'), (']', 'NN'), ('I', 'PRP'), ('.', '.')]
[('A', 'DT'), ('DISCUSSION', 'NNP'), ('SOMEWHAT', 'NNP'), ('IN', 'NNP'), ('THE', 'NNP'), ('AIR', 'NNP'), ('The', 'DT'), ('flying', 'VBG'), ('ship', 'NN'), ('of', 'IN'), ('Professor', 'NNP'), ('Lucifer', 'NNP'), ('sang', 'VBD'), ('through', 'IN'), ('the', 'DT'), ('skies', 'NNS'), ('like', 'IN'), ('a', 'DT'), ('silver', 'NN'), ('arrow', 'NN'), (';', ':'), ('the', 'DT'), ('bleak', 'JJ'), ('white', 'JJ'), ('steel', 'NN'), ('of', 'IN'), ('it', 'PRP'), (',', ','), ('gleaming', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bleak', 'JJ'), ('blue', 'JJ'), ('emptiness', 'NN'), ('of', 'IN'), ('the', 'DT'), ('evening', 'NN'), ('.', '.')]
[('That', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('far', 'RB'), ('above', 'IN'), ('the', 'DT'), ('earth', 'NN'), ('was', 'VBD'), ('no', 'DT'), ('expression', '

## 2. Chunking Text

Chunking is the process of grouping words into more meaningful chunks than just the individual speech tags. These can be things such as "noun phrases" or "verb phrases". With chunking you can get a parse tree.

We will search for chunks that correspond to individual noun phrases. 

In [23]:
# Hand-tagged (word, POS tag) pairs, so this example does not need to run a tagger.
text = [
    ("the", "DT"),
    ("huge", "JJ"),
    ("german", "JJ"),
    ("Rottweiler", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

In [24]:
# define a noun-phrase as:
# np = determiner + adjective + singular noun
# More precisely, the pattern reads: an optional determiner (<DT>?), any number
# of adjectives (<JJ>*), followed by one token tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}" 

# apply grammar to regexparser
cp = nltk.RegexpParser(grammar)

# do the chunking
# `text` is the pre-tagged list of (word, tag) pairs; the printed result is a
# tree whose NP subtrees are the matched chunks (see the output below).
result = cp.parse(text) 
print(result)

(S
  (NP the/DT huge/JJ german/JJ Rottweiler/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


# V. Sentiment Analysis using Keras

Blogpost:
https://towardsdatascience.com/how-to-build-a-neural-network-with-keras-e8faa33d0ae4

In [28]:
!pip install tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/d5/1c/3ac472009a5c54ae7ec5a3294520ca36d1908cd1e5cf3e3fd923f9b7b31f/tensorflow-1.13.1-cp37-cp37m-macosx_10_11_x86_64.whl (73.6MB)
[K    100% |████████████████████████████████| 73.6MB 390kB/s ta 0:00:011    94% |██████████████████████████████▏ | 69.3MB 12.0MB/s eta 0:00:01
[?25hCollecting absl-py>=0.1.6 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/da/3f/9b0355080b81b15ba6a9ffcf1f5ea39e307a2778b2f2dc8694724e8abd5b/absl-py-0.7.1.tar.gz (99kB)
[K    100% |████████████████████████████████| 102kB 10.0MB/s a 0:00:01
Collecting gast>=0.2.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting astor>=0.6.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/35/6b/11530768cac581a12952a2aad00e1526b89d242d0b9f59534ef6e6a1752f/astor-0.7.1-py2.py3-none-

In [29]:
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from keras.datasets import imdb
# Load the IMDB reviews with the vocabulary capped via num_words=10000.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(
    num_words=10000
)

# Merge the provided train/test parts into one array each; the notebook makes
# its own split further down.
data = np.concatenate([training_data, testing_data])
targets = np.concatenate([training_targets, testing_targets])
def vectorize(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: Sized iterable whose items are sequences of integer indices.
        dimension: Width of each output row (default 10000); every index must
            be smaller than this value.

    Returns:
        A ``(len(sequences), dimension)`` NumPy array of zeros in which row
        ``r`` holds 1 at every index that occurs in ``sequences[r]``.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, indices in enumerate(sequences):
        # Fancy indexing sets all listed columns at once; repeated indices are harmless.
        encoded[row, indices] = 1
    return encoded
 
# Multi-hot encode every review and store the labels as float32.
data = vectorize(data)
targets = np.array(targets).astype("float32")

# The first 10,000 reviews are held out for validation; the rest are used for training.
test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

model = models.Sequential()
# Input - Layer
model.add(layers.Dense(50, activation="relu", input_shape=(10000,)))
# Hidden - Layers
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation="relu"))
# Output- Layer (single sigmoid unit for the binary label)
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

# compiling the model
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

results = model.fit(
    train_x, train_y,
    epochs=2,
    batch_size=500,
    validation_data=(test_x, test_y),
)

# Depending on the Keras version, the validation accuracy is stored under
# "val_acc" or "val_accuracy"; pick whichever key is present so the lookup
# does not fail with a KeyError.
history = results.history
val_acc_key = "val_acc" if "val_acc" in history else "val_accuracy"
# The reported figure is the mean validation accuracy over all epochs.
print("Test-Accuracy:", np.mean(history[val_acc_key]))

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________

### Example of an IMDB Review:

In [30]:
from keras.datasets import imdb

# Reload the raw integer-encoded reviews: the training cell replaced `data`
# with its multi-hot encoding, and decoding needs the original index sequences.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(
    num_words=10000
)
data = np.concatenate([training_data, testing_data])
targets = np.concatenate([training_targets, testing_targets])

In [31]:
# get_word_index() maps word -> index; invert it to look words up by index.
index = imdb.get_word_index()
reverse_index = {value: key for key, value in index.items()}

# Review indices are looked up with an offset of 3 (i - 3); any index without
# a word in the mapping is rendered as "#".
decoded = " ".join(reverse_index.get(i - 3, "#") for i in data[0])
print(decoded)

# Derive the wording from the label itself instead of hard-coding "positive"
# (1 is positive, anything else is reported as negative).
sentiment = "positive" if targets[0] == 1 else "negative"
print("Label:", targets[0], ", which means", sentiment + ".")

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but thes