In [2]:
import pandas as pd
import re
import nltk
from collections import defaultdict # Dictionaries with default values
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sddjl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sddjl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sddjl\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sddjl\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sddjl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
## importing data sets and dropping nan's
df = pd.read_csv('GFM_data.csv',sep = '\t')
df = df.loc[df['Text'].dropna().index]
df.head()

Unnamed: 0.1,Unnamed: 0,Url,Category,Position,Title,Location,Amount_Raised,Goal,Number_of_Donations,Length_of_Fundraising,FB_Shares,Number_of_Donors,Followers,Text
0,0,https://www.gofundme.com/f/justiceforjacobblake,Medical,0,Justice for Jacob Blake,"Kenosha, WI",2297930.0,3000000.0,73K,93 days 12:02:38.405126000,118K,72.5K,73.4K,On August 23rd my son was shot multiple times ...
1,0,https://www.gofundme.com/f/official-navajo-nat...,Medical,0,Official Navajo Nation COVID-19 Relief Fund,"Window Rock, AZ",1862040.0,1000000.0,22.5K,205 days 12:02:39.366241000,71.7K,21.9K,22K,\r\nThe Navajo Nation COVID-19 Fund has been e...
2,0,https://www.gofundme.com/f/help-a-front-line-n...,Medical,0,Help a front line nurse and baby get proper care,"Randolph, NJ",954793.0,1200000.0,19K,215 days 12:02:40.340314000,16.4K,18.3K,17.9K,"On Sunday, April 12, Sylvia Leroy, a pregnant ..."
3,0,https://www.gofundme.com/f/Tommy-Rivers-Rest-Up,Medical,1,"Rest up, Tommy, we'll see you soon","Scottsdale, AZ",673179.0,1000000.0,11.3K,131 days 12:02:41.464483000,21.3K,10.3K,10.4K,"First, thank you for being here. Tommy Rivers ..."
4,0,https://www.gofundme.com/f/brandon039s-medical...,Medical,1,OFFICIAL BRANDON SAENZ MEDICAL FUND,"Tyler, TX",570529.0,750000.0,24.7K,175 days 12:02:42.383091000,5.5K,24.3K,24.5K,My name is Melissa Green and I am the mother o...


In [4]:
df['Amount_Raised'].describe()

count    8.370000e+02
mean     1.157495e+05
std      3.218705e+05
min      6.370000e+02
25%      1.954500e+04
50%      5.305800e+04
75%      1.233970e+05
max      6.750030e+06
Name: Amount_Raised, dtype: float64

In [5]:
df = df[df['Amount_Raised'] > 123397] 

In [6]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

def extract_entities(text):
    names = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                names.append(' '.join(c[0] for c in chunk.leaves()))
    new_text = text
    for name in names:
        if name in text:
            new_text = new_text.replace(name, 'NLP')
    return new_text

def clean_text(x):
    ## removing names
    x = extract_entities(x)
    ## normalizing text by stripping white space and lower casing
    x =  x.lower().strip()
    ## removing urls
    x = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', x)
    ## removing phone numbers
    x = re.sub('\([0-9]{3}\)\s*[0-9]{3}-[0-9]{4}','',x)
    ## strip all non alphanumeric things
    x = re.sub('\n',' ',x)
    x = re.sub("[^a-zA-Z0-9 #]",'',x)
    x = re.sub("\s+",' ',x)
    text = x.replace('\n', ' ').lower()# lowercase text
    text = REPLACE_IP_ADDRESS.sub('', text) # remove ip address
    text = REPLACE_BY_SPACE_RE.sub(' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    
    return text

In [7]:
text = df['Text'][4]
new_text = clean_text(text)
df['Text'] = df['Text'].apply(clean_text)

In [8]:
df['Text'][0]

'august 23rd son shot multiple times back nlp police department officer son broke altercation unrelated party shooting left son critically injured fights life extent sons injuries unknown remain prayerful continues undergo multiple rounds operations save life fund established cover sons medical expenses mental grief counseling family assist family days come continue seek justice nlp portion proceeds also used benefit sons six children witnessed horrific act violence anyone wishing send cards letters encouragement andor contributions form money order check may mail nlp co nlp nlp 122 calhoun street tallahassee fl 32301 attn nlp'

In [9]:
file = ''
for text in df['Text']:
    file += text

Start Stack Abuse Example

In [10]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [11]:
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [12]:
# preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [13]:
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [14]:
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 268097
Total vocab: 37


In [15]:
seq_length = 100
x_data = []
y_data = []

In [16]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [17]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 267997


In [18]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [19]:
y = np_utils.to_categorical(y_data)

In [20]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [21]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [23]:
model.fit(X, y, epochs=10, batch_size=128, callbacks=desired_callbacks)

Epoch 1/10

Epoch 00001: loss improved from inf to 2.81351, saving model to model_weights_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.81351 to 2.42113, saving model to model_weights_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.42113 to 2.25868, saving model to model_weights_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.25868 to 2.15237, saving model to model_weights_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.15237 to 2.07512, saving model to model_weights_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.07512 to 2.01532, saving model to model_weights_saved.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.01532 to 1.96941, saving model to model_weights_saved.hdf5
Epoch 8/10

Epoch 00008: loss improved from 1.96941 to 1.92933, saving model to model_weights_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 1.92933 to 1.89759, saving model to model_weights_saved.hdf5
Epoch 10/10

Epoch 00010: loss improved from 1.89759 to 1.8

<keras.callbacks.callbacks.History at 0x19a2ced0a88>

In [24]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [25]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [26]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" sinesses phase 3 movie theaters allowed operate 25 capacity 3 screens even sell would equal 69 peopl "


In [27]:
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

e nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nlp nl