# Recurrent Neural Network

In this Notebook I will train a Recurrent Neural Network (RNN) on the articles of my watson data set. Then I let the RNN write a new article. 



In [17]:
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

In [35]:
import pandas as pd 
import numpy as np
import string
import nltk
import ipynb
import ipynb.fs.full.Classifier as cl#from https://github.com/ptnplanet/NLTK-Contributions/blob/master/ClassifierBasedGermanTagger/ClassifierBasedGermanTagger.py
import random
import pickle
import keras
import re
import gensim

## Load the Data

In [3]:
data = pd.read_csv("watson_schweiz.csv",sep = ";") 
display(data.head(5))

# filter no_author
data = data[-data['author'].str.contains("no_author")]

# filter authors <50 articles - in order to make 
data = data.groupby('author')
data = data.filter(lambda x: len(x) > 152).reset_index(drop = True)
display(data.groupby('author').count())

Unnamed: 0,title,author,date,nmbr_comments,themes,article
0,Tourismus-Professor pendelt mit Flugzeug zur A...,no_author,"28.03.19, 22:15 28.03.19, 22:40",19,"['Schweiz', 'Gesellschaft & Politik', 'Klima']","['Naaa, wie kommt ihr so zur Uni? Mit dem Fahr..."
1,no_title,no_author,no_date,no_comments,[],['\r\n\t\tMit deiner Anmeldung erklärst du dic...
2,Anstatt mit Bus und Zug fahren mehr Menschen m...,no_author,"28.03.19, 17:39",29,"['Schweiz', 'Gesellschaft & Politik', 'Mobilit...",['\nDer Ausbau des öffentlichen Verkehrs würde...
3,Über 80'000 Franken bei Online-Bank N26 geklau...,no_author,"28.03.19, 17:34",18,"['Digital', 'Schweiz', 'Datenschutz', 'Deutsch...",['\nDie gefeierte Online-Bank N26 verspielt ge...
4,Der Wolf ist zurück – was auch Städter wissen ...,no_author,"28.03.19, 16:19",45,"['Schweiz', 'Wissen', 'Aargau', 'Natur', 'Tier']",['\nDer gesetzliche Schutz des Wolfes wird der...


Unnamed: 0_level_0,title,date,nmbr_comments,themes,article
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jacqueline Büchi,155,155,155,155,155


In [4]:
# define a function to convert the string, which is actually a list, to a nice string.
def listtostring(l):
    import ast
    list1 = ast.literal_eval(l)
    list2 = " ".join(list1)
    list3 = list2.strip()
    return list3

In [5]:
articles = data["article"].apply(listtostring)

In [6]:
articles[2]

'Seit über 30 Jahren kämpft Christoph Blocher für eine Abschaffung der Sommerzeit. Nun könnte sein Wunsch in Erfüllung gehen – ausgerechnet dank seiner Erzfeindin, der EU.  Es war einmal, in den frühen 80er-Jahren, ein Zürcher Nationalrat. Er hiess Christoph Blocher und er hatte einen Traum: Die Sommerzeit sollte weg! Eilends lancierte er eine Volksinitiative. Allerdings brachten er und seine Mitstreiter der Zürcher SVP die nötigen Unterschriften nicht zusammen – das Vorhaben scheiterte im Sammelstadium. \nChristoph Blocher anno 1980. Bild: KEYSTONE In den darauffolgenden Jahren verschob sich der Fokus Blochers. Sein Kampf galt nun der Unabhängigkeit der Schweiz von der Europäischen Union. Indem er sich erfolgreich gegen den Beitritt zum Europäischen Wirtschaftsraum (EWR) zur Wehr setzte, avancierte er 1992 zur Ikone aller EU-Skeptiker. Den Kampf gegen die Sommerzeit führten derweil andere für ihn weiter. Insbesondere Yvette Estermann, SVP-Nationalrätin aus dem Kanton Luzern. Erstmals 

## Word2vec model for word embeddings

In [30]:
articles_split = articles.apply(lambda x: re.split("\.|\!|\\?",x))

In [33]:
# list again
texts = []
for i in articles_split: # loop in loop, since each sentence needs to be seperately added
    for s in i:
        texts.append(s)

In [50]:
sentences =  [gensim.utils.simple_preprocess(_) for _ in texts] # do some preprocessing

In [51]:
sg_ = 1 # the training algorithm. If sg=0, CBOW is used. Otherwise (sg=1), skip-gram is employed.
alg = 'CBOW' if sg_ == 0 else 'sg'
size_ = 10 #  the dimensionality of the feature vectors
window_ = 2 # the context size or the maximum distance between the current and predicted word


model_watson = Word2Vec(sentences, size=100, window=5, min_count=1, workers=8)
words = list(model_watson.wv.vocab.keys())
print(f"The number of words: {len(words)}")
print(f"The first 10 words in the vocabularies: {words[0:10]}")

TypeError: 'Tokenizer' object is not iterable

## prepare for RNN

In [52]:
# tokenizer object
tokenizer = keras.preprocessing.text.Tokenizer(num_words=None,
                                              filters = '#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                                              lower = True,
                                              split = " ")

In [53]:
# train tokenizer to text
tokenizer.fit_on_texts(articles)

In [54]:
sequences = tokenizer.texts_to_sequences(articles)

In [55]:
features = []
labels = []

training_length = 50

# Iterate through the sequences of tokens
for seq in sequences:

    # Create multiple training examples from each sequence
    for i in range(training_length, len(seq)):
        
        # Extract the features and label
        extract = seq[i - training_length:i + 1]

        # Set the features and label
        features.append(extract[:-1])
        labels.append(extract[-1])
        
features = np.array(features)

In [56]:
# number of words in vocabulary
num_words = len(tokenizer.index_word)+1
word_index = tokenizer.index_word


display(num_words)

# empty array for labels
label_array = np.zeros((len(features), num_words),dtype = np.int8)

# one hot encode labels
for example_index, word, in enumerate(labels):
    label_array[example_index, word]= 1
    
label_array.shape

26507

(137474, 26507)

In [59]:
# import gzip
# import shutil
# with gzip.open(r'C:\Users\gwehrm\Downloads\cc.de.300.vec.gz', 'rb') as f_in:
#     with open(r'C:\Users\gwehrm\Documents\Repos\watson_analysis\german_emb.txt', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)


  


KeyError: "word '1' not in vocabulary"

In [63]:
# New matrix to hold word embeddings
embedding_matrix = np.zeros((num_words, model_watson.vector_size))

for i, word in enumerate(word_index.keys()):
    # Look up the word embedding
    if isinstance(i, int):
        continue
    vector = model_watson[str(word)]

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector

In [64]:
# build RNN
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

model = Sequential()

# Embedding layer
model.add(
    Embedding(input_dim=num_words,
              input_length = training_length,
              output_dim=100,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(64, return_sequences=False, 
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(num_words, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

W1121 09:54:28.451590 20604 deprecation.py:323] From C:\Users\gwehrm\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\backend.py:3794: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [72]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint('models/model.h5', save_best_only = True, 
                             save_weights_only=False)]

In [None]:
# split train /test 

In [73]:
# train the model
history = model.fit(X_train,  y_train, 
                    batch_size=2048, epochs=150,
                    callbacks=callbacks,
                    validation_data=(X_valid, y_valid))

NameError: name 'X_train' is not defined