# Learning Deep Learning
First try to use Keras
Based on

https://nlpforhackers.io/keras-intro

In [1]:
import pickle
import numpy as np
import spacy
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, LSTM
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [2]:
VALIDATION_SIZE = 500
np.random.seed(666)

In [3]:

def encodeY(Y):
    '''create one-hot (dummies) for output, see also https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
    encode class values as integers
    '''
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    dummy_y = np_utils.to_categorical(encoded_Y)
    return dummy_y

In [4]:
df = pickle.load(open('newsdataset/news_dataset_annotated.pkl', mode ='rb'))
# remove missing values
print(df['v9_major'].map(str.isdigit).value_counts())
df = df[df['v9_major'].map(str.isdigit)]
# remove niche categories
df = df[df['v9_major'].map(df.v9_major.value_counts()>150)]

X_train, X_test, y_train, y_test = train_test_split(df['text'], encodeY(df['v9_major'].map(int)), test_size = 0.2)
del df

True     11124
False     1435
Name: v9_major, dtype: int64


In [17]:
vectorizer = CountVectorizer(min_df=5, max_df=.9)
X_train_onehot = vectorizer.fit_transform(X_train)

In [15]:
# dit zou werken als y_train nog steeds een pandas-series was
# numberoflabels = len(y_train.unique())
# maar dat is het dankzij onze encodeY-functie niet meer.
# vandaar even een hele lelijke hack:
numberoflabels = len(set((str(e) for e in y_train.tolist())))
numberoflabels

19

In [19]:
model = Sequential()
model.add(Dense (units = 500, activation = 'relu', input_dim = len(vectorizer.get_feature_names())))
#model.add(Dense(units = 400, activation = 'sigmoid'))
#model.add(Dense(units = 300, activation = 'sigmoid'))
model.add(Dense(units = 200, activation = 'sigmoid'))
# model.add(Dense(units = 100, activation = 'sigmoid'))
model.add(Dense(units = numberoflabels, activation = 'sigmoid'))

In [20]:
model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 500)               11062000  
_________________________________________________________________
dense_4 (Dense)              (None, 200)               100200    
_________________________________________________________________
dense_5 (Dense)              (None, 19)                3819      
Total params: 11,166,019
Trainable params: 11,166,019
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.fit(X_train_onehot[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=3, verbose=1,
          validation_data=(X_train_onehot[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))

Train on 7712 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f337744cc18>

In [22]:
scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose = True)
print("Accuracy: {}".format(scores[1]))

Accuracy: 0.601558694622311


## Convolutional Network

In [11]:
word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes

X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_train] 

In [12]:
# Compute the max lenght of a text
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)
 
N_FEATURES = len(vectorizer.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)
print(X_train_sequences[0])
 

MAX_SEQ_LENGHT= 3197
[22123 22123 22123 ... 16975  7401 16977]


In [13]:
model = Sequential()
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=numberoflabels, activation='sigmoid'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3197, 64)          1415936   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 3193, 64)          20544     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 638, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 40832)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                2613312   
_________________________________________________________________
dense_5 (Dense)              (None, 19)                1235      
Total params: 4,051,027
Trainable params: 4,051,027
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_train_sequences[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=3, verbose=True,
          validation_data=(X_train_sequences[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))
 

Train on 7712 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) 
 

# LSTM

In [None]:
model = Sequential()
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(LSTM(64))
model.add(Dense(units=numberoflabels, activation='sigmoid'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
 

In [None]:
model.fit(X_train_sequences[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))
 

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1])
 

# pretrained embeddings

bla
bla

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html


In [6]:
embeddings = np.load(open('aem/w2v_320d_AEM_corpus_2000-01-01_2017-12-31', mode='rb'))

In [26]:
embeddings.corpus_total_words

1657264089

In [31]:
len(embeddings.wv.word_vec('morgen'))

320

In [8]:
embeddings.vector_size

320

https://adventuresinmachinelearning.com/gensim-word2vec-tutorial/

In [7]:
embedding_matrix = np.zeros((len(embeddings.wv.vocab), embeddings.vector_size))
for i in range(len(embeddings.wv.vocab)):
    embedding_vector = embeddings.wv[embeddings.wv.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
len(embeddings.wv.vocab)

1055100

In [10]:
embeddings.vector_size

320

In [11]:
del embeddings

In [14]:
model = Sequential()


#model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
#                    64,  # Embedding size
#                    input_length=MAX_SEQ_LENGHT))

#model.add(embeddings.wv.get_keras_embedding())

model.add(Embedding(input_dim = 1055100, 
                    weights=[embedding_matrix], 
                    output_dim=320, trainable=False,
                    input_shape= ()))

#model.add(Conv1D(320, 5, activation='relu'))
#model.add(MaxPooling1D(5))
#model.add(Flatten())
#model.add(Dense(units=64, activation='relu'))

model.add(Dense(units = 200, activation = 'sigmoid'))


model.add(Dense(units=numberoflabels, activation='sigmoid'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

NameError: name 'numberoflabels' is not defined

In [17]:
Embedding?

In [10]:
model = Sequential()
model.add(embeddings.wv.get_keras_embedding())
# model.add(Dense (units = 500, activation = 'relu', input_dim = len(vectorizer.get_feature_names())))
#model.add(Dense(units = 400, activation = 'sigmoid'))
#model.add(Dense(units = 300, activation = 'sigmoid'))
model.add(Dense(units = 200, activation = 'sigmoid'))
# model.add(Dense(units = 100, activation = 'sigmoid'))
model.add(Dense(units = numberoflabels, activation = 'sigmoid'))

In [11]:
model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 320)         337632000 
_________________________________________________________________
dense_1 (Dense)              (None, None, 200)         64200     
_________________________________________________________________
dense_2 (Dense)              (None, None, 19)          3819      
Total params: 337,700,019
Trainable params: 68,019
Non-trainable params: 337,632,000
_________________________________________________________________


In [12]:
model.fit(X_train_onehot[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=3, verbose=1,
          validation_data=(X_train_onehot[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))

ValueError: Error when checking target: expected dense_2 to have 3 dimensions, but got array with shape (7712, 19)

In [None]:
scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose = True)
print("Accuracy: {}".format(scores[1]))