In [0]:
import os
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks/DS8013 Deep Learning/Project')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Conv1D, MaxPool1D, Flatten, Dense, Dropout
from keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support


Using TensorFlow backend.


In [0]:
# set seed
np.random.seed(42)

In [0]:
# load and view data
df_blogs = pd.read_csv('blogtext.csv')

df_blogs.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [0]:
# take post contents and user id: the X and y for this task
df_Xy = df_blogs[['text', 'id']]

df_Xy.head()

Unnamed: 0,text,id
0,"Info has been found (+/- 100 pages,...",2059027
1,These are the team members: Drewe...,2059027
2,In het kader van kernfusie op aarde...,2059027
3,testing!!! testing!!!,2059027
4,Thanks to Yahoo!'s Toolbar I can ...,3581210


In [0]:
# find the 10 users with the most blog posts
top_10 = df_Xy['id'].value_counts()[:10]

top_10

449628     4221
734562     2301
589736     2294
1975546    2261
958176     2244
1107146    2237
303162     2114
942828     2068
1270648    1951
1784456    1843
Name: id, dtype: int64

In [0]:
# total number of posts
top_10.sum()

23534

In [0]:
# save the user id of the top 10 users
top_id = list(top_10.index)

print(top_id)

[449628, 734562, 589736, 1975546, 958176, 1107146, 303162, 942828, 1270648, 1784456]


In [0]:
# get posts from the top 10 users
df_top = df_Xy.loc[df_Xy['id'].isin(top_id)]

df_top.head()

Unnamed: 0,text,id
1393,Much funny. 2 points. As mentioned in the...,589736
1394,"Harpers, Harpers, everywhere. Harpers, Har...",589736
1395,"In an earlier post, Johnathan said: 'And ...",589736
1396,"I'd post this on the RTG Blog, but I can't...",589736
1397,The answer to the first question lies with ...,589736


In [0]:
texts = df_top['text'].values

print(texts[:2])

["   Much funny.  2 points.  As mentioned in the email, no game this week; I have to research a 13th century poem for a final paper.  I enjoy the work, but I'd much rather have swashbuckling adventure in wintery Hillsfar. "
 "   Harpers, Harpers, everywhere.  Harpers, Harpers, they really care. Harpers, Harpers, stay in motion.  Harpers, Harpers, healing potions. Harpers, Harpers, pins of green.  Harpers, Harpers, enter the scene. Harpers, Harpers, formed by a bard.  Harpers, Harpers, .... What?  A Bard?!?!  You got to be fucking kidding me, right?!?!  A Bard?!  Jesus Christ, we're screwed!! "]


In [0]:
# preprocessing and implementation adapted from:
# https://towardsdatascience.com/character-level-cnn-with-keras-50391c3adf33

# initialize Tokenizer
tk = Tokenizer(num_words=None, lower=False, char_level=True, oov_token='UNK')

tk.fit_on_texts(texts)

print(tk.word_index)

{'UNK': 1, ' ': 2, 'e': 3, 't': 4, 'o': 5, 'a': 6, 'n': 7, 'i': 8, 's': 9, 'r': 10, 'h': 11, 'l': 12, 'd': 13, 'u': 14, 'm': 15, 'y': 16, 'c': 17, 'g': 18, '.': 19, 'w': 20, 'f': 21, 'p': 22, 'b': 23, ',': 24, 'k': 25, "'": 26, 'v': 27, 'I': 28, 'T': 29, 'A': 30, 'S': 31, '!': 32, '-': 33, 'L': 34, 'H': 35, 'W': 36, 'C': 37, 'O': 38, 'M': 39, 'B': 40, '?': 41, 'j': 42, 'E': 43, 'x': 44, 'N': 45, 'D': 46, ':': 47, 'P': 48, ')': 49, 'R': 50, 'G': 51, 'F': 52, '0': 53, '(': 54, 'Y': 55, '1': 56, 'J': 57, 'z': 58, '2': 59, 'q': 60, '*': 61, ';': 62, 'U': 63, '/': 64, 'K': 65, 'V': 66, '3': 67, '5': 68, '4': 69, '9': 70, '�': 71, '8': 72, '7': 73, '6': 74, '&': 75, 'Q': 76, '_': 77, 'Z': 78, '#': 79, '[': 80, ']': 81, '~': 82, '=': 83, '|': 84, '$': 85, '>': 86, 'X': 87, '%': 88, '@': 89, '+': 90, '\\': 91, '`': 92, '^': 93, 'é': 94, '’': 95, '}': 96, '{': 97, 'è': 98, 'à': 99, '<': 100, 'ê': 101, 'û': 102, 'â': 103, '\x03': 104, 'ç': 105, '«': 106, '»': 107, 'Ç': 108, 'ô': 109, 'ù': 110, '

In [0]:
# create a custom character dictionary
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-,;.!?:'\"/\\|_@#$%^&*~`+=<>()[]{}"

char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1

print(char_dict)

{'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, '-': 53, ',': 54, ';': 55, '.': 56, '!': 57, '?': 58, ':': 59, "'": 60, '"': 61, '/': 62, '\\': 63, '|': 64, '_': 65, '@': 66, '#': 67, '$': 68, '%': 69, '^': 70, '&': 71, '*': 72, '~': 73, '`': 74, '+': 75, '=': 76, '<': 77, '>': 78, '(': 79, ')': 80, '[': 81, ']': 82, '{': 83, '}': 84}


In [0]:
# all digit characters get the same encoding
digits = '0123456789'

idx = max(char_dict.values()) + 1

for d in digits:
    char_dict[d] = idx

print(char_dict)

{'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, '-': 53, ',': 54, ';': 55, '.': 56, '!': 57, '?': 58, ':': 59, "'": 60, '"': 61, '/': 62, '\\': 63, '|': 64, '_': 65, '@': 66, '#': 67, '$': 68, '%': 69, '^': 70, '&': 71, '*': 72, '~': 73, '`': 74, '+': 75, '=': 76, '<': 77, '>': 78, '(': 79, ')': 80, '[': 81, ']': 82, '{': 83, '}': 84, '0': 85, '1': 85, '2': 85, '3': 85, '4': 85, '5': 85, '6': 85, '7': 85, '8': 85, '9': 85}


In [0]:
# Use char_dict to replace the tk.word_index
tk.word_index = char_dict 
# Add 'UNK' to the vocabulary (for spaces and out of vocabulary characters)
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

print(tk.word_index)

{'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52, '-': 53, ',': 54, ';': 55, '.': 56, '!': 57, '?': 58, ':': 59, "'": 60, '"': 61, '/': 62, '\\': 63, '|': 64, '_': 65, '@': 66, '#': 67, '$': 68, '%': 69, '^': 70, '&': 71, '*': 72, '~': 73, '`': 74, '+': 75, '=': 76, '<': 77, '>': 78, '(': 79, ')': 80, '[': 81, ']': 82, '{': 83, '}': 84, '0': 85, '1': 85, '2': 85, '3': 85, '4': 85, '5': 85, '6': 85, '7': 85, '8': 85, '9': 85, 'UNK': 86}


In [0]:
# encode the texts
sequences = tk.texts_to_sequences(texts)
print(texts[0])
print(sequences[0])

   Much funny.  2 points.  As mentioned in the email, no game this week; I have to research a 13th century poem for a final paper.  I enjoy the work, but I'd much rather have swashbuckling adventure in wintery Hillsfar. 
[86, 86, 86, 13, 47, 29, 34, 86, 32, 47, 40, 40, 51, 56, 86, 86, 85, 86, 42, 41, 35, 40, 46, 45, 56, 86, 86, 1, 45, 86, 39, 31, 40, 46, 35, 41, 40, 31, 30, 86, 35, 40, 86, 46, 34, 31, 86, 31, 39, 27, 35, 38, 54, 86, 40, 41, 86, 33, 27, 39, 31, 86, 46, 34, 35, 45, 86, 49, 31, 31, 37, 55, 86, 9, 86, 34, 27, 48, 31, 86, 46, 41, 86, 44, 31, 45, 31, 27, 44, 29, 34, 86, 27, 86, 85, 85, 46, 34, 86, 29, 31, 40, 46, 47, 44, 51, 86, 42, 41, 31, 39, 86, 32, 41, 44, 86, 27, 86, 32, 35, 40, 27, 38, 86, 42, 27, 42, 31, 44, 56, 86, 86, 9, 86, 31, 40, 36, 41, 51, 86, 46, 34, 31, 86, 49, 41, 44, 37, 54, 86, 28, 47, 46, 86, 9, 60, 30, 86, 39, 47, 29, 34, 86, 44, 27, 46, 34, 31, 44, 86, 34, 27, 48, 31, 86, 45, 49, 27, 45, 34, 28, 47, 29, 37, 38, 35, 40, 33, 86, 27, 30, 48, 31, 40, 46, 47

In [0]:
# pad or crop inputs to uniform character length
data = pad_sequences(sequences, maxlen=1014, padding='post')

In [0]:
len(sequences[0])

220

In [0]:
print(sequences[0][:300])
print()
print(data[0][:300])

[86, 86, 86, 13, 47, 29, 34, 86, 32, 47, 40, 40, 51, 56, 86, 86, 85, 86, 42, 41, 35, 40, 46, 45, 56, 86, 86, 1, 45, 86, 39, 31, 40, 46, 35, 41, 40, 31, 30, 86, 35, 40, 86, 46, 34, 31, 86, 31, 39, 27, 35, 38, 54, 86, 40, 41, 86, 33, 27, 39, 31, 86, 46, 34, 35, 45, 86, 49, 31, 31, 37, 55, 86, 9, 86, 34, 27, 48, 31, 86, 46, 41, 86, 44, 31, 45, 31, 27, 44, 29, 34, 86, 27, 86, 85, 85, 46, 34, 86, 29, 31, 40, 46, 47, 44, 51, 86, 42, 41, 31, 39, 86, 32, 41, 44, 86, 27, 86, 32, 35, 40, 27, 38, 86, 42, 27, 42, 31, 44, 56, 86, 86, 9, 86, 31, 40, 36, 41, 51, 86, 46, 34, 31, 86, 49, 41, 44, 37, 54, 86, 28, 47, 46, 86, 9, 60, 30, 86, 39, 47, 29, 34, 86, 44, 27, 46, 34, 31, 44, 86, 34, 27, 48, 31, 86, 45, 49, 27, 45, 34, 28, 47, 29, 37, 38, 35, 40, 33, 86, 27, 30, 48, 31, 40, 46, 47, 44, 31, 86, 35, 40, 86, 49, 35, 40, 46, 31, 44, 51, 86, 8, 35, 38, 38, 45, 32, 27, 44, 56, 86]

[86 86 86 13 47 29 34 86 32 47 40 40 51 56 86 86 85 86 42 41 35 40 46 45
 56 86 86  1 45 86 39 31 40 46 35 41 40 31 30 86 3

In [0]:
data = np.array(data)
data.shape

(23534, 1014)

In [0]:
# get one-hot encoding for user ids
ids = df_top['id'].values

ids

array([ 589736,  589736,  589736, ..., 1270648, 1270648, 1270648])

In [0]:
ids = ids.reshape(-1, 1)

ids

array([[ 589736],
       [ 589736],
       [ 589736],
       ...,
       [1270648],
       [1270648],
       [1270648]])

In [0]:
enc = OneHotEncoder()
ids = enc.fit_transform(ids).toarray()

ids

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [0]:
ids.shape

(23534, 10)

In [0]:
# get vocab size
# (use max(value) instead of len() since all digits are given the same value)
vocab_size = max(tk.word_index.values())

vocab_size

86

In [0]:
# set initial embedding weights (one-hot)
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))  # padding vector

for i in set(tk.word_index.values()):
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

print(embedding_weights.shape)
embedding_weights

(87, 86)


array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [0]:
# split test, train, validation
X, X_test, y, y_test = train_test_split(data, ids, test_size=0.1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [0]:
# set hyperparameters

input_size = 1014
embedding_size = 86

num_of_classes = 10
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

conv_layers = [[256, 7, 3],
               [256, 7, 3],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]


In [0]:
#initialize embedding layer
embedding_layer = Embedding(vocab_size+1, embedding_size, 
                            input_length=input_size, 
                            weights=[embedding_weights])


In [0]:
# build the model
# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64') 
# Embedding
x = embedding_layer(inputs)
# Conv and pooling
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size, activation='relu')(x)
    if pooling_size != -1:
        x = MaxPool1D(pool_size=pooling_size)(x) 
x = Flatten()(x) 
# Fully connected layers
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x) 
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(num_of_classes, activation='softmax')(x)


In [0]:
# compile model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1014)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1014, 86)          7482      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1008, 256)         154368    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 336, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 110, 256)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 108, 256)          1968

In [0]:
# train model 
model.fit(X_train, y_train, batch_size=128, epochs=15, validation_data=(X_val, y_val))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 19062 samples, validate on 2118 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7f3d4362c470>

In [0]:
# get predictions on validation set for metrics
pred = np.round(model.predict(X_val, verbose=1))



In [0]:
precision, recall, fMeasure, support = \
    precision_recall_fscore_support(y_val, pred, average='micro')

fMeasure

0.6595022624434389

In [0]:
# get predictions on test set for metrics
pred = np.round(model.predict(X_test, verbose=1))



In [0]:
precision, recall, fMeasure, support = \
    precision_recall_fscore_support(y_test, pred, average='micro')

fMeasure

0.6722180166540499