In [1]:
import xlrd

import re
import nltk
from random import shuffle

In [2]:
def get_words(text, stop=False):
    raw_text =  re.split(' ',text)
    unwanted = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~£'
    raw_text2 = ''
    for k in raw_text:
        # Ignore urls
        if k.startswith('http') or k.startswith('http'):
            continue
        elif k.startswith('@'):     
            continue
        elif k.startswith('#'):
            continue
        else:
            raw_text2 += ' '
            raw_text2 += k
    words_raw = nltk.word_tokenize(raw_text2.lower())
    words = [w for w in words_raw if w not in unwanted]
    if stop: 
        words = [w for w in words if w not in stopwords.words('english')]
    return words

In [3]:
%%time
path = 'blog-gender-dataset.xlsx'

workbook = xlrd.open_workbook(path)
worksheet = workbook.sheet_by_index(0)

# Change this depending on how many header rows are present
# Set to 0 if you want to include the header data.
offset = 0

rows = []
for i, row in enumerate(range(worksheet.nrows)):
    if i <= offset:  # (Optionally) skip headers
        continue
    r = []
    for j, col in enumerate(range(worksheet.ncols)):
        r.append(worksheet.cell_value(i, j))
    rows.append(r[:2])
    
x_raw = []
y = []

for i in rows:
    x_raw.append(i[0])
    y.append(i[1])
    
    

CPU times: user 21.4 s, sys: 264 ms, total: 21.7 s
Wall time: 21.7 s


In [4]:
%%time
model_dict = {}

for sent in x_raw:
    for word in get_words(sent):
        if word not in model_dict.keys():
            model_dict[word] = 1
        else:
            model_dict[word] += 1


CPU times: user 11.9 s, sys: 4 ms, total: 11.9 s
Wall time: 11.9 s


In [5]:
top_words = len(model_dict)+1
print(top_words)

66259


In [6]:
%%time
t1 = sorted(model_dict.items(), key=lambda x: x[1], reverse=True)
up_prunung = 200
down_pruning = 50000
x_encoded = []
encoding = {}
x_id = 0
for term in t1[up_prunung:down_pruning]:
    encoding[term[0]] = x_id
    x_id+=1

for sent in x_raw:
    words = []
    for word in get_words(sent):
        if word in encoding.keys():
            words.append(encoding[word])
    x_encoded.append(words)
top_words = down_pruning - up_prunung

CPU times: user 12.4 s, sys: 52 ms, total: 12.4 s
Wall time: 12.4 s


In [7]:
print(top_words)

49800


In [8]:
%%time
idx = list(range(len(rows)))
shuffle(idx)
stamp = int(0.8*len(idx))

x_rand = []
y_rand = []
for i in idx:
    x_rand.append(x_encoded[i])
    y_rand.append(y[i])

split_stamp = int(0.8 * len(x_rand))  
x_train = x_rand[:split_stamp]
y_train = y_rand[:split_stamp]

x_test = x_rand[split_stamp:]
y_test = y_rand[split_stamp:]


y_train_norm = []
for i in range(len(y_train)):
    if 'F' in y_train[i]:
        y_train_norm.append(0)
    else:
        y_train_norm.append(1)
        
y_test_norm = []
for i in range(len(y_test)):
    if 'F' in y_test[i]:
        y_test_norm.append(0)
    else:
        y_test_norm.append(1)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.85 ms


## Neural training

In [9]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility


Using Theano backend.
Using gpu device 0: GeForce GTX 980 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN not available)


In [10]:
print(top_words)

49800


In [11]:
%%time
max_review_length = 350
X_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(x_test, maxlen=max_review_length)


CPU times: user 68 ms, sys: 16 ms, total: 84 ms
Wall time: 83.5 ms


In [15]:
%%time
# create the model
embedding_vecor_length = 128 # vary length
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Convolution1D(nb_filter=40, filter_length=5, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(LSTM(200))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 350, 128)      6374400     embedding_input_2[0][0]          
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 350, 40)       25640       embedding_2[0][0]                
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 175, 40)       0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 200)           192800      maxpooling1d_2[0][0]             
___________________________________________________________________________________________

In [16]:
%%time
model.fit(X_train, y_train_norm, nb_epoch=5, batch_size=128)
# Final evaluation of the model

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 2h 25min 8s, sys: 7h 29min 47s, total: 9h 54min 56s
Wall time: 1h 41min 38s


<keras.callbacks.History at 0x7fe635f917f0>

In [18]:
%%time
scores = model.evaluate(X_test, y_test_norm, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 66.77%
CPU times: user 1min 31s, sys: 1min 1s, total: 2min 32s
Wall time: 1min 24s
