In [38]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation , Embedding , Flatten
from keras.callbacks import Callback
from keras.optimizers import Adam



# Never abbreviate printed NumPy arrays with "..." (threshold=inf disables summarisation).
np.set_printoptions(threshold=np.inf)
# right now we only take 10k samples to make the testing of the code faster
sample_size = 10000
# this is the maximum number of words we take from the blogs
# (also reused further down as the padded sequence length)
max_length = 500
# Load the raw blog table; the preview below shows age, gender and post columns.
df = pd.read_json("data.json")
df.head()

Unnamed: 0,age,gender,post
0,27,male,Thabo admits defeat on quiet diplomacy Mbeki ...
1,25,male,Brainbench welcomes its 5 millionth subscriber...
2,23,female,"Even though the air in Jerusalem is dry, it is..."
3,25,female,there's nothing else more embarassing in life ...
4,38,female,Today I had a glass artist over for a firing. ...


In [39]:
# We utilized nltk module to remove known stopwords from the blog texts.
import nltk
# Fetch the NLTK resources used below: the stopword lists and the "punkt"
# tokenizer data (needed by word_tokenize).
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
  
# A set gives constant-time membership checks when filtering tokens later on.
stop_words = set(stopwords.words('english'))    #set of stopwords
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
{'all', "haven't", 'there', 'he', 'other', 'mightn', 'those', 'shouldn', 'an', 'hasn', 'under', 'do', 'its', 'when', 'where', 'her', 'hadn', "wouldn't", 'if', 'his', 'than', 'is', 'for', 'can', 'at', 'again', 'as', 'being', 'yourself', 'until', 'each', 'i', 'but', 'down', 'am', "you're", 'too', 'further', 'before', 'theirs', 'what', 'didn', 'on', 'very', 'should', 'she', 'after', 'm', 'only', 'while', "should've", "hasn't", "mightn't", 'wouldn', 'up', 'were', 'in', 'into', 'needn', 'with', 'same', 'how', "wasn't", 'your', 'doing', 'just', "didn't", 'o', 'against', 'me', 'you', 'been', 'aren', 'nor', 'whom', 's', 'this', 'that', "weren't", 'above', 'here', 'himself', 'ours', 'most', "that'll", "shouldn't", 'won', 'the', 'which', 'y', 'through', 'shan', 'him', '

In [40]:
# Keep only the first sample_size rows of the raw table.
values = df.values[:sample_size]
print(values.shape)

# Columns 0-1 hold the labels (age, gender); column 2 holds the post text.
labels = values[:, :2]
docs = values[:, 2]

# Lower-case every post in place. `docs` is a slice of `values`, so the
# lower-cased text is visible through `values` as well.
for idx in range(len(docs)):
  docs[idx] = docs[idx].lower()

(10000, 3)


In [41]:
#WORD DICTIONARY START
# Build a word -> occurrence-count dictionary over the blog posts. It is used to:
#   * see how many distinct words there are, which guides the choice of vocab_size;
#   * see how many of those words appear so rarely that we may not need them;
#   * collect the infrequent words in a list so they can optionally be removed
#     later, to check whether that helps in the modelling phase.
import re

# Alias (not a copy) of the lower-cased posts prepared in the previous cell.
docs_to_process = docs

  
def get_word_dict(docs_to_process):
    """Count how often each word occurs across a collection of texts.

    A word is any maximal run of word characters and apostrophes, so
    contractions such as "don't" are kept as a single token. No case
    folding is done here; callers lower-case the texts beforehand if needed.

    Args:
        docs_to_process: iterable of strings.

    Returns:
        dict mapping each word to its total number of occurrences, with
        keys in order of first appearance.
    """
    token_pattern = re.compile(r"[\w']+")
    counts = {}
    for document in docs_to_process:
        for token in token_pattern.findall(document):
            counts[token] = counts.get(token, 0) + 1
    return counts
  
  
word_dict = get_word_dict(docs_to_process)

# Words seen at most once are considered infrequent; word_dict_small is the
# same dictionary with those entries left out.
infrequent_words = [word for word, count in word_dict.items() if count <= 1]
word_dict_small = {word: count for word, count in word_dict.items() if count > 1}

print("Size of the word dictionary: " + str(len(word_dict)))
print("Size without infrequent words: " + str(len(word_dict_small)))
#print(infrequent_words)

# Create a set that contains everything to remove
#rm_words = set(infrequent_words)
#print(rm_words)

#WORD DICTIONARY STOP

Size of the word dictionary: 68942
Size without infrequent words: 34032


In [0]:
#This function removes stopwords and also words that appear infrequently, and it cuts blogposts longer than word_limit tokens.

def reduce_vocab(docs, word_limit, sparsewords=False):
  """Shrink the vocabulary of every post in `docs`, modifying `docs` in place.

  Each post is tokenized with nltk's word_tokenize, cut to its first
  `word_limit` tokens, stripped of stop words (module-level `stop_words`)
  and, optionally, of infrequent words (module-level `infrequent_words`).
  The surviving tokens are joined with single spaces and written back.

  The cut happens before the filtering, so a processed post can hold fewer
  than `word_limit` tokens.

  Args:
    docs: mutable, index-assignable sequence of strings (list or 1-D array).
    word_limit: maximum number of tokens kept per post before filtering.
    sparsewords: when truthy, also drop every token found in `infrequent_words`.

  Returns:
    None; the result is stored back into `docs`.
  """
  # Build the lookup set once per call: `infrequent_words` is a list, and testing
  # every token against a list is a linear scan, which is what made this slow.
  sparse_lookup = set(infrequent_words) if sparsewords else None

  for i, blogpost in enumerate(docs):
    word_tokens = word_tokenize(blogpost)[:word_limit]

    blogpost_reduced = [w for w in word_tokens if w not in stop_words]

    if sparse_lookup is not None:
      blogpost_reduced = [w for w in blogpost_reduced if w not in sparse_lookup]

    docs[i] = ' '.join(blogpost_reduced)

# Infrequent-word removal stays switched off here; pass sparsewords=True to enable it.
reduce_vocab(docs, max_length, sparsewords=False)

In [43]:
# Spot-check the vocabulary reduction: reload the untouched posts (the in-memory
# ones were overwritten in place) and print original/reduced pairs side by side.
df_orig = pd.read_json("data.json")
docs_orig = df_orig.values[:, 2]

for sample_idx in (0, 1):
  print(docs_orig[sample_idx])
  print(docs[sample_idx])

Thabo admits defeat on quiet diplomacy  Mbeki  urlLink stated  yesterday that his policy of quiet diplomacy on Zimbabwe has been a failure. Speaking through presidential spokesperson Bheki Khumalo, he said that the talks between the Zanu-PF and MDC were 'too slow' and that no progress had been made on the issue.  However, Mbeki also stated that he would "press on with his diplomatic efforts in Zimbabwe despite fierce criticism, because he still believed there was no alternative to dialogue." These is a confusing sentiment. There are many political alternatives to dialogue that include simple public censure and economic pressure.  Mbeki has largely based himself as a foreign policy president, particularly in his first term, and to gain any credibility for NEPAD and his other strong foreign policy initiatives, he must take action on Zimbabwe. After his softening position on AIDS, it is the one factor in his presidency that the world still cannot understand. Struggle friends or not, a dic

In [0]:
# Bundle the labels with the reduced posts and persist them, so later cells can
# start from out.json instead of repeating the preprocessing.
dataout = {'age' : labels[:,0] , 'gender': labels[:,1] , 'post': docs}

dfout = pd.DataFrame( data=dataout ) 
dfout.to_json('out.json')

In [108]:

# Reload the preprocessed posts written by the export cell.
df_processed = pd.read_json("out.json")
# NOTE: not the last expression of the cell, so this preview is not displayed.
df_processed.head()

# Split the table into post text (column 2) and the age/gender labels (columns 0-1).
values_processed = df_processed.values
docs_processed = values_processed[:,2]
labels_processed = values_processed[: , 0:2]
# check finalized word_dict:
final_wd = get_word_dict(docs_processed)
print("final vocabulary size is: " + str(len(final_wd)))

final vocabulary size is: 59148


In [109]:
# We chose a vocab size based on the word dictionary's length
vocab_size = len(final_wd)
# Map every word of every processed post to an integer index below vocab_size.
# filters='' switches off Keras' default punctuation stripping, so tokens are
# simply the space-separated pieces of each post.
# NOTE(review): one_hot is hash based, so distinct words may share an index when
# vocab_size equals the number of distinct words; consider a larger value.
encoded_docs = [keras.preprocessing.text.one_hot(d, vocab_size , filters='') for d in docs_processed]

# Word counts are taken from docs_processed, the same texts that were just
# encoded, rather than from the in-kernel `docs` array, which is only identical
# if the preprocessing cells were run in this session and in order.
lengths = [len(d.split(' ')) for d in docs_processed]
lenmax = max(lengths, default=0)
print("length of blog post containing most words is:" , lenmax )


length of blog post containing most words is: 497


In [110]:
# Sanity check: one (age, gender) row per post.
labels_processed.shape

(10000, 2)

In [47]:
# This is where the input vectors get padded to the same size. Every encoded post is
# brought to exactly max_length entries: shorter ones get zeros appended
# (padding='post'); longer ones would be truncated (Keras truncates from the front
# by default). max_length must therefore be at least the longest encoded post if
# nothing is to be lost.
padded_docs = keras.preprocessing.sequence.pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs[0]))    #check the length of a padded vector

500


In [0]:
# get valid/test/train split
valid_split = 0.2
test_split = 0.1

# Size the split from the arrays that are actually sliced. NumPy silently clamps
# out-of-range slices, so a sample count taken from a different array could yield
# wrong or empty partitions without any error.
assert len(padded_docs) == len(labels_processed), "inputs and labels are misaligned"
nb_samples = len(padded_docs)

# Partition boundaries, computed once: [0, train_end) is train,
# [train_end, valid_end) is validation, [valid_end, nb_samples) is test.
train_end = int(nb_samples*(1-valid_split-test_split))
valid_end = int(nb_samples*(1-test_split))

# NOTE(review): rows are split in file order without shuffling - confirm that the
# data has no ordering that would bias the partitions.
X_train = padded_docs[:train_end]
Y_train = labels_processed[:train_end, :]
X_valid = padded_docs[train_end:valid_end]
Y_valid = labels_processed[train_end:valid_end, :]
X_test  = padded_docs[valid_end:]
Y_test  = labels_processed[valid_end:, :]

In [113]:
# Report the shape of every partition, inputs and labels alike.
shape_report = (
    ("X_train shape: ", X_train),
    ("Y_train shape: ", Y_train),
    ("X_valid shape: ", X_valid),
    ("Y_valid shape: ", Y_valid),
    ("X_test shape: ", X_test),
    ("Y_test shape: ", Y_test),
)
for label, partition in shape_report:
    print(label + str(partition.shape))

X_train shape: (7000, 500)
Y_train shape: (7000, 2)
X_valid shape: (2000, 500)
Y_valid shape: (2000, 2)
X_test shape: (1000, 500)
Y_test shape: (1000, 2)
