In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
# Read the annotated posts table used throughout this notebook.
dataset = pd.read_csv('./dataset/post_risklabel.csv')

In [None]:
# Preview the first rows of the raw table.
dataset.head()

In [None]:
len(dataset) # row count of the table; the author recorded 496 samples

In [None]:
# testing

In [None]:
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than ASCII letters, digits, ``( ) , ! ?``,
         the apostrophe and the backtick is replaced by a space;
      2. common English contraction suffixes are split off their stem
         ("don't" -> "do n't", "it's" -> "it 's");
      3. ``, ! ( ) ?`` are padded with spaces so each becomes its own token;
      4. runs of whitespace are collapsed to a single space.

    Args:
        string: the raw text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and
        lower-cased. Tokens are separated by single spaces.
    """
    # Optional anonymisation step, currently disabled:
    # string = re.sub(r"\[.*\]", "Name", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # The suffixes are literal text, so plain str.replace is sufficient.
    # Order is kept from the original implementation.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # The original used replacement strings such as " \( ", which left a
    # literal backslash in the output (tokens "\(", "\)", "\?"). Emit the bare
    # punctuation mark instead so the tokens match ordinary vocabularies.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [None]:
def load_data_and_labels(filepath='./dataset/post_risklabel.csv'):
    """
    Load the annotated posts and convert them to token lists and one-hot labels.

    Args:
        filepath: CSV file with a ``post`` column (text) and a ``risk_label``
            column whose values are ``'a'``, ``'b'``, ``'c'`` or ``'d'``.
            Defaults to the path this notebook has always read.

    Returns:
        ``[x_text, y]`` where ``x_text`` is a list with one token list per
        post (cleaned with ``clean_str`` and split on single spaces) and ``y``
        is an array with one 4-element one-hot row per post, in the same order.

    Raises:
        ValueError: if a row has a label other than 'a'-'d'. Such rows used to
            be skipped silently, which left ``y`` shorter than ``x_text`` and
            shifted every following label onto the wrong post.
    """
    one_hot = {
        'a': [1, 0, 0, 0],
        'b': [0, 1, 0, 0],
        'c': [0, 0, 1, 0],
        'd': [0, 0, 0, 1],
    }

    # Load data from files
    dataset = pd.read_csv(filepath)
    posts = [str(s).strip() for s in dataset['post']]

    # Split by words
    x_text = [clean_str(sent).split(" ") for sent in posts]

    # Generate labels, keeping them aligned one-to-one with the posts
    risk_labels = []
    for row, label in enumerate(dataset['risk_label']):
        if label not in one_hot:
            raise ValueError(
                "Unexpected risk_label %r in row %d of %s; expected one of %s"
                % (label, row, filepath, sorted(one_hot)))
        risk_labels.append(one_hot[label])
    y = np.asarray(risk_labels)
    return [x_text, y]

In [None]:
# Tokenize every post and build its one-hot risk label.
x_text, y = load_data_and_labels()

In [None]:
# Number of tokenized posts.
len(x_text)

In [None]:
# Number of label rows; expected to equal the number of posts.
len(y)

In [None]:
# One-hot label row of the first post.
y[0]

In [None]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    Args:
        sentences: list of token lists.
        padding_word: token appended to the end of shorter sentences.

    Returns:
        ``(padded_sentences, sequence_length)``: new lists (the inputs are not
        modified) that all have ``sequence_length`` tokens, and that length.
        An empty ``sentences`` list yields ``([], 0)`` instead of raising.
    """
    # default=0 keeps max() from raising ValueError when there are no sentences.
    sequence_length = max((len(sentence) for sentence in sentences), default=0)
    padded_sentences = [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]
    return padded_sentences, sequence_length

In [None]:
# Right-pad every token list to the length of the longest post.
sentences_padded, sequence_length = pad_sentences(x_text)

In [None]:
# Number of padded posts.
len(sentences_padded)

In [None]:
# Token count of the longest post, i.e. the common padded length.
sequence_length

In [1]:
import numpy as np
import re
import itertools
from collections import Counter
import pandas as pd
import csv
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# from sentence_splitter import SentenceSplitter, split_text_into_sentences
# splitter = SentenceSplitter(language='en')
# from util import TextCleaner, InputReader

def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than ASCII letters, digits, ``( ) , ! ?``,
         the apostrophe and the backtick is replaced by a space;
      2. common English contraction suffixes are split off their stem
         ("don't" -> "do n't", "it's" -> "it 's");
      3. ``, ! ( ) ?`` are padded with spaces so each becomes its own token;
      4. runs of whitespace are collapsed to a single space.

    Args:
        string: the raw text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and
        lower-cased. Tokens are separated by single spaces.
    """
    # Optional anonymisation step, currently disabled:
    # string = re.sub(r"\[.*\]", "Name", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # The suffixes are literal text, so plain str.replace is sufficient.
    # Order is kept from the original implementation.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # The original used replacement strings such as " \( ", which left a
    # literal backslash in the output (tokens "\(", "\)", "\?"). Emit the bare
    # punctuation mark instead so the tokens match ordinary vocabularies.
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(filepath='./dataset/post_risklabel.csv'):
    """
    Load the annotated posts and convert them to token lists and one-hot labels.

    Args:
        filepath: CSV file with a ``post`` column (text) and a ``risk_label``
            column whose values are ``'a'``, ``'b'``, ``'c'`` or ``'d'``.
            Defaults to the path this notebook has always read.

    Returns:
        ``[x_text, y]`` where ``x_text`` is a list with one token list per
        post (cleaned with ``clean_str`` and split on single spaces) and ``y``
        is an array with one 4-element one-hot row per post, in the same order.

    Raises:
        ValueError: if a row has a label other than 'a'-'d'. Such rows used to
            be skipped silently, which left ``y`` shorter than ``x_text`` and
            shifted every following label onto the wrong post.
    """
    one_hot = {
        'a': [1, 0, 0, 0],
        'b': [0, 1, 0, 0],
        'c': [0, 0, 1, 0],
        'd': [0, 0, 0, 1],
    }

    # Load data from files
    dataset = pd.read_csv(filepath)
    posts = [str(s).strip() for s in dataset['post']]

    # Split by words
    x_text = [clean_str(sent).split(" ") for sent in posts]

    # Generate labels, keeping them aligned one-to-one with the posts
    risk_labels = []
    for row, label in enumerate(dataset['risk_label']):
        if label not in one_hot:
            raise ValueError(
                "Unexpected risk_label %r in row %d of %s; expected one of %s"
                % (label, row, filepath, sorted(one_hot)))
        risk_labels.append(one_hot[label])
    y = np.asarray(risk_labels)
    return [x_text, y]



def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    Args:
        sentences: list of token lists.
        padding_word: token appended to the end of shorter sentences.

    Returns:
        ``(padded_sentences, sequence_length)``: new lists (the inputs are not
        modified) that all have ``sequence_length`` tokens, and that length.
        An empty ``sentences`` list yields ``([], 0)`` instead of raising.
    """
    # default=0 keeps max() from raising ValueError when there are no sentences.
    sequence_length = max((len(sentence) for sentence in sentences), default=0)
    padded_sentences = [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]
    return padded_sentences, sequence_length


def build_vocab(sentences):
    """
    Create the word <-> index lookup tables for a collection of sentences.

    Each distinct token is assigned its position in the sorted list of all
    distinct tokens, so the mapping does not depend on token frequency or on
    the order in which sentences are given.

    Returns ``[vocabulary, vocabulary_inv]``: a dict from token to index and
    the sorted token list (index -> token).
    """
    # Count tokens across all sentences; only the distinct keys are used.
    token_counts = Counter(itertools.chain.from_iterable(sentences))
    # Index -> word
    vocabulary_inv = sorted(token_counts)
    # Word -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary, padding_word="<PAD/>"):
    """
    Maps sentences and labels to vectors based on a vocabulary.

    Args:
        sentences: list of equally long token lists.
        labels: label rows aligned with ``sentences``.
        vocabulary: dict mapping token -> integer index.
        padding_word: token whose index stands in for words missing from
            ``vocabulary``. If the padding token itself is not in the
            vocabulary, index 0 is used.

    Returns:
        ``[x, y]``: ``x`` is the array of word indices (one row per sentence),
        ``y`` is ``labels`` converted to an array.

    Note:
        Out-of-vocabulary words used to be replaced by a random float in
        [-0.01, 0.01). That is not a valid index: it made ``x`` float-typed and
        non-deterministic, and it truncates to index 0 when fed to an integer
        input layer. They are now mapped deterministically to the padding index.
    """
    unknown_index = vocabulary.get(padding_word, 0)
    x = np.array([
        [vocabulary.get(word, unknown_index) for word in sentence]
        for sentence in sentences
    ])
    y = np.array(labels)
    return [x, y]



def create_embedding_matrix(filepath, word_index, embedding_dim):
    """
    Build an embedding matrix for ``word_index`` from a text vector file.

    The file is read line by line; each line is split on whitespace into a
    word followed by its vector components.

    Args:
        filepath: path to the vector file (read as UTF-8).
        word_index: dict mapping word -> row index in the returned matrix.
        embedding_dim: number of components kept per word; longer vectors in
            the file are truncated to this length.

    Returns:
        Array of shape ``(len(word_index), embedding_dim)``. Rows of words that
        do not occur in the file stay all-zero.
    """
    vocab_size = len(word_index)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: the platform default is not UTF-8 everywhere and
    # would fail on non-ASCII words.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file).
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix


def get_labels(filepath="./dataset/post_risklabel.csv"):
    """
    Load the risk labels as integer class ids.

    Args:
        filepath: CSV file with a ``risk_label`` column whose values are
            ``'a'``, ``'b'``, ``'c'`` or ``'d'``. Defaults to the path this
            notebook has always read.

    Returns:
        1-D array with one class id per row, in file order:
        'a' -> 0, 'b' -> 1, 'c' -> 2, 'd' -> 3.

    Raises:
        ValueError: if a row has a label other than 'a'-'d'. Such rows used to
            be skipped silently, which made the result shorter than the data
            set and misaligned it with the posts.
    """
    class_index = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

    # Load data from files
    annotated_data = pd.read_csv(filepath)

    # Generate labels
    class_labels = []
    for row, label in enumerate(annotated_data['risk_label']):
        if label not in class_index:
            raise ValueError(
                "Unexpected risk_label %r in row %d of %s; expected one of %s"
                % (label, row, filepath, sorted(class_index)))
        class_labels.append(class_index[label])
    return np.asarray(class_labels)


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    With ``normalize=True`` each row of ``cm`` is divided by its row sum
    before printing and plotting, and cells are annotated with two decimals;
    otherwise the raw values are annotated as integers.
    ``classes`` supplies the tick labels for both axes.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
        cell_format = 'd'
    else:
        row_totals = cm.sum(axis=1)
        cm = cm.astype('float') / row_totals[:, np.newaxis]
        print("Normalized confusion matrix")
        cell_format = '.2f'

    print(cm)

    # Heat map with colour bar and class names on both axes.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=90)
    plt.yticks(positions, classes)

    # Annotate each cell; switch to white text on the darker half of the scale.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # plt.autoscale()
    # plt.tight_layout()

In [2]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.layers import Input, Dense, Embedding, LSTM, Dropout
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

Using TensorFlow backend.


In [3]:
# Load the data twice over: one-hot labels (y_class) for training the softmax
# model, integer class ids (y) for the AUC / confusion-matrix evaluation.
print('Loading annotated social text data')
x_text, y_class = load_data_and_labels()
y = get_labels()

# Pad every post to the length of the longest one; sequence_length is that length.
sentences_padded, sequence_length = pad_sentences(x_text)

# global variables
embedding_dim = 200   # embedding size; passed to the Embedding layer and the embedding-matrix loader
num_filters = 512     # number of LSTM units (the name is a leftover; it is passed to LSTM)
drop = 0.5            # dropout rate applied after the LSTM
epochs = 1            # training epochs per fold
batch_size = 100

# define 10-fold cross validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cvscores = []     # per-fold accuracy on the held-out split, in percent
auc_scores = []   # per-fold weighted AUC, in percent

Loading annotated social text data


In [None]:
# multiclass_roc_auc_score was called here but is not defined or imported in
# this notebook, so sklearn's roc_auc_score is used directly instead.
from sklearn.metrics import roc_auc_score

# The model has four outputs, one per risk label. The order matches the class
# ids produced by get_labels() and the one-hot columns of y_class.
class_names = ['a', 'b', 'c', 'd']
class_ids = list(range(len(class_names)))

# Make sure every output directory exists before the first fold writes to it.
for out_dir in ('./model/', './vocabulary/', './results/', './figure/'):
    os.makedirs(out_dir, exist_ok=True)

print('10 fold CV starting')
for train, test in kfold.split(sentences_padded, y_class):
    # split train & test set
    print('spliting train and test set')
    X_train = [sentences_padded[index] for index in train]
    X_test = [sentences_padded[index] for index in test]
    y_train = y_class[train]
    y_test = y_class[test]

    # building vocabulary on train set
    print('building vocabulary on train set')
    vocabulary, vocabulary_inv = build_vocab(X_train)

    # Maps sentences to vectors based on vocabulary
    print('Mapping sentences to vectors based on vocabulary')
    X_train, y_train = build_input_data(X_train, y_train, vocabulary)
    X_test, y_test = build_input_data(X_test, y_test, vocabulary)
    # All posts, encoded with this fold's vocabulary, for the prediction step
    # below. Only the inputs are kept; y_class is no longer rebound inside
    # the loop.
    x = build_input_data(sentences_padded, y_class, vocabulary)[0]
    vocabulary_size = len(vocabulary_inv)

    # building embedding matrix using GloVe word embeddings
    print('building embedding matrix using GloVe word embeddings')
    embedding_matrix = create_embedding_matrix('./dataset/myGloVe200d.txt', vocabulary, embedding_dim)

    # this returns a tensor
    print("Creating Model...")
    inputs = Input(shape=(sequence_length,), dtype='int32')
    embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=sequence_length)(inputs)

    lstm = LSTM(num_filters, kernel_initializer='normal', activation='relu')(embedding)

    dropout = Dropout(drop)(lstm)
    output = Dense(units=len(class_names), activation='softmax')(dropout)

    # this creates a model that maps the index sequence to class probabilities
    model = Model(inputs=inputs, outputs=output)

    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

    # Single-label, 4-way softmax with one-hot targets: categorical
    # cross-entropy is the matching loss. binary_crossentropy treats the four
    # outputs as independent binary problems and skews loss and accuracy.
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
    print("Training Model...")
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)  # starts training

    # evaluate the model on the held-out fold
    print("Evaluate Model...")
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    cvscores.append(scores[1] * 100)

    print('Saving Model...')
    model_name = 'LSTM_GloVe_' + str(len(cvscores))
    model.save('./model/'+ model_name + '.hdf5')  # creates a HDF5 file

    print('Saving vocabulary to .json file')
    with open('./vocabulary/' + model_name + '.json', 'w') as fp:
        json.dump(vocabulary, fp)

    # NOTE: predictions are made for ALL posts, including this fold's training
    # posts, so the AUC and confusion matrix below are not held-out estimates.
    print('Predicting categories...')
    y_pred = model.predict(x)
    y_classes = y_pred.argmax(axis=-1)
    # One-hot encode the hard predictions so each class is scored one-vs-rest
    # against the one-hot ground truth.
    y_classes_onehot = np.eye(len(class_names), dtype=int)[y_classes]
    auc_score = roc_auc_score(y_class, y_classes_onehot, average="weighted")
    print("%s: %.2f%%" % ('Average AUC', auc_score * 100))
    auc_scores.append(auc_score * 100)
    result = pd.DataFrame({'true_class': y, 'predict_class': y_classes})
    result.to_csv('./results/' + model_name + 'result.csv', encoding='utf-8', index = False)

    print('Generating confusion matrix...')
    # Fix the label set so the matrix is always 4x4 and lines up with class_names.
    conf_mat = confusion_matrix(y, y_classes, labels=class_ids)

    print('Plotting results...')
    fig, ax = plt.subplots(figsize=(10, 10))
    plot_confusion_matrix(conf_mat, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.gcf().subplots_adjust(bottom=0.15)

    print('Saving plots...')
    fig.savefig('./figure/' + model_name + 'result.png')
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
print("auc" + "%.2f%% (+/- %.2f%%)" % (np.mean(auc_scores), np.std(auc_scores)))

10 fold CV starting
spliting train and test set
building vocabulary on train set
Mapping sentences to vectors based on vocabulary
building embedding matrix using GloVe word embeddings
Creating Model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Training Model...
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
