In [14]:
import nltk
nltk.download('stopwords')
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.framework import ops
from tensorflow.contrib import learn
from sklearn import metrics
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /home/fatma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
st = LancasterStemmer()

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'ve", " ", string)
    string = re.sub(r"n\'t", " ", string)
    string = re.sub(r"n\'s", " ", string)
    string = re.sub(r"\'re", " ", string)
    string = re.sub(r"\'d", " ", string)
    string = re.sub(r"\'ll", " ", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\//?", " ", string)
    string = re.sub(r"\d+", " ", string)
    string = re.sub(r"\$", " ", string)
    string = re.sub(r"\#", " ", string)
    return string.strip().lower()

In [3]:
def remove_stop(str):
    stop = set(stopwords.words('english'))
    lst = str.split()
    lst = [i for i in lst if i not in stop]
    return ' '.join(lst)

def stem (str):
    lst = str.split()
    lst = [st.stem(x) for x in lst]
    return ' '.join(lst)

def remove_unwanted_words(str):
    unwanted_words = ["httpaddress", "usrid", "D", "dd", "rt", "amp", "am", "pm", '``',
                      "''", "", "//", "\\", "\\'s", "\\?", "\?","http","httpaddresshttpaddresst", "cohttpaddressek",
                      "taksim","gezi", "park", "direngeziparki", "occupygezi", "istanbul", "turkish","turkey",
                      "protest","direngezipark","direnankara","geziparki", "protesters", "protests", "sat", "sun", "mon,",
                     "tue", "wed", "thu", "fri", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    lst = str.split(" ")
    lst = [i for i in lst if i not in unwanted_words]
    return ' '.join(lst)

def toLower(str):
    lst = str.split()
    lst = [i.lower() for i in lst]
    return ' '.join(lst)

def word_len (str):
    lst = str.split()
    lst = [i for i in lst if len(i)>1 and len(i) <7]
    return ' '.join(lst)

def sent_len (str):
    lst = str.split()
    if len(lst)>=3:
        return ' '.join(lst)

In [4]:
def load_data_and_labels_shuffled(positive_data_file, negative_data_file):
    """
    Loads MR polarity african_data from files, splits the african_data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load african_data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [remove_stop(item) for item in positive_examples]
    #print(positive_examples[0])
    positive_examples = [toLower(item) for item in positive_examples]
    #rint(positive_examples[0])
    positive_examples = [clean_str(sent) for sent in positive_examples]
    #rint(positive_examples[0])
    #positive_examples = [stem(item) for item in positive_examples]
    #rint(positive_examples[0])
    positive_examples = [remove_unwanted_words(item) for item in positive_examples]
    positive_examples = [word_len(item) for item in positive_examples]
    positive_examples = [sent_len(item) for item in positive_examples]
    positive_examples = list(filter(None, positive_examples))
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [toLower(item) for item in negative_examples]
    #print(negative_examples[0])
    negative_examples = [remove_stop(item) for item in negative_examples]
    #print(negative_examples[0])
    negative_examples = [clean_str(sent) for sent in negative_examples]
    #print(negative_examples[0])
    #ngative_examples = [stem(sent) for sent in negative_examples]
    #pint(negative_examples[0])
    negative_examples = [remove_unwanted_words(item) for item in negative_examples]
    negative_examples = [word_len(item) for item in negative_examples]
    #print(negative_examples[0])
    negative_examples = [sent_len(item) for item in negative_examples]

    negative_examples = list(filter(None, negative_examples))

    # Split by words
    x_text = positive_examples + negative_examples
    # x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [5]:
ops.reset_default_graph()

# Start a graph session
sess = tf.Session()

# Choose max text word length at 25
sentence_size = 10
min_word_freq = 3

In [15]:
tweets_text, tweets_y = load_data_and_labels_shuffled('Data/CF_Fatma_label_pos.txt', 'Data/CF_Fatma_label_neg.txt')

In [17]:
CF_tweets_DS_X_train, CF_tweets_DS_X_test, CF_tweets_DS_y_train, CF_tweets_DS_y_test = train_test_split(np.array(tweets_text),np.array(tweets_y),
test_size= 0.5, random_state=1)

In [19]:
#source train dataset Build vocabulary
vocab_processor_CF_training_text = learn.preprocessing.VocabularyProcessor(sentence_size, min_frequency=min_word_freq)
CF_tweets_DS_x_train = np.array(list(vocab_processor_CF_training_text.fit_transform(CF_tweets_DS_X_train)))
print("source training dataset vocabulary size", len(vocab_processor_CF_training_text.vocabulary_))
training_vocab_size = len(vocab_processor_CF_training_text.vocabulary_)

#source test dataset Build vocabulary
vocab_processor_CF_test_text = learn.preprocessing.VocabularyProcessor(sentence_size, min_frequency=min_word_freq)
CF_tweets_DS_x_test = np.array(list(vocab_processor_CF_test_text.fit_transform(CF_tweets_DS_X_test)))
test_vocab_size = len(vocab_processor_CF_test_text.vocabulary_)
print("source test dataset vocabulary size", test_vocab_size)

source training dataset vocabulary size 152
source test dataset vocabulary size 129


In [16]:
# Split up data set into train/test
#train_indices = np.random.choice(len(tweets_text), round(len(tweets_text) * 0.7), replace=False)
#test_indices = np.array(list(set(range(len(tweets_text))) - set(train_indices)))
#texts_train = [x for ix, x in enumerate(tweets_text) if ix in train_indices]
#texts_test = [x for ix, x in enumerate(tweets_text) if ix in test_indices]
#target_train = [x for ix, x in enumerate(tweets_y) if ix in train_indices]
#target_test = [x for ix, x in enumerate(tweets_y) if ix in test_indices]

In [23]:
print("train size", len(CF_tweets_DS_X_train))
print("positive samples", list(CF_tweets_DS_y_train).count([1]))
print("negative samples", list(CF_tweets_DS_y_train).count([0]))

print("test size", len(CF_tweets_DS_X_test))
print("positive samples",  list(CF_tweets_DS_y_test).count([1]))
print("negative samples", list(CF_tweets_DS_y_test).count([0]))

train size 498
positive samples 97
negative samples 401
test size 499
positive samples 111
negative samples 388


In [41]:
print(len([x for x in vocab_processor_CF_training_text.transform(CF_tweets_DS_X_train)]))

498


In [34]:
# Setup Index Matrix for one-hot-encoding
identity_mat = tf.diag(tf.ones(shape=[training_vocab_size,len(CF_tweets_DS_X_train)]))

# Create variables for logistic regression
A = tf.Variable(tf.random_normal(shape=[training_vocab_size, 1]))
b = tf.Variable(tf.random_normal(shape=[1, 1]))

In [35]:
# Initialize placeholders
x_data = tf.placeholder(shape=[training_vocab_size, len(CF_tweets_DS_X_train)], dtype=tf.int32)
y_target = tf.placeholder(shape=[1, len(CF_tweets_DS_y_train)], dtype=tf.float32)


x_test_data = tf.placeholder(shape=[training_vocab_size, len(CF_tweets_DS_X_test)], dtype=tf.int32)
y_test_target = tf.placeholder(shape=[1, len(CF_tweets_DS_y_test)], dtype=tf.float32)


In [38]:
# Text-Vocab Embedding
x_embed = tf.contrib.layers.bow_encoder(x_data, vocab_size=training_vocab_size, embed_dim=128)
x_col_sums = tf.reduce_sum(x_embed, 0)


In [39]:
# Declare model operations
x_col_sums_2D = tf.expand_dims(x_col_sums, 0)
model_output = tf.add(tf.matmul(x_col_sums_2D, A), b)

# Declare loss function (Cross Entropy loss)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

# Prediction operation
prediction = tf.sigmoid(model_output)
_, label_auc = tf.metrics.auc(y_target, prediction)

# Declare optimizer
my_opt = tf.train.GradientDescentOptimizer(0.001)
train_step = my_opt.minimize(loss)

# Intitialize Variables
init =  tf.group(tf.global_variables_initializer(),tf.local_variables_initializer())
sess.run(init)

ValueError: Dimensions must be equal, but are 128 and 1 for 'MatMul_3' (op: 'MatMul') with input shapes: [1,128], [1,152].

In [198]:
print('Starting Training Over {} Sentences.'.format(len(texts_train)))
loss_vec = []
train_acc_all = []
train_acc_avg = []
train_auc_all= []
train_sk_auc_all= []
for ix, t in enumerate(vocab_processor.fit_transform(texts_train)):
    y_data = [[target_train[ix]]]

    sess.run(train_step, feed_dict={x_data: t, y_target: y_data})
    temp_loss = sess.run(loss, feed_dict={x_data: t, y_target: y_data})
    loss_vec.append(temp_loss)

    #if (ix + 1) % 10 == 0:
        #print('Training Observation #' + str(ix + 1) + ': Loss = ' + str(temp_loss))

    # Keep trailing average of past 50 observations accuracy
    # Get prediction of single observation
    [[temp_pred]] = sess.run(prediction, feed_dict={x_data: t, y_target: y_data})
    auc = sess.run(label_auc, feed_dict={x_data: t, y_target: y_data})
    #sk_auc = metrics.roc_auc_score(y_data,temp_pred)
    # Get True/False if prediction is accurate
    train_acc_temp = target_train[ix] == np.round(temp_pred)
    train_acc_all.append(train_acc_temp)
    train_auc_all.append(auc)

    if len(train_acc_all) >= 50:
        train_acc_avg.append(np.mean(train_acc_all[-50:]))


Starting Training Over 933 Sentences.


In [199]:
# Get test set accuracy
print('Getting Test Set Accuracy For {} Sentences.'.format(len(texts_test)))
test_acc_all = []
test_auc_all =[]
test_sk_auc_all =[]
test_acc_avg = []
for ix, t in enumerate(vocab_processor.fit_transform(texts_test)):
    y_data = [[target_test[ix]]]
    if (ix + 1) % 50 == 0:
        print('Test Observation #' + str(ix + 1))

        # Keep trailing average of past 50 observations accuracy
    # Get prediction of single observation
    [[temp_pred]] = sess.run(prediction, feed_dict={x_data: t, y_target: y_data})
    auc = sess.run(label_auc, feed_dict={x_data: t, y_target: y_data})
    # Get True/False if prediction is accurate
    test_acc_temp = target_test[ix] == np.round(temp_pred)
    test_auc_all.append(auc)
    test_acc_all.append(test_acc_temp)
    if len(test_acc_all) >= 50:
        test_acc_avg.append(np.mean(test_acc_all[-50:]))
print('\nOverall Test Accuracy: {}'.format(np.mean(test_acc_all)))

print('\nOverall Training auc: {}'.format(np.mean(train_auc_all)))
print('\nOverall Test auc: {}'.format(np.mean(test_auc_all)))

Getting Test Set Accuracy For 400 Sentences.
Test Observation #50
Test Observation #100
Test Observation #150
Test Observation #200
Test Observation #250
Test Observation #300
Test Observation #350
Test Observation #400

Overall Test Accuracy: 0.87

Overall Training auc: 0.3508424460887909

Overall Test auc: 0.5201953053474426


In [38]:
f = tf.ones(shape=[embedding_size,len(texts_train)])

In [39]:
sess.run(f)

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)