In [1]:
import nltk
nltk.download('stopwords')
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import random

[nltk_data] Downloading package stopwords to /Users/fatma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Shared Lancaster stemmer instance; stem() calls st.stem on every token.
st = LancasterStemmer()

In [3]:
def clean_str(string):
    """
    Normalise a raw text line into lower-case, single-space-separated tokens.

    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character that is not a letter, a digit or one of ( ) , ! ? ' `
         becomes a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd and 'll become a space;
      3. the punctuation , ! ( ) ? becomes a space;
      4. every run of digits becomes a space;
      5. runs of whitespace are collapsed to a single space, and the result
         is stripped and lower-cased.

    Args:
        string: the text to clean.

    Returns:
        The cleaned string. It never contains two consecutive spaces.
    """
    # Whitelist filter. Tabs, newlines and symbols such as / $ # are replaced
    # here, so they need no dedicated substitution afterwards.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Contraction suffixes. "n's" needs no entry of its own: once every "'s"
    # has been replaced it can no longer occur.
    for suffix in (r"\'s", r"\'ve", r"n\'t", r"\'re", r"\'d", r"\'ll"):
        string = re.sub(suffix, " ", string)
    # Remaining whitelisted punctuation, then digit runs.
    string = re.sub(r"[,!()?]", " ", string)
    string = re.sub(r"\d+", " ", string)
    # Collapse whitespace last: each substitution above can create new runs
    # of spaces (e.g. "abc 123 def" -> "abc   def").
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
def remove_stop(str):
    """
    Drop English stop words from a whitespace-separated string.

    The NLTK English stop-word list is loaded on the first call only and
    cached on the function object, instead of being re-read and rebuilt for
    every input string.

    Matching is exact (case-sensitive), so callers that also want
    capitalised stop words removed should lower-case the text first.

    Args:
        str: the text to filter. The name shadows the builtin; it is kept so
            existing callers are unaffected.

    Returns:
        The remaining tokens joined with single spaces.
    """
    stop = getattr(remove_stop, "_stop_cache", None)
    if stop is None:
        stop = frozenset(stopwords.words('english'))
        remove_stop._stop_cache = stop
    return ' '.join(word for word in str.split() if word not in stop)

def stem (str):
    """
    Stem every whitespace-separated token of the input.

    Each token is passed to the module-level stemmer ``st`` and the stemmed
    tokens are joined back together with single spaces.
    """
    stemmed_tokens = []
    for token in str.split():
        stemmed_tokens.append(st.stem(token))
    return ' '.join(stemmed_tokens)

# Tokens dropped by remove_unwanted_words(): URL/user placeholders, tokeniser
# residue, topic-specific hashtags and place names, and day/month abbreviations.
# A frozenset gives constant-time membership tests.
_UNWANTED_WORDS = frozenset([
    "httpaddress", "usrid", "D", "dd", "rt", "amp", "am", "pm", '``',
    "''", "", "//", "\\", "\\'s", "\\?", "http", "httpaddresshttpaddresst", "cohttpaddressek",
    "taksim", "gezi", "park", "direngeziparki", "occupygezi", "istanbul", "turkish", "turkey",
    "protest", "direngezipark", "direnankara", "geziparki", "protesters", "protests",
    "sat", "sun", "mon", "tue", "wed", "thu", "fri",
    "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
])


def remove_unwanted_words(str):
    """
    Remove the tokens listed in ``_UNWANTED_WORDS`` from a space-separated string.

    The input is split on single spaces, so consecutive spaces produce empty
    tokens; the empty string is in the unwanted set, which removes them.

    Changes from the previous word list:
      * "mon," (with a stray comma) is now "mon", matching the other day
        abbreviations;
      * the duplicate entry written with the invalid escape sequence
        backslash-question-mark is gone (the same string is still present).

    Args:
        str: the text to filter. The name shadows the builtin; it is kept so
            existing callers are unaffected.

    Returns:
        The remaining tokens joined with single spaces.
    """
    kept = [token for token in str.split(" ") if token not in _UNWANTED_WORDS]
    return ' '.join(kept)

def toLower(str):
    """
    Lower-case every whitespace-separated token of the input.

    Returns the lower-cased tokens joined with single spaces, so any run of
    whitespace in the input is normalised to one space.
    """
    lowered = []
    for token in str.split():
        lowered.append(token.lower())
    return ' '.join(lowered)

def word_len (str):
    """
    Keep only the tokens that are between 2 and 6 characters long (inclusive).

    Returns the surviving tokens joined with single spaces.
    """
    return ' '.join(token for token in str.split() if 1 < len(token) < 7)

def sent_len (str):
    """
    Keep only strings with at least three whitespace-separated tokens.

    Returns the tokens re-joined with single spaces, or None when the input
    has fewer than three tokens.
    """
    tokens = str.split()
    if len(tokens) < 3:
        return None
    return ' '.join(tokens)

In [5]:
def _clean_example(line):
    """Run one raw line through the cleaning pipeline; returns None when it is filtered out."""
    text = line.strip()
    # Lower-casing comes first so that stop-word removal also catches
    # capitalised stop words.
    for step in (toLower, remove_stop, clean_str, remove_unwanted_words, word_len, sent_len):
        text = step(text)
    return text


def _load_examples(data_file):
    """Read one example per line from data_file and return the cleaned, non-empty examples."""
    with open(data_file, "r") as handle:
        cleaned = [_clean_example(line) for line in handle]
    # sent_len() yields None for examples with fewer than three tokens.
    return [example for example in cleaned if example]


def load_data_and_labels_shuffled(positive_data_file, negative_data_file):
    """
    Load positive and negative examples from two text files and build labels.

    Both files are read one example per line and cleaned by the same
    pipeline: strip, lower-case, stop-word removal, clean_str, unwanted-word
    removal, word-length filter, and a minimum-sentence-length filter.
    Previously the positive file removed stop words before lower-casing while
    the negative file did the opposite, so the two classes were cleaned
    differently. Both now lower-case first.

    Despite the function name, no shuffling happens here: positive examples
    come first, followed by the negative ones.

    Args:
        positive_data_file: path of the file holding the positive examples.
        negative_data_file: path of the file holding the negative examples.

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` is the list of cleaned
        strings; ``y`` is a NumPy array with 1 for every positive example and
        0 for every negative one, in the same order as ``x_text``.
    """
    positive_examples = _load_examples(positive_data_file)
    negative_examples = _load_examples(negative_data_file)

    x_text = positive_examples + negative_examples

    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [None]:
# Load and clean the positive / negative tweet files and build the labels.
positive_tweets_path = 'Data/Turkish_tweets_CF_results_09_05_2018_prccd_pos.txt'
negative_tweets_path = 'Data/Turkish_tweets_CF_results_09_05_2018_prccd_neg.txt'
tweets_text, tweets_y = load_data_and_labels_shuffled(positive_tweets_path, negative_tweets_path)

In [None]:
# Setup vocabulary processor
# NOTE(review): `learn`, `sentence_size` and `min_word_freq` are not defined in
# any cell shown in this notebook. Presumably `learn` is
# `from tensorflow.contrib import learn` and the other two are
# hyper-parameters; confirm and define them before this cell.
vocab_processor = learn.preprocessing.VocabularyProcessor(sentence_size , min_frequency=min_word_freq)

# Have to fit transform to get length of unique words.
# NOTE(review): the result of this call is discarded, and no `fit` call is
# visible before this point although the comment above mentions one; verify.
vocab_processor.transform(tweets_text)
# NOTE(review): this is the number of items produced by transform(tweets_text).
# Presumably that is one item per tweet, i.e. the number of tweets rather than
# a vocabulary or embedding size; confirm.
embedding_size = len([x for x in vocab_processor.transform(tweets_text)])
print("embedding size = ", embedding_size)

In [None]:
# Split up data set into train/test (70% / 30%), keeping the original example
# order inside each split.
# NOTE(review): no random seed is set in the cells shown, so the split changes
# between runs.
n_samples = len(tweets_text)
train_indices = np.random.choice(n_samples, round(n_samples * 0.7), replace=False)
# Membership tests against a set are O(1); `ix in <ndarray>` scans the whole
# array for every example, which made the split quadratic.
train_index_set = set(train_indices.tolist())
test_indices = np.array([ix for ix in range(n_samples) if ix not in train_index_set])
texts_train = [x for ix, x in enumerate(tweets_text) if ix in train_index_set]
texts_test = [x for ix, x in enumerate(tweets_text) if ix not in train_index_set]
target_train = [x for ix, x in enumerate(tweets_y) if ix in train_index_set]
target_test = [x for ix, x in enumerate(tweets_y) if ix not in train_index_set]

In [None]:
# Process vocabulary: fit_transform is called on the training texts and
# transform on the test texts; both results are materialised as NumPy arrays.
train_ids = vocab_processor.fit_transform(texts_train)
texts_train = np.array(list(train_ids))
test_ids = vocab_processor.transform(texts_test)
texts_test = np.array(list(test_ids))
n_words = len(vocab_processor.vocabulary_)

In [None]:
# Summary of the vocabulary and of the class balance in each split.
print('Total words: %d' % n_words)

for split_name, split_texts, split_targets in (
    ("train", texts_train, target_train),
    ("test", texts_test, target_test),
):
    print(split_name + " size", len(split_texts))
    # list.count compares every stored label with the one-element list.
    print("positive samples", split_targets.count([1]))
    print("negative samples", split_targets.count([0]))

n_words = len(vocab_processor.vocabulary_)
print('no. words', n_words)

In [None]:
#low level TF API
import tensorflow as tf

# Network dimensions.
# Assumes texts_train is the 2-D array of word ids built in the vocabulary
# cell (one row per tweet) - TODO confirm.
n_inputs = texts_train.shape[1]
n_hidden1 = 300
n_hidden2 = 100
# The labels built by load_data_and_labels_shuffled are 0 / 1, so two output
# units are enough (the previous value, 10, matched no label set in this notebook).
n_outputs = 2

# Graph inputs. They were referenced by the cells below but never created.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None,), name="y")


def neuron_layer(X, n_neurons, name, activation=None):
    """
    Fully connected layer built with tf.layers.dense.

    Args:
        X: 2-D input tensor.
        n_neurons: number of units in the layer.
        name: name given to the layer.
        activation: "relu" applies a ReLU; any other value leaves the
            output linear.

    Returns:
        The layer's output tensor.
    """
    activation_fn = tf.nn.relu if activation == "relu" else None
    return tf.layers.dense(X, n_neurons, activation=activation_fn, name=name)


#creating hidden layers
with tf.name_scope("dnn"):
    hidden1 = neuron_layer(X, n_hidden1, "hidden1", activation="relu")
    hidden2 = neuron_layer(hidden1, n_hidden2, "hidden2", activation="relu")
    logits = neuron_layer(hidden2, n_outputs, "outputs")

In [None]:
# Loss: mean of the sparse softmax cross-entropy between the labels `y` and
# the network's logits.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y,
    )
    loss = tf.reduce_mean(xentropy, name="loss")

In [None]:
# prediction: softmax probabilities computed from the network's logits.
# The result was previously bound to the misspelt name `predction` and read
# from `d_logits`, which is not defined in any cell shown; the evaluation
# cell refers to `prediction`.
with tf.name_scope("prediction"):
    prediction = tf.nn.softmax(logits)

In [None]:
# Gradient descent optimization: one training step minimises `loss`.
learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [None]:
#the accuracy metric
with tf.name_scope("evaluation"):
    # A sample counts as correct when its true label has the highest logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    # Streaming AUC over the probability assigned to class 1. The labels are
    # the same `y` used for accuracy (the previous `y_target` is not defined
    # in any cell shown), and the probabilities are derived from `logits`
    # directly, so the metric gets one score per sample.
    # tf.metrics.auc returns (auc_value, update_op); as before, label_auc is
    # bound to the second element.
    positive_prob = tf.nn.softmax(logits)[:, 1]
    _, label_auc = tf.metrics.auc(labels=y, predictions=positive_prob)

In [None]:
# One op that runs both the global and the local variable initializers.
# tf.group returns an Operation, so both `sess.run(init)` and `init.run()`
# (inside a default session) work; a plain Python list has no `.run()`.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

In [None]:
n_epoches = 400
batch_size = 50

# Train on this notebook's own split. The previous loop read from `mnist`,
# which is not defined in any cell shown.
# X and y are assumed to be the graph's input and label placeholders
# (they are not created in this cell) - TODO confirm.
train_features = np.asarray(texts_train)
train_labels = np.asarray(target_train)
test_features = np.asarray(texts_test)
test_labels = np.asarray(target_test)
n_train = len(train_features)

# model training
with tf.Session() as sess:
    # sess.run accepts a single op as well as a list of ops, so this works
    # whichever way `init` was built.
    sess.run(init)
    for epoch in range(n_epoches):
        # Visit the training examples in a fresh random order every epoch.
        epoch_order = np.random.permutation(n_train)
        for start in range(0, n_train, batch_size):
            batch_idx = epoch_order[start:start + batch_size]
            X_batch, y_batch = train_features[batch_idx], train_labels[batch_idx]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Training accuracy is measured on the last mini-batch of the epoch.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: test_features, y: test_labels})
        print(epoch, "train_accuracy:", acc_train, "test_accuracy:", acc_test)
