In [11]:
from __future__ import print_function

import numpy as np
import re
import itertools
from collections import Counter
import pickle

from gensim.models import word2vec
from os.path import join, exists, split
import os

import sys
from textblob import TextBlob
import pandas as pd


"""
Use functions from data_helpers.py to get training and test instances.
"""

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' ` becomes a space
      2. the contraction suffixes 's 've n't 're 'd 'll are split off with a leading space
      3. , ! ( ) ? are surrounded by spaces so they become separate tokens
      4. runs of whitespace collapse to one space; the result is stripped and lower-cased

    Unlike the original, the bracket and question-mark tokens are emitted as
    "(", ")" and "?". The original replacement strings " \\( ", " \\) " and
    " \\? " kept their backslash, so the tokens came out as "\\(", "\\)", "\\?".

    Returns the cleaned string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Literal replacements are enough here; no regex needed. Order matches the original.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def _read_stripped_lines(path):
    """Return all lines of the text file at `path`, each stripped of surrounding whitespace."""
    # The context manager closes the handle; the original left it open.
    with open(path) as handle:
        return [line.strip() for line in handle]


def _load_split(positive_path, negative_path):
    """Load one positive/negative file pair as (tokenized sentences, one-hot labels)."""
    positive_examples = _read_stripped_lines(positive_path)
    negative_examples = _read_stripped_lines(negative_path)

    # Clean each sentence, then split it into tokens on single spaces.
    x_text = [clean_str(sent).split(" ") for sent in positive_examples + negative_examples]

    # One-hot labels: positive -> [0, 1], negative -> [1, 0], positives first.
    labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
    # reshape keeps the (n, 2) shape even when one or both files are empty.
    y = np.array(labels, dtype=int).reshape(-1, 2)

    return x_text, y


def load_data_and_labels():
    """
    Load data and labels from files.

    Reads the 100-instance lower-half files as the TRAIN split and the
    800-instance upper-half files as the TEST split from ./data.

    Returns [x_text_subst, y_subst, x_text_upper, y_upper] where each x_text_*
    is a list of token lists (positives followed by negatives) and each y_* is
    an integer array with one row per sentence ([0, 1] positive, [1, 0] negative).
    """

#-------------------- Generate TRAIN subst --------------------------------------
    x_text_subst, y_subst = _load_split(
        "./data/positive_lowerhalf_shuffled_100.txt",
        "./data/negative_lowerhalf_shuffled_100.txt",
    )

#-------------------- Generate TEST  --------------------------------------
    x_text_upper, y_upper = _load_split(
        "./data/positive_upperhalf_shuffled_800.txt",
        "./data/negative_upperhalf_shuffled_800.txt",
    )

    return [x_text_subst, y_subst, x_text_upper, y_upper]


def pad_sentences(sentences, max_length, padding_word="<PAD/>"):
    """
    Right-pads every sentence with `padding_word` until it has `max_length` tokens.
    A sentence that already has `max_length` or more tokens is copied unchanged.
    Prints the target length and returns the padded sentences as a new list.
    """
    print("Print sequence_length: ", max_length)

    # A non-positive multiplier yields an empty list, so long sentences get no padding.
    return [
        tokens + [padding_word] * (max_length - len(tokens))
        for tokens in sentences
    ]


def build_vocab(sentences):
    """
    Creates the word <-> index mappings for the given sentences.
    Words are numbered by descending frequency, so index 0 is the most common word.
    Returns [vocabulary, vocabulary_inv]: a dict word -> index and a list index -> word.
    """
    # Count every token across all sentences
    frequencies = Counter()
    for sentence in sentences:
        frequencies.update(sentence)

    # index -> word, most frequent first
    vocabulary_inv = [word for word, _count in frequencies.most_common()]

    # word -> index
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}

    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Converts sentences to rows of vocabulary indices and labels to an array.
    Every word must be a key of `vocabulary`; an unknown word raises KeyError.
    Returns [x, y] as numpy arrays.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])

    return [np.array(index_rows), np.array(labels)]


def load_data():
    """
    Load and preprocess data.

    Loads the tokenized train/test sentences, pads both splits to the length of
    the longest sentence across both, builds one vocabulary over the padded
    sentences and maps every sentence to vocabulary indices.

    Returns [x_train, y_train, x_test, y_test, vocabulary, vocabulary_inv].
    Raises ValueError (from max()) when both splits are empty.
    """

    sentences_train, labels_train, sentences_test, labels_test = load_data_and_labels()

    # Plain list concatenation: the sentences have different lengths, and
    # np.concatenate on such ragged nested lists raises ValueError on
    # NumPy >= 1.24 (it only worked via object arrays on older versions).
    sentences_all = list(sentences_train) + list(sentences_test)

    max_length = max(len(x) for x in sentences_all)

    sentences_train_padded = pad_sentences(sentences_train, max_length)
    sentences_test_padded = pad_sentences(sentences_test, max_length)

    # Train followed by test is the same token sequence as padding
    # sentences_all, so the vocabulary is unchanged without a third pass.
    vocabulary, vocabulary_inv = build_vocab(sentences_train_padded + sentences_test_padded)

    x_train, y_train = build_input_data(sentences_train_padded, labels_train, vocabulary)
    x_test, y_test = build_input_data(sentences_test_padded, labels_test, vocabulary)

    return [x_train, y_train, x_test, y_test, vocabulary, vocabulary_inv]

def cnn_load_data():
    """
    Builds the training and test instances.
    Returns x_train, y_train, x_test, y_test and the inverse vocabulary as a
    dict {index: word}; the labels are class indices instead of one-hot rows.
    """

    (x_train, y_train_onehot, x_test, y_test_onehot,
     _vocabulary, vocabulary_inv_list) = load_data()

    # list position -> word becomes dict key -> word
    vocabulary_inv = dict(enumerate(vocabulary_inv_list))

    # argmax along axis 1 gives the position of the largest entry in each row.
    # Label: positive [0, 1] -> index 1, negative [1, 0] -> index 0
    return (
        x_train,
        y_train_onehot.argmax(axis=1),
        x_test,
        y_test_onehot.argmax(axis=1),
        vocabulary_inv,
    )

In [12]:
"""
Create training and test instances.
"""
# cnn_load_data() returns the index-encoded sentences and class-index labels
# (1 = positive, 0 = negative) of both splits, plus the {index: word} mapping.
x_train, y_train, x_test, y_test, vocabulary_inv = cnn_load_data()

Print sequence_length:  374
Print sequence_length:  374
Print sequence_length:  374


In [13]:
# len(x_train)

In [14]:
# len(y_test)

In [15]:
"""
Function from w2v.py that creates and trains the word2vec model.
"""

def train_word2vec_model(sentence_matrix, vocabulary_inv, skipgram, iterations, num_features=300, min_word_count=1, context=10):
    """
    Trains a Word2Vec model on the given sentences and saves it under ./models.
    Returns the trained gensim Word2Vec model.

    inputs:
    sentence_matrix # rows of vocabulary indices, one row per sentence
    vocabulary_inv  # lookup from index to word, applied to every matrix entry
    skipgram        # forwarded to Word2Vec as `sg`
    iterations      # forwarded to Word2Vec as `iter`
    num_features    # Word vector dimensionality (Word2Vec `size`)
    min_word_count  # Minimum word count (Word2Vec `min_count`)
    context         # Context window size (Word2Vec `window`)
    """

    output_dir = 'models'
    # The file name encodes only these three settings; skipgram and iterations
    # are not part of it.
    file_stem = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
    output_path = join(output_dir, file_stem)

    print('Training Word2Vec model...')

    # Translate the index rows back into lists of words for gensim.
    corpus = []
    for row in sentence_matrix:
        corpus.append([vocabulary_inv[index] for index in row])

    embedding_model = word2vec.Word2Vec(
        corpus,
        workers=2,      # Number of threads to run in parallel
        size=num_features,
        min_count=min_word_count,
        window=context,
        sample=1e-3,    # Downsample setting for frequent words
        sg=skipgram,
        iter=iterations,
    )

    # Per the original author's note: if the model is not trained any further,
    # init_sims makes it much more memory-efficient.
    embedding_model.init_sims(replace=True)

    # Persist the model; it can be restored later with Word2Vec.load()
    if not exists(output_dir):
        os.mkdir(output_dir)
    _, saved_name = split(output_path)
    print("Saving Word2Vec model '%s'" % saved_name)
    embedding_model.save(output_path)

    return embedding_model

In [16]:
"""
Train the word2vec model on 200 training instances (100 pos./100 neg.)
plus 1600 test instances (800 pos./800 neg.).

Uses the CBOW algorithm and trains for 5 iterations.
"""
# Algorithm selector -- 0: CBOW, 1: skipgram
skipgram = 0

# number of training iterations
iterations = 5

# word vector size, minimum word frequency, context window size
embedding_dim, min_word_count, context = 50, 1, 10

model = train_word2vec_model(
    np.vstack((x_train, x_test)),  # train rows stacked on top of test rows
    vocabulary_inv,
    skipgram,
    iterations,
    num_features=embedding_dim,
    min_word_count=min_word_count,
    context=context,
)

Training Word2Vec model...
Saving Word2Vec model '50features_1minwords_10context'


In [17]:
# model.wv.most_similar('bed')

In [18]:
# model.wv.most_similar('bed')[1][0]

In [19]:
"""
    Open the positive lower-half shuffled 100 training instances file.
    Read the file line by line, find nouns ('NN', 'NNS', 'NNP', 'NNPS')
    and replace each with one of its most_similar words; the rank that is
    picked equals the current position.
    If the noun is not in the model vocabulary, or has no neighbour at that
    rank, the original word is kept.
    Write the modified and unmodified words of each line to a new file.
    Create one file per most_similar rank position (0..38).
"""

noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

# Tagging does not depend on the rank, so every line is tagged once up front
# instead of once per position.
with open("./data/positive_lowerhalf_shuffled_100.txt") as infile:
    tagged_lines = [TextBlob(line).tags for line in infile]

# word -> its (up to) 50 nearest neighbours, queried from the model only once
similar_cache = {}

new_line = []
linenr = 0

for position in range(0, 39, 1):
    filename = './data/positive_lowerhalf_shuffled_exchange_' + str(position) + '.txt'
    # Opened once per position in write mode, so re-running the cell replaces
    # the file instead of appending a second copy of every line.
    with open(filename, 'w') as outfile:
        for tags in tagged_lines:
            new_line = []
            for word, tag in tags:
                # NOTE(review): the model vocabulary comes from lower-cased
                # text, while `word` presumably keeps its original case, so
                # capitalised nouns may never match -- confirm.
                if tag in noun_tags and word in model.wv.vocab:
                    if word not in similar_cache:
                        similar_cache[word] = [
                            str(neighbour)
                            for neighbour, _score in model.wv.most_similar(word, topn=50)
                        ]
                    neighbours = similar_cache[word]
                    # Fall back to the original word when fewer than
                    # position + 1 neighbours exist (was an IndexError).
                    new_line.append(neighbours[position] if position < len(neighbours) else word)
                else:
                    new_line.append(word)

            # Progress output; linenr counts lines across all positions.
            if linenr % 10000 == 0:
                print(linenr)
            linenr = linenr + 1

            if new_line:
                outfile.write(' '.join(new_line) + '\n')

print('done.')

0
done.


In [20]:
"""
    Open the negative lower-half shuffled 100 training instances file.
    Read the file line by line, find nouns ('NN', 'NNS', 'NNP', 'NNPS')
    and replace each with one of its most_similar words; the rank that is
    picked equals the current position.
    If the noun is not in the model vocabulary, or has no neighbour at that
    rank, the original word is kept.
    Write the modified and unmodified words of each line to a new file.
    Create one file per most_similar rank position (0..38).
"""

noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}

# Tagging does not depend on the rank, so every line is tagged once up front
# instead of once per position.
with open("./data/negative_lowerhalf_shuffled_100.txt") as infile:
    tagged_lines = [TextBlob(line).tags for line in infile]

# word -> its (up to) 50 nearest neighbours, queried from the model only once
similar_cache = {}

new_line = []
linenr = 0

for position in range(0, 39, 1):
    filename = './data/negative_lowerhalf_shuffled_exchange_' + str(position) + '.txt'
    # Opened once per position in write mode, so re-running the cell replaces
    # the file instead of appending a second copy of every line.
    with open(filename, 'w') as outfile:
        for tags in tagged_lines:
            new_line = []
            for word, tag in tags:
                # NOTE(review): the model vocabulary comes from lower-cased
                # text, while `word` presumably keeps its original case, so
                # capitalised nouns may never match -- confirm.
                if tag in noun_tags and word in model.wv.vocab:
                    if word not in similar_cache:
                        similar_cache[word] = [
                            str(neighbour)
                            for neighbour, _score in model.wv.most_similar(word, topn=50)
                        ]
                    neighbours = similar_cache[word]
                    # Fall back to the original word when fewer than
                    # position + 1 neighbours exist (was an IndexError).
                    new_line.append(neighbours[position] if position < len(neighbours) else word)
                else:
                    new_line.append(word)

            # Progress output; linenr counts lines across all positions.
            if linenr % 10000 == 0:
                print(linenr)
            linenr = linenr + 1

            if new_line:
                outfile.write(' '.join(new_line) + '\n')

print('done.')

0
done.
