In [1]:
from __future__ import print_function

from collections import Counter
import itertools
import numpy as np
import tensorflow as tf
import re
import os
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
def load_data_and_labels_from_many_files(data_folder, data_files):
    """
    Loads sentences from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    print("Loading data...")
    x_text = []
    y = []

    for i, data_file in enumerate(data_files):

        sentences = list(open(data_folder + "/" + data_file, "r").readlines())
        sentences = [s.strip() for s in sentences]
        # Split by words
        # sentences = [clean_str(s) for s in sentences]
        sentences = [s.split() for s in sentences]
        x_text += sentences
        # Labels as numbers
        labels = [i for s in sentences]
        y += labels

    # Generate one-hot labels
    y = to_categorical(y, num_classes=len(data_files))

    return x_text, y

def load_data_and_labels_from_one_file(data_folder, data_file):
    """
    Loads sentences from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    print("Loading data...")
    labels = ["EGY", "GLF", "LAV", "MSA", "NOR"]
    x_text = []
    y = []

    with open(data_folder + "/" + data_file, "r") as f_in:
        for line in f_in:
            sentence, label = line.split("\t")
            # Split by words
            sentence = sentence.strip().split()
            x_text.append(sentence)
            # Labels as numbers
            y.append(labels.index(label.strip("\n")))

    # Generate one-hot labels
    y = to_categorical(y, num_classes=len(labels))

    return x_text, y

def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to be the length of the longest sentence.
    Returns padded sentences.
    """
    print("Padding sentences...")
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
        
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from token to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    print("Building word vocabulary...")
    word_counts = Counter(itertools.chain(*sentences))
    
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    
    return vocabulary, vocabulary_inv


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    print("Converting to ids...")
    x = np.array([
            [vocabulary[word] for word in sentence]
            for sentence in sentences])
    y = np.array(labels)
    
    return x, y


In [3]:
data_folder = "../data/vardial2017-sample"
data_files = ["EGY", "GLF", "LAV", "MSA", "NOR"]
sentences, labels = load_data_and_labels_from_many_files(data_folder, data_files)
print(len(sentences))
print(sentences[0])

Loading data...
50
['tthdm', 'AlmsAjd', 'fy', "synA'", 'wAl>bAt$y', 'sxryp', 'mn', 'Alhjrp', 'Alnbwyp', 'fy', 'Aljrydp', 'Alrsmyp', 'lmA', 'mAdty', 'Altrbyp', 'Al<slAmyp', 'mn', 'AlmdArs']


In [4]:
data_folder = "../data/vardial2018-sample"
train_file = "train.words"
dev_file = "dev.words"
# Step 1: Read in data
sentences_train, labels_train = load_data_and_labels_from_one_file(data_folder, train_file)
sentences_dev, labels_dev = load_data_and_labels_from_one_file(data_folder, train_file)
sentences = sentences_train + sentences_dev
labels = np.concatenate((labels_train,labels_dev))
print(len(sentences))
print(sentences[0])
print(labels)

Loading data...
Loading data...
20
['AlkAmyrwn', 'AlkAmlp']
[[0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [5]:
# Step 2: Pad sentences and convert to ids
sentences_padded = pad_sentences(sentences)
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x, y = build_input_data(sentences_padded, labels, vocabulary)

# Step 3: Split train/test set
dev_sample_index = len(sentences_train)
print(dev_sample_index)
x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]

vocab_size = len(vocabulary)
sentence_size = x_train.shape[1]

print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
print('train shape:', x_train.shape)
print('dev shape:', x_dev.shape)
print('vocab_size', vocab_size)
print('sentence max words', sentence_size)

Padding sentences...
Building word vocabulary...
Converting to ids...
10
Train/Dev split: 10/10
train shape: (10, 48)
dev shape: (10, 48)
vocab_size 131
sentence max words 48


In [6]:
# Step 4: Create datasets and iterator
batch_size = 5
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.shuffle(10000) # to shuffle your data
train_data = train_data.batch(batch_size)
test_data = tf.data.Dataset.from_tensor_slices((x_dev, y_dev))
test_data = test_data.batch(batch_size)
print(test_data.output_shapes)

(TensorShape([Dimension(None), Dimension(48)]), TensorShape([Dimension(None), Dimension(5)]))


In [59]:
iterator = tf.data.Iterator.from_structure(train_data.output_types, 
                                                   train_data.output_shapes)
sentence, label = iterator.get_next()
# shape = [batch_size, sentence_length],[batch_size, num_classes]

sentence_length = sentence.shape[1].value
num_classes = label.shape[1].value
print(sentence_length)
print(num_classes)
train_init = iterator.make_initializer(train_data)  # initializer for train_data
test_init = iterator.make_initializer(test_data)    # initializer for train_data

48
5
