In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
np.random.seed(32)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.callbacks import EarlyStopping

%matplotlib inline

### The dataset


In [None]:
import pandas as pd
# read the data
def read_files(path):
    """Load a tab-separated file into a DataFrame and report its shape.

    Args:
        path: location of the TSV file, forwarded unchanged to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path, sep='\t')
    n_rows_and_cols = frame.shape
    print('The shape of the data: ', n_rows_and_cols)
    return frame

# Load the train, dev and test splits (in that order) from their TSV files.
train_df, dev_df, test_df = (
    read_files(split_path)
    for split_path in (
        'DA_train_labeled.tsv',
        'DA_dev_labeled.tsv',
        'DA_test_unlabeled.tsv',
    )
)
# Display the dev split as the cell output.
dev_df

### Preprocessing the data

In [None]:
# Column names as they appear in the TSV headers.
TWEET_COLUMN = '#2_tweet'
COUNTRY_COLUMN = '#3_country_label'

# Tweet text for each split: train, dev, test.
train_X, dev_X, test_X = (
    split[TWEET_COLUMN] for split in (train_df, dev_df, test_df)
)

# Country labels are read for the train and dev splits only.
train_y, dev_y = (split[COUNTRY_COLUMN] for split in (train_df, dev_df))

In [None]:
# Upper bound on the vocabulary size.
MAX_NB_WORDS = 10000

# Make sure every tweet is a plain Python string.
train_X, dev_X, test_X = (
    texts.astype(str) for texts in (train_X, dev_X, test_X)
)

In [None]:
# Build the word-level vocabulary from the training tweets only, then map the
# tweets of every split to lists of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(train_X)
sequences, sequences_dev, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, dev_X, test_X)
)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Inspect the length distribution of the tokenized training sequences.


In [None]:
# Number of token ids in each tokenized training tweet.
seq_lens = list(map(len, sequences))

mean_len = np.mean(seq_lens)
print("average length: %0.1f" % mean_len)
longest_len = max(seq_lens)
print("max length: %d" % longest_len)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Histogram of training sequence lengths. The figure gets a title and axis
# labels so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(seq_lens, bins=50)
ax.set(
    title="Training sequence lengths",
    xlabel="Tokens per tweet",
    ylabel="Number of tweets",
);



In [None]:
# Fixed length every sequence is padded (with 0s) or cut to.
MAX_SEQUENCE_LENGTH = 300

x_train, x_dev, x_test = (
    pad_sequences(split_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for split_sequences in (sequences, sequences_dev, sequences_test)
)

# Report the resulting array shapes, one line per split.
for caption, padded in (
    ('Shape of data tensor:', x_train),
    ('Shape of data dev tensor:', x_dev),
    ('Shape of data test tensor:', x_test),
):
    print(caption, padded.shape)


In [None]:
# Learn the label -> integer id mapping from the training labels, then apply it
# to the train and dev labels.
encoder = LabelEncoder().fit(train_y)
y_train, y_dev = (encoder.transform(labels) for labels in (train_y, dev_y))

# Show the encoded training labels.
y_train

In [None]:
# Number of target classes, read from the fitted encoder. Unlike
# np.max(y_train) + 1, this does not depend on whether y_train currently holds
# integer ids or one-hot rows, so the cell can be re-run at any point.
N_CLASSES = len(encoder.classes_)
N_CLASSES

In [None]:
# One-hot encode the labels. The integer ids are recomputed from the raw label
# columns so that re-running this cell never one-hot encodes a y_train / y_dev
# that is already one-hot.
y_train = to_categorical(encoder.transform(train_y), N_CLASSES)
y_dev = to_categorical(encoder.transform(dev_y), N_CLASSES)
print('Shape of label tensor:', y_train.shape)

# (1) Simple Network

In [None]:
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, Embedding
from tensorflow.keras.models import Model

# Size of each learned word vector.
EMBEDDING_DIM = 50

# The network reads MAX_SEQUENCE_LENGTH integer word ids per example.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Trainable lookup table: word id -> EMBEDDING_DIM-dimensional vector.
embedding_layer = Embedding(
    input_dim=MAX_NB_WORDS,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)
embedded_sequences = embedding_layer(sequence_input)

# Average the word vectors over the sequence, then classify with a softmax.
average = GlobalAveragePooling1D()(embedded_sequences)
predictions = Dense(units=N_CLASSES, activation='softmax')(average)

model = Model(inputs=sequence_input, outputs=predictions)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train the current model on the training split.
model.fit(x=x_train, y=y_train, batch_size=64, epochs=100)

In [None]:
# Class probabilities for the dev split. The score below is computed against
# y_dev, so it is reported as a dev metric rather than a test metric.
output_test = model.predict(x_dev)
print("dev auc:", roc_auc_score(y_dev, output_test))

In [None]:
# Loss and accuracy of the current model on the dev split.
dev_loss, dev_acc = model.evaluate(x=x_dev, y=y_dev)
(dev_loss, dev_acc)

In [None]:
# Show the predicted and the actual label for a slice of the dev split.
text_labels = encoder.classes_

FIRST_EXAMPLE, LAST_EXAMPLE = 50, 80

# Predict the whole slice in one call instead of calling model.predict once per
# row; each output row holds the class probabilities of one example.
slice_probabilities = model.predict(x_dev[FIRST_EXAMPLE:LAST_EXAMPLE])

for i, probabilities in enumerate(slice_probabilities, start=FIRST_EXAMPLE):
    predicted_label = text_labels[np.argmax(probabilities)]
    print(dev_X.iloc[i], "...")
    print('Actual label:' + dev_y.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

# (2) LSTM

In [None]:
# LSTM classifier over learned word embeddings.
# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Give this model its own embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing `embedding_layer` would
# start from the weights already trained by the previous model and would keep
# modifying them.
lstm_embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, trainable=True)
embedded_sequences = lstm_embedding_layer(sequence_input)

# Single LSTM layer with input and recurrent dropout, followed by a softmax.
x = LSTM(128, dropout=0.5, recurrent_dropout=0.2)(embedded_sequences)
predictions = Dense(N_CLASSES, activation='softmax')(x)


model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# Train the current model on the training split.
model.fit(x=x_train, y=y_train, batch_size=64, epochs=10)

In [None]:
# Class probabilities for the dev split. The score below is computed against
# y_dev, so it is reported as a dev metric rather than a test metric.
output_test = model.predict(x_dev)
print("dev auc:", roc_auc_score(y_dev, output_test))

In [None]:
# Dev-split loss and accuracy of the current model; evaluate returns them in
# that order because 'acc' is the only compiled metric.
score = model.evaluate(x=x_dev, y=y_dev, batch_size=64, verbose=1)
for caption, value in zip(('Dev loss:', 'Dev accuracy:'), score):
    print(caption, value)

# (3) CNN - LSTM

In [None]:
# Convolution + LSTM classifier over learned word embeddings.
# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Give this model its own embedding layer. A Keras layer instance shares its
# weights with every model it is called in, so reusing `embedding_layer` would
# start from weights already trained by the earlier models and would keep
# modifying them.
cnn_lstm_embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, trainable=True)
embedded_sequences = cnn_lstm_embedding_layer(sequence_input)

# 1D convolution with 64 output channels
x = Conv1D(64, 5)(embedded_sequences)
# MaxPool divides the length of the sequence by 5
x = MaxPooling1D(5)(x)
x = Dropout(0.5)(x)
x = Conv1D(64, 5)(x)
x = MaxPooling1D(5)(x)
# LSTM layer with a hidden size of 64
x = Dropout(0.3)(x)
x = LSTM(64)(x)
predictions = Dense(N_CLASSES, activation='softmax')(x)

model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# Train the current model on the training split.
model.fit(x=x_train, y=y_train, batch_size=128, epochs=3)

In [None]:
# Dev-split loss and accuracy of the current model; evaluate returns them in
# that order because 'acc' is the only compiled metric.
score = model.evaluate(x=x_dev, y=y_dev, batch_size=64, verbose=1)
for caption, value in zip(('Dev loss:', 'Dev accuracy:'), score):
    print(caption, value)

In [None]:
# Class probabilities for the dev split, scored against the one-hot dev labels.
output_test = model.predict(x=x_dev)
dev_auc = roc_auc_score(y_true=y_dev, y_score=output_test)
print("dev auc:", dev_auc)

In [None]:
# Show the predicted and the actual label for the first dev examples.
text_labels = encoder.classes_

N_EXAMPLES = 10

# Predict on x_dev: the tweets and gold labels printed below come from dev_X and
# dev_y, so the inputs must be the matching dev rows (not x_test rows). The
# rows are predicted in one batched call.
dev_probabilities = model.predict(x_dev[:N_EXAMPLES])

for i, probabilities in enumerate(dev_probabilities):
    predicted_label = text_labels[np.argmax(probabilities)]
    print(dev_X.iloc[i], "...")
    print('Actual label:' + dev_y.iloc[i])
    print("Predicted label: " + predicted_label + "\n")