In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import keras
from keras import layers

import sklearn
from sklearn import svm

import re
import nltk
import spacy
import string

In [3]:
def load_data(data_path):
  training_data = pd.read_csv(os.path.join(data_path, 'training.txt'),
                            sep = ',', header=None)
  test_data = pd.read_csv(os.path.join(data_path, 'test.txt'),
                              sep = ',', header=None)
  val_data = pd.read_csv(os.path.join(data_path, 'validation.txt'),
                              sep = ',', header=None)
  training_data.columns = ['Id', 'Latitude', 'Longitude', 'Tweet']
  val_data.columns = ['Id', 'Latitude', 'Longitude', 'Tweet']
  test_data.columns = ['Id', 'Tweet']
  return training_data, val_data, test_data

In [4]:
data_path = '/content/drive/MyDrive/German Tweets Geolocation/data/'
training_data, val_data, test_data = load_data(data_path)

In [5]:
training_ids = training_data['Id'].values
training_latitudes = training_data['Latitude'].values
training_longitudes = training_data['Longitude'].values
training_tweets = training_data['Tweet'].values

val_ids = val_data['Id'].values
val_latitudes = val_data['Latitude'].values
val_longitudes = val_data['Longitude'].values
val_tweets = val_data['Tweet'].values

test_ids = test_data['Id'].values
test_tweets = test_data['Tweet'].values

In [6]:
!pip install emot

Collecting emot
  Downloading https://files.pythonhosted.org/packages/49/07/20001ade19873de611b7b66a4d5e5aabbf190d65abea337d5deeaa2bc3de/emot-2.1-py3-none-any.whl
Installing collected packages: emot
Successfully installed emot-2.1


In [None]:
# convert emojis and emoticons to words
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
def convert_emojis_and_emoticons_to_word(tweet):
  for emote in UNICODE_EMO:
    tweet = tweet.replace(emote, 
                          " " + "_".join(UNICODE_EMO[emote]
                                  .replace(",","")
                                  .replace(":","").split()) + " ")
  for emote in EMOTICONS:
    tweet = re.sub(u'('+emote+')', 
                   " " + "_".join(EMOTICONS[emote]
                            .replace(",","")
                            .split()) + " ", tweet)
    
  return tweet


training_tweets = [convert_emojis_and_emoticons_to_word(t) for t in training_tweets]
val_tweets = [convert_emojis_and_emoticons_to_word(t) for t in val_tweets]
test_tweets = [convert_emojis_and_emoticons_to_word(t) for t in test_tweets]

In [7]:
# convert emojis and emoticons to words
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
def convert_emojis_and_emoticons_to_word(tweet):
  for emote in UNICODE_EMO:
    tweet = tweet.replace(emote, '')
  for emote in EMOTICONS:
    tweet = re.sub(u'('+emote+')', '', tweet)
    
  return tweet


training_tweets = [convert_emojis_and_emoticons_to_word(t) for t in training_tweets]
val_tweets = [convert_emojis_and_emoticons_to_word(t) for t in val_tweets]
test_tweets = [convert_emojis_and_emoticons_to_word(t) for t in test_tweets]

In [8]:
# to lower and remove punctuation
training_tweets = [re.sub('[^\w\s]','', t.lower()) for t in training_tweets]
val_tweets = [re.sub('[^\w\s]','', t.lower()) for t in val_tweets]
test_tweets = [re.sub('[^\w\s]','', t.lower()) for t in test_tweets]

In [9]:
!python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 2.7MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907057 sha256=42eb32f87062371c6238a440b30d12ceab6330b2f7a77c969499196734093ced
  Stored in directory: /tmp/pip-ephem-wheel-cache-z0_iqpq2/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [10]:
# lemmatize
import spacy

nlp = spacy.load('de')
allowed = ['NOUN', 'ADJ', 'VERB', 'ADV']

data_lemmatized = []
for tweet in training_tweets:
    data_lemmatized.append(" ".join([token.lemma_ for token in nlp(tweet) if token.pos_ in allowed]))
training_tweets = data_lemmatized

data_lemmatized = []
for tweet in val_tweets:
    data_lemmatized.append(" ".join([token.lemma_ for token in nlp(tweet) if token.pos_ in allowed]))
val_tweets = data_lemmatized

data_lemmatized = []
for tweet in test_tweets:
    data_lemmatized.append(" ".join([token.lemma_ for token in nlp(tweet) if token.pos_ in allowed]))
test_tweets = data_lemmatized

In [11]:
# eliminate stopwords
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words('german'))

def eliminate_stopwords(tweet):
  return " ".join([word for word in tweet.split()
                  if word not in stop_words])

training_tweets = [eliminate_stopwords(tweet) for tweet in training_tweets]
val_tweets = [eliminate_stopwords(tweet) for tweet in val_tweets]
test_tweets = [eliminate_stopwords(tweet) for tweet in test_tweets]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# remove most/least common
from collections import Counter
counter = Counter()
for tweet in training_tweets:
  for word in tweet.split():
    counter[word] += 1

most_common = set([word for (word, count) in counter.most_common(10)])
n = len(counter.most_common())
least_common = set([word for (word, count) in counter.most_common()[:n-9:-1]])

def remove_most_least_common(tweet):
  return " ".join([word for word in tweet.split()
                  if word not in most_common and word not in least_common])

training_tweets = [remove_most_least_common(tweet) for tweet in training_tweets]
val_tweets = [remove_most_least_common(tweet) for tweet in val_tweets]
test_tweets = [remove_most_least_common(tweet) for tweet in test_tweets]

In [None]:
# replace empty strings with "EMPTY"
for i in range(len(training_tweets)):
  if not training_tweets[i]:
    training_tweets[i] = 'EMPTY'

for i in range(len(val_tweets)):
  if not val_tweets[i]:
    val_tweets[i] = 'EMPTY'

for i in range(len(test_tweets)):
  if not test_tweets[i]:
    test_tweets[i] = 'EMPTY'

In [12]:
train_preprocessed = pd.DataFrame(columns=['Id', 'Latitude', 'Longitude', 'Tweet'])
train_preprocessed['Id'] = training_ids
train_preprocessed['Latitude'] = training_latitudes
train_preprocessed['Longitude'] = training_longitudes
train_preprocessed['Tweet'] = training_tweets

val_preprocessed = pd.DataFrame(columns=['Id', 'Latitude', 'Longitude', 'Tweet'])
val_preprocessed['Id'] = val_ids
val_preprocessed['Latitude'] = val_latitudes
val_preprocessed['Longitude'] = val_longitudes
val_preprocessed['Tweet'] = val_tweets

test_preprocessed = pd.DataFrame(columns=['Id', 'Tweet'])
test_preprocessed['Id'] = test_ids
test_preprocessed['Tweet'] = test_tweets

In [13]:
train_preprocessed.to_csv("/content/drive/My Drive/German Tweets Geolocation/data/train_preprocessed_no_emojis.txt", index=False)
val_preprocessed.to_csv("/content/drive/My Drive/German Tweets Geolocation/data/val_preprocessed_no_emojis.txt", index=False)
test_preprocessed.to_csv("/content/drive/My Drive/German Tweets Geolocation/data/test_preprocessed_no_emojis.txt", index=False)

In [None]:
list(set(training_tweets[0].split()))

['dadruf',
 'satomkraftwerk',
 'anere',
 'fett',
 'seisch',
 'antworten',
 'bringen',
 'gang',
 'leben',
 'langen',
 'lache',
 'schlecht',
 'jahr',
 'fangen',
 'lebsch',
 'nüt',
 'frau',
 'fall',
 'kaputt',
 'gar',
 'unglück']

In [None]:
def get_grams_util(alphabet, n, grams):
    if n == 0:
        grams[0] = []
        return []
    if n == 1:
        grams[1] = list(alphabet)
        return list(alphabet)
    
    if len(grams[n - 1]) > 0:
        prev_grams = grams[n - 1]
    else:
        prev_grams = get_grams_util(alphabet, n - 1, grams)

    gram = []
    for letter in alphabet:
        for string in prev_grams:
            gram.append(letter + string)
    
    grams[n] = gram
    return gram


def get_grams(alphabet, n):
    grams = [[] for _ in range(n + 1)]
    get_grams_util(alphabet, n, grams)
    return grams

In [None]:
# trial
words = list(set(training_tweets[0].split()))[:5]

kernel = np.zeros((len(words), len(words)))

for i in range(kernel.shape[0] // 2 + 1):
  for j in range(kernel.shape[1] // 2 + 1):
    alphabet = list(set(words[i]) | set(words[j]))
    all_grams = get_grams(alphabet, 2)
    for n_gram in all_grams:
      for gram in n_gram:
        if gram in words[i] and gram in words[j]:
          kernel[i, j] += 1
          kernel[j, i] += 1

print(kernel)

[[20.  6.  4.  0.  0.]
 [ 6. 46.  8.  0.  0.]
 [ 4.  8. 16.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]


In [24]:
# TF IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(training_tweets)

train_sequences = vectorizer.transform(training_tweets)
val_sequences = vectorizer.transform(val_tweets)
test_sequences = vectorizer.transform(test_tweets)

In [14]:
# tokenize
tokenizer = Tokenizer(num_words=None, char_level=False, oov_token='OOV')
tokenizer.fit_on_texts(training_tweets)

# get sequences
train_sequences = tokenizer.texts_to_sequences(training_tweets)
val_sequences = tokenizer.texts_to_sequences(val_tweets)
test_sequences = tokenizer.texts_to_sequences(test_tweets)

In [27]:
max1, max2, max3 = max([len(s) for s in train_sequences]), max([len(s) for s in val_sequences]), max([len(s) for s in test_sequences])
max_len = max(max1, max2, max3)
print(max_len)

TypeError: ignored

In [16]:
# pad sequences
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post')
val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post')

In [17]:
train_sequences = np.array(train_sequences)
training_latitudes = np.array(training_latitudes)
training_longitudes = np.array(training_longitudes)

# labels for multioutput model
train_labels = np.zeros((training_latitudes.shape[0], 2))
train_labels[:, 0] = np.array(training_latitudes)
train_labels[:, 1] = np.array(training_longitudes)

val_sequences = np.array(val_sequences)
val_latitudes = np.array(val_latitudes)
val_longitudes = np.array(val_longitudes)

#labels for multioutput models
val_labels = np.zeros((val_latitudes.shape[0], 2))
val_labels[:, 0] = np.array(val_latitudes)
val_labels[:, 1] = np.array(val_longitudes)

test_sequences = np.array(test_sequences)

In [None]:
# construct grid search
from sklearn import metrics

nus = [0.1, 0.3, 0.5, 0.7, 1]
Cs = [0.1, 1, 10, 100]
parameters = [{'C':Cs, 'nu': nus}]

scorer = metrics.make_scorer(metrics.mean_squared_error, 
                             greater_is_better=False)

svr_lat_grid = svr_long_grid = sklearn.model_selection.GridSearchCV(svm.NuSVR(), 
                                           parameters, 
                                           cv = 20,
                                           scoring=scorer)

In [None]:
# execute grid search for latitudes
svr_lat_grid.fit(train_sequences, training_latitudes)

In [None]:
svr_lat_grid.cv_results_

NameError: ignored

In [None]:
# execute grid search for longitudes
svr_long_grid.fit(traing_sequences, training_longitudes)

In [29]:
from sklearn.multioutput import MultiOutputRegressor
multi_regressor_test = MultiOutputRegressor(svm.NuSVR(verbose=True))
multi_regressor_test.fit(train_sequences, train_labels)

[LibSVM][LibSVM]

MultiOutputRegressor(estimator=NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                     gamma='scale', kernel='rbf', max_iter=-1,
                                     nu=0.5, shrinking=True, tol=0.001,
                                     verbose=True),
                     n_jobs=None)

In [30]:
from sklearn import metrics
predictions = multi_regressor_test.predict(val_sequences)
mse_1 = metrics.mean_squared_error(val_labels[:, 0], predictions[:, 0]) 
mse_2 = metrics.mean_squared_error(val_labels[:, 1], predictions[:, 1])
print(mse_1)
print(mse_2)

0.5865341952682823
1.1069406605915566


In [31]:
test_predictions = multi_regressor_test.predict(test_sequences)

In [32]:
submission_df = pd.DataFrame(columns=["id", "lat", "long"])
submission_df['id'] = test_ids
submission_df['lat'] = test_predictions[:, 0]
submission_df['long'] = test_predictions[:, 1]
submission_df.to_csv("/content/drive/My Drive/German Tweets Geolocation/nu_svr_multioutput_submission_tfidf.txt", index=False)

In [None]:
# initialize embeddings
embeddings = []
embeddings.append(np.zeros(vocab_size)) # one-hot for OOV token
for char, idx in tokenizer.word_index.items():
  char_onehot = np.zeros(vocab_size)
  char_onehot[idx - 1] # idx-1 because word_index indexes from 1
  embeddings.append(char_onehot)

embeddings = np.array(embeddings)

In [None]:
# construct the model 
embedding_size = vocab_size
input_size = 750

model = keras.Sequential([
      layers.Embedding(vocab_size + 1, embedding_size, 
                      input_length=input_size, weights = [embeddings]),
      layers.Conv1D(256, 7, activation=keras.activations.relu),
      layers.MaxPooling1D(3),
      layers.Conv1D(256, 7, activation=keras.activations.relu),
      layers.MaxPooling1D(3),
      layers.Conv1D(256, 3, activation=keras.activations.relu),
      layers.Conv1D(256, 3, activation=keras.activations.relu),
      layers.Conv1D(256, 3, activation=keras.activations.relu),
      layers.Conv1D(256, 3, activation=keras.activations.relu),
      layers.GlobalMaxPooling1D(),
      layers.Dense(1024, activation=keras.activations.relu),
      layers.Dropout(0.5),
      layers.Dense(1024, activation=keras.activations.relu),
      layers.Dropout(0.5),
      layers.Dense(2)
  ]
)

optimizer = keras.optimizers.Adam(learning_rate=10e-3)
loss = keras.losses.MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 750, 1008)         1017072   
_________________________________________________________________
conv1d (Conv1D)              (None, 744, 256)          1806592   
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 248, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 242, 256)          459008    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 80, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 78, 256)           196864    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 76, 256)           1

In [None]:
y_train_shape = (training_latitudes.shape[0], 2)

x_train = train_sequences
y_train = np.zeros(y_train_shape)
y_train[:, 0] = training_latitudes
y_train[:, 1] = training_longitudes

x_val = val_sequences
y_val_shape = (val_latitudes.shape[0], 2)
y_val = np.zeros(y_val_shape)
y_val[:, 0] = val_latitudes
y_val[:, 1] = val_longitudes

In [None]:
model.fit(x_train, y_train, validation_split=0.3,
          batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f29b65e4e48>

In [None]:
predictions = model.predict(x_val)