## Country Level: AraVec Word Embeddings with CNNs and RNNs

**Subtask 1**: Country-level dialect identification: A total of 21,000 tweets, covering all 21 Arab countries. 

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import re

from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# read the data
def read_files(path):
    """Load a tab-separated file into a DataFrame.

    Prints the (rows, columns) shape of the loaded table, then returns
    the DataFrame.
    """
    frame = pd.read_csv(path, sep='\t')
    shape = frame.shape
    print ('The shape of the data: ', shape)
    return frame

In [3]:
train_df = read_files('../data/train_dev/Subtask_1.2+2.2_DA/DA_train_labeled.tsv')
dev_df = read_files('../data/train_dev/Subtask_1.2+2.2_DA/DA_dev_labeled.tsv')
test_df = read_files('../data/test/Subtask_1.2+2.2_DA/DA_test_unlabeled.tsv')
train_df

The shape of the data:  (21000, 4)
The shape of the data:  (5000, 4)
The shape of the data:  (5000, 2)


Unnamed: 0,#1_tweetid,#2_tweet,#3_country_label,#4_province_label
0,TRAIN_0,حاجة حلوة اكيد,Egypt,eg_Faiyum
1,TRAIN_1,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,Iraq,iq_Dihok
2,TRAIN_2,ابشر طال عمرك,Saudi_Arabia,sa_Ha'il
3,TRAIN_3,منطق 2017: أنا والغريب علي إبن عمي وأنا والغري...,Mauritania,mr_Nouakchott
4,TRAIN_4,شهرين وتروح والباقي غير صيف ملينا,Algeria,dz_El-Oued
...,...,...,...,...
20995,TRAIN_20995,هذا أناني و نافخ روحو,Algeria,dz_Ouargla
20996,TRAIN_20996,ابا أتعلم ارسم URL …,Iraq,iq_Basra
20997,TRAIN_20997,كلمة وقح شكلك توك الا متعلمتنها كثير تقوليها ب...,Iraq,iq_Basra
20998,TRAIN_20998,ربنا ما يوريك الناس الدحيحة لما يترَوشِنوا,Egypt,eg_Gharbia


In [4]:
# clean data
# Patterns stripped from every tweet, applied in this order.
# Raw strings are used so that '\d' reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
_NORMALIZE_PATTERNS = (
    r'URL',           # literal 'URL' token (link placeholder in the tweets)
    r'USER',          # literal 'USER' token
    r'#',             # hashtag marker; the tag text itself is kept
    r'\d+',           # digits (for str patterns this includes non-ASCII digits)
    r'-',             # hyphens
    r'[a-zA-Z0-9]+',  # Latin-script words and ASCII digits
    r'!',             # exclamation marks
    r':',             # colons
    r'[()]',          # parentheses
    r'☻',             # smiley symbol
    r'[""]',          # double quotes
    r'é',             # accented e
    r'/',             # slashes
    r'؟',             # Arabic question mark
)


def normalize(text):
    """Strip placeholders, digits, Latin text and selected symbols from a tweet.

    Parameters
    ----------
    text : object
        The tweet; it is converted with ``str()`` first, so non-string
        values (e.g. ``None`` or numbers) are accepted.

    Returns
    -------
    str
        The text with every match of the patterns above removed. Whitespace
        is left untouched, so runs of spaces may remain where tokens were
        deleted.
    """
    normalized = str(text)
    for pattern in _NORMALIZE_PATTERNS:
        normalized = re.sub(pattern, '', normalized)
    return normalized

In [5]:
# Normalize the tweet column of each split (train, dev, test), overwriting
# the column with the normalized text.
for split_df in (train_df, dev_df, test_df):
    split_df['#2_tweet'] = split_df['#2_tweet'].progress_apply(normalize)


100%|██████████| 21000/21000 [00:00<00:00, 48768.14it/s]
100%|██████████| 5000/5000 [00:00<00:00, 47435.61it/s]
100%|██████████| 5000/5000 [00:00<00:00, 49886.70it/s]


In [6]:
# delete some stopwords 
# Stop words dropped by clean(). Built once as a frozenset so membership
# tests are O(1) instead of rebuilding and scanning a list on every call.
DEFAULT_STOPS = frozenset([
    'و', 'الله', 'في', 'على', 'عن', 'إلى', 'الى', 'من', 'ما', 'لا', 'انا', 'أنا',
])


def clean(text, stops=DEFAULT_STOPS):
    """Remove stop words from a tweet.

    Parameters
    ----------
    text : object
        The tweet; converted with ``str()`` before processing.
    stops : collection of str, optional
        Tokens to drop. Defaults to the built-in Arabic stop-word set.

    Returns
    -------
    str
        The remaining whitespace-separated tokens joined by single spaces
        (so leading/trailing/repeated whitespace is collapsed).
    """
    return " ".join(w for w in str(text).split() if w not in stops)

In [7]:
# Drop stop words from the tweet column of each split (train, dev, test),
# overwriting the column with the cleaned text.
for split_df in (train_df, dev_df, test_df):
    split_df['#2_tweet'] = split_df['#2_tweet'].progress_apply(clean)

100%|██████████| 21000/21000 [00:00<00:00, 182242.57it/s]
100%|██████████| 5000/5000 [00:00<00:00, 180283.86it/s]
100%|██████████| 5000/5000 [00:00<00:00, 179778.49it/s]


In [8]:
# Show the training tweet column after normalization and stop-word removal.
train_df.loc[:, '#2_tweet']

0                                           حاجة حلوة اكيد
1        عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...
2                                            ابشر طال عمرك
3        منطق والغريب علي إبن عمي وأنا والغريب وإبن عمي...
4                        شهرين وتروح والباقي غير صيف ملينا
                               ...                        
20995                                  هذا أناني نافخ روحو
20996                                     ابا أتعلم ارسم …
20997    كلمة وقح شكلك توك الا متعلمتنها كثير تقوليها ب...
20998              ربنا يوريك الناس الدحيحة لما يترَوشِنوا
20999                 حرام يا منى ليه بس السيره دي ع الصبح
Name: #2_tweet, Length: 21000, dtype: object

In [9]:
# Model inputs: the tweet text of each split, coerced to str.
train_X = train_df["#2_tweet"].astype(str)
dev_X = dev_df["#2_tweet"].astype(str)
test_X = test_df["#2_tweet"].astype(str)

# Targets: country labels for the train and dev splits.
train_y = train_df['#3_country_label']
dev_y = dev_df['#3_country_label']

In [10]:
# Settings used when preprocessing the data with Keras.
maxlen = 100          # maximum number of words kept per tweet
max_features = 50000  # number of unique words to use
embed_size = 300      # length of each word vector

In [11]:
# Fit the vocabulary on the training tweets only, then reuse it for every split.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_X)

# Convert each split's texts into sequences of integer word ids.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, dev_X, test_X)
)

# Bring every sequence to a fixed length of `maxlen` ids.
X_train, X_dev, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_sequences, dev_sequences, test_sequences)
)

In [12]:
# Map country names to integer ids; the mapping is learned from the
# training labels and then applied to the dev labels.
encoder = LabelEncoder()
train_label_ids = encoder.fit(train_y).transform(train_y)
dev_label_ids = encoder.transform(dev_y)

# Number of classes = largest training id + 1.
N_CLASSES = train_label_ids.max() + 1

# One-hot encode the integer ids.
y_train = to_categorical(train_label_ids, N_CLASSES)
y_dev = to_categorical(dev_label_ids, N_CLASSES)
print('Shape of label tensor:', y_train.shape)

Shape of label tensor: (21000, 21)


# The AraVec Word Embeddings

In [13]:
import gensim
import itertools
# Load the AraVec model for Arabic word embeddings - twitter-CBOW (300 vector size).
print ('please wait ... loading the AraVec')
aravec_model = gensim.models.Word2Vec.load('../aravec/full_grams_cbow_300_twitter.mdl')
print (aravec_model)

# gensim >= 4 replaced KeyedVectors.vocab with key_to_index; accept either so
# this cell does not depend on the installed gensim major version.
aravec_vectors = aravec_model.wv
aravec_words = (
    aravec_vectors.key_to_index
    if hasattr(aravec_vectors, 'key_to_index')
    else aravec_vectors.vocab
)

# word -> embedding vector lookup (read later when the embedding matrix is filled).
my_dict = {word: aravec_vectors[word] for word in aravec_words}
    
#print the first 2 words and their vectors    
# N = 2
# out = dict(itertools.islice(my_dict.items(), N))
# print (out)

please wait ... loading the AraVec
Word2Vec(vocab=1476715, size=300, alpha=0.025)


In [14]:
# One row per tokenizer word id, sized from the shared preprocessing settings
# instead of repeating the literals 50000 / 300. Rows for words that AraVec
# does not contain stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, index in tokenizer.word_index.items():
    if index >= max_features:
        # Id falls outside the matrix. Skip it (rather than break) so the
        # result does not depend on the iteration order of word_index.
        continue
    embedding_vector = my_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix.shape

(50000, 300)

# CNN + LSTM

In [15]:
#import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input
from tensorflow.keras.layers import GlobalAveragePooling1D, Embedding

# CNN + LSTM classifier on top of the frozen AraVec embeddings.
# Layer sizes that must agree with the preprocessing (vocabulary size, vector
# length, sequence length, number of classes) are taken from the shared
# variables instead of being repeated as literals.
model_1 = Sequential()
model_1.add(Embedding(max_features, embed_size, input_length=maxlen,
                      weights=[embedding_matrix], trainable=False))
model_1.add(Dropout(0.2))
model_1.add(Conv1D(64, 5, activation='relu'))
model_1.add(MaxPooling1D(pool_size=4))
model_1.add(LSTM(300))
# N_CLASSES is a NumPy integer; cast so the layer receives a plain int.
model_1.add(Dense(int(N_CLASSES), activation='softmax'))
model_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Train on the training split; `train` holds whatever fit() returns.
train = model_1.fit(x=X_train, y=y_train, batch_size=50, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Apply to the validation data

In [17]:
# Model outputs for every tweet in the validation (dev) split.
pred_dev_y = model_1.predict([X_dev], batch_size=50, verbose=1)
# argsort of each row's scores, with the row order then reversed.
indexes = np.argsort(pred_dev_y)[::-1]

# Highest-scoring class id for each dev tweet.
labels = pred_dev_y.argmax(axis=-1)
print('Labels are: ',labels)

# Map class ids back to the original country labels.
dev_y_predicted = encoder.inverse_transform(labels)
print ('The length of predicted labels is: ', len(dev_y_predicted))

# Write one predicted label per line.
with open("../prediction_files/predicted_dev_labels_aravec_cbow_cnn_3.txt", "w") as f:
    f.writelines(str(s) + "\n" for s in dev_y_predicted)

Labels are:  [ 3  3  3 ... 11 11 14]
The length of predicted labels is:  5000


In [18]:
from sklearn.metrics import roc_auc_score

# These metrics are computed on the labelled dev split, so they are reported
# as dev scores (the previous "test auc" label was misleading).
dev_probabilities = model_1.predict(X_dev)
output_test = dev_probabilities  # original variable name, kept for any later use
print("dev auc:", roc_auc_score(y_dev, dev_probabilities))
dev_loss, dev_acc = model_1.evaluate(X_dev, y_dev)
dev_loss, dev_acc

test auc: 0.7228674861858592


(2.2543509006500244, 0.3774000108242035)

# Apply to the test data

In [19]:
# Model outputs for every tweet in the unlabeled test split.
pred_test_y = model_1.predict([X_test], batch_size=50, verbose=1)
# Computed from the test predictions (previously this re-used pred_dev_y).
indexes = np.argsort(pred_test_y)[::-1]

# Highest-scoring class id for each test tweet.
labels = np.argmax(pred_test_y, axis=-1)

# Map class ids back to the original country labels.
test_y_predicted = encoder.inverse_transform(labels)

# Write one predicted label per line. The test predictions are written here;
# the previous version wrote dev_y_predicted into the test file.
with open("../prediction_files/predicted_test_labels_aravec_cbow_cnn_3.txt", "w") as f:
    for s in test_y_predicted:
        f.write(str(s) +"\n")

