In [61]:
import re
# requires >= python 3.8
from typing import Callable, Dict, Literal

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer

In [62]:
def read_files(path):
    """
    Load a tab-separated file into a DataFrame.

    Prints the frame's shape as a quick sanity check and returns the frame.
    """
    frame = pd.read_table(path)  # read_table defaults to a tab separator
    print('shape', frame.shape)
    return frame

# Load the labeled train/dev splits and the unlabeled test split, in that order.
train_df, dev_df, test_df = (
    read_files(tsv_path)
    for tsv_path in (
        "DA_train_labeled.tsv",
        "DA_dev_labeled.tsv",
        "DA_test_unlabeled.tsv",
    )
)

shape (21000, 4)
shape (5000, 4)
shape (5000, 2)


## Create a custom feature extractor by extending `DictVectorizer`

In [45]:
# Type alias for a binary feature value: the literal 0 or the literal 1.
Binary = Literal[0, 1]

class LinguisticFeatureEncoder(DictVectorizer):
    """
    Encodes hand-picked dialect marker words as binary features.

    Every entry of ``self.pos_features`` maps a text to 1 when at least one of
    the feature's marker words occurs in it as a whitespace-delimited token
    (including at the very start or end of the text) and to 0 otherwise.
    The per-text feature dicts are then vectorized by ``DictVectorizer``.

    Keyword arguments:
        sparse: forwarded to ``DictVectorizer`` (default ``False``).
        use_negative_features: when true, additionally emit a ``NOT_<feature>``
            entry holding the complement of each positive feature
            (default ``False``).
    """

    # Feature name -> marker words.  The names become vectorizer column names,
    # so they are kept exactly as they were (including their spelling).
    MARKER_WORDS = {
        # AFRICA
        # "egy_dem": ("دي", "ده", "دى"),
        # "egypt_neg": ("مش",),
        "tunis_iterog": ("علاش",),
        "tunis_degree": ("برشا",),
        "tunis_contextualword": ("باهي",),
        "algeria": ("كاش",),
        "mor_dem": ("ديال", "ديالي", "ديالى"),
        "mauritania": ("كاغ", "ايكد"),
        "sudan": ("ياخ",),
        "somalia": ("تناطل",),
        "dijubuti": ("هاد", "هلق"),

        # ASIA
        "iraq_degree": ("خوش", "كاعد"),
        "iraq_dem": ("هاي", "دا"),
        "iraq_adj": ("فدوه", "فدوة"),
        "iraq_interrog": ("شديحس",),
        "iraq_tensemarker": ("هسه", "هسع", "لهسه"),
        "saudi_dem": ("كذا",),
        "qatar": ("وكني",),
        "bahrain": ("شفيها",),
        "emirates": ("عساه",),
        "kuwait": ("عندج",),
        "oman": ("عيل",),
        "yemen": ("كدي",),
        "syria": ("شنو",),
        "palestine": ("ليش",),
        "jordan": ("هاظ",),
        "lebanon": ("هيدي",),
    }

    def __init__(self, **kwargs):
        super().__init__(sparse=kwargs.get("sparse", False))
        self.use_negative_features = kwargs.get("use_negative_features", False)
        # all positive features: feature name -> function(text) -> 0 or 1
        self.pos_features: Dict[str, Callable[[str], Binary]] = {
            feat: self._make_matcher(words)
            for feat, words in self.MARKER_WORDS.items()
        }

    @staticmethod
    def _make_matcher(words):
        """
        Returns a function mapping a text to 1 if any of ``words`` occurs in it
        as a whole whitespace-delimited token, and to 0 otherwise.
        """
        alternatives = "|".join(re.escape(word) for word in words)
        # str.find() matches literally, so a "\s" inside its argument never
        # meant "whitespace".  The lookarounds require that the word is not
        # glued to another non-space character on either side, which also
        # accepts a word sitting at the very start or end of the text.
        pattern = re.compile(rf"(?<!\S)(?:{alternatives})(?!\S)")
        return lambda text: 1 if pattern.search(text) else 0

    @property
    def size(self) -> int:
        """Number of feature columns learned by ``fit`` (requires a fitted encoder)."""
        # feature_names_ is set by DictVectorizer.fit; unlike get_feature_names()
        # it is available in both old and new scikit-learn releases.
        return len(self.feature_names_)

    def create_feature_dict(self, datum) -> Dict[str, Binary]:
        """
        Creates a feature dictionary of str -> 1 or 0 for a single text.
        Optionally includes negated forms of each feature (i.e., NOT_*).
        """
        pos_features = {feat: fn(datum) for feat, fn in self.pos_features.items()}
        if not self.use_negative_features:
            return pos_features
        # complement as an int (1 - value) so the values stay 0/1 rather than bool
        neg_features = {f"NOT_{feat}": 1 - value for feat, value in pos_features.items()}
        return {**pos_features, **neg_features}

    def fit(self, X, y=None):
        """Learns the feature columns from an iterable of texts; returns ``self``."""
        super().fit([self.create_feature_dict(datum) for datum in X])
        return self

    def transform(self, X, y=None):
        """Encodes an iterable of texts into the fitted feature matrix."""
        return super().transform([self.create_feature_dict(datum) for datum in X])

    def fit_transform(self, X, y=None):
        """Fits on ``X`` and returns the encoded matrix for ``X``."""
        return self.fit(X, y).transform(X)

In [46]:
# Initialize the feature encoder with positive features only (no negated NOT_* features)
lfe = LinguisticFeatureEncoder(use_negative_features=False)

In [47]:
# Fit the encoder on the training tweets (cast to str first).
lfe.fit(train_df['#2_tweet'].astype(str).tolist())

In [48]:
# Encode every split with the already-fitted encoder.
# (lfe.fit_transform(...) would fit and encode the training tweets in one step.)
train_data, dev_data, test_data = (
    lfe.transform(list(split_df['#2_tweet'].astype(str)))
    for split_df in (train_df, dev_df, test_df)
)

In [49]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
# Raw country labels of the two labeled splits.
train_y, dev_y = (list(frame['#3_country_label']) for frame in (train_df, dev_df))

# Integer-encode the labels using the classes seen in the training split.
encoder = LabelEncoder().fit(train_y)
y_train, y_dev = (encoder.transform(raw_labels) for raw_labels in (train_y, dev_y))

# One-hot encode; the class count is the largest training label index plus one.
N_CLASSES = np.max(y_train) + 1
y_train = to_categorical(y_train, N_CLASSES)
y_dev = to_categorical(y_dev, N_CLASSES)
print('Shape of label tensor:', y_train.shape)
y_train[2]

Shape of label tensor: (16718, 20)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

## Build a network

Note that our engineered features have no sequential relationship, so **we do not use an RNN such as an LSTM for this architecture**.  Instead of a simple feedforward network with fully connected layers, we could use a CNN for higher-level feature extraction.

In [50]:
from tensorflow.keras.layers    import Input
from tensorflow.keras.layers    import Conv1D, MaxPooling1D, Flatten, concatenate, Dropout, Input, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models    import Model, Sequential
from tensorflow.keras.utils     import plot_model

In [54]:
def make_model(max_features: int, embedding_dim: int, num_classes: int = 20):
    """
    Builds, compiles and summarizes a feed-forward classifier.

    Architecture: Input(max_features) -> Embedding -> Flatten
    -> Dense(512, relu) -> Dense(256, relu) -> Dense(num_classes, softmax).
    The model is compiled with categorical cross-entropy, the Adam optimizer
    and the accuracy metric, and its summary is printed before returning.

    :param max_features: length of each input vector; also used as the
        Embedding layer's ``input_dim``.
    :param embedding_dim: size of the embedding produced for each input slot.
    :param num_classes: width of the softmax output layer.
    :return: the compiled Keras ``Model``.
    """
    input_c2 = Input(shape=(max_features,))
    # The sequence length is already fixed by the Input shape, so the deprecated
    # `input_length` argument is not passed; every other Embedding option is
    # left at its default.
    # NOTE(review): when fed 0/1 indicator vectors, only embedding rows 0 and 1
    # are ever looked up -- confirm an Embedding layer is really wanted here.
    embeddings_c2 = Embedding(max_features, embedding_dim)(input_c2)
    flat_c2 = Flatten()(embeddings_c2)
    hidden_c2_1 = Dense(512, activation="relu")(flat_c2)
    hidden_c2_2 = Dense(256, activation="relu")(hidden_c2_1)
    # honour the num_classes argument instead of a hard-coded 20
    outputs = Dense(num_classes, activation="softmax")(hidden_c2_2)

    # model
    model = Model(inputs=[input_c2], outputs=outputs)
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    # summarize
    model.summary()
    return model

In [55]:
# Size the output layer from the encoded labels instead of relying on the default of 20.
model = make_model(max_features=lfe.size, embedding_dim=64, num_classes=int(N_CLASSES))

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 25, 64)            1600      
_________________________________________________________________
flatten_2 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               819712    
_________________________________________________________________
dense_7 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_8 (Dense)              (None, 20)                5140      
Total params: 957,780
Trainable params: 957,780
Non-trainable params: 0
_____________________________________________________

In [56]:
# Train on the encoded training split, validating on the dev split after each epoch.
# Training stops early if val_loss fails to improve for 2 consecutive epochs.
clf = model.fit(
    x=[train_data],
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_data=([dev_data], y_dev),
    callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [57]:
# Predict class probabilities for the unlabeled test set.
# NOTE: the `*dev*` variable names are kept for compatibility; every value
# below is computed from test_data, not from the dev split.
pred_dev_y = model.predict([test_data], batch_size=50, verbose=1)

# Class indices of each sample ranked from most to least probable.
# The reversal is applied along the class axis so the sample order is untouched.
indexes = np.argsort(pred_dev_y, axis=-1)[:, ::-1]

# Most probable class index for each sample.
labels = np.argmax(pred_dev_y, axis=-1)
print('Labels are: ', labels)

# Map the class indices back to the original labels (inverse_transform).
dev_y_predicted = encoder.inverse_transform(labels)
print('The length of predicted labels is: ', len(dev_y_predicted))

# Save the predicted labels to a text file, one per line.
with open("two_forks_early.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{s}\n" for s in dev_y_predicted)



Labels are:  [3 3 3 ... 3 3 3]
The length of predicted labels is:  5000
