In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
import nltk
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import spacy

# Data Loading

In [2]:
# Load the annotated tweet corpus (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv("all_annotated.csv")
data.head()

Unnamed: 0,Tweet ID,Country,Date,Tweet,Definitely English,Ambiguous,Definitely Not English,Code-Switched,Ambiguous due to Named Entities,Automatically Generated Tweets
0,434215992731136000,TR,2014-02-14,Bugün bulusmami lazimdiii,0,0,1,0,0,0
1,285903159434563584,TR,2013-01-01,Volkan konak adami tribe sokar yemin ederim :D,0,0,1,0,0,0
2,285948076496142336,NL,2013-01-01,Bed,1,0,0,0,0,0
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0
4,286057979831275520,US,2013-01-01,Ladies drink and get in free till 10:30,1,0,0,0,0,0


`data` contains all English tweets. `data_modified` contains the English tweets excluding those that contain identifying proper nouns. We will run all our models on both datasets and compare their performance, to see whether the dialects of different countries differ in syntax and grammar rules.

In [3]:
# Column headers in this CSV carry stray trailing spaces (e.g. 'Definitely Not English '),
# so the named-entity flag is located by its stripped name instead of being hard-coded.
_named_entity_cols = [
    col for col in data.columns
    if str(col).strip() == 'Ambiguous due to Named Entities'
]
if not _named_entity_cols:
    raise KeyError("no 'Ambiguous due to Named Entities' column found in the annotated CSV")
named_entity_col = _named_entity_cols[0]

# Keep only the tweets that were not annotated as "Definitely Not English".
data = data[data['Definitely Not English '] != 1].copy()

# Previously this line repeated the "not English" filter, which made `data_modified`
# an exact copy of `data`. It now also removes the tweets flagged for named entities,
# as the description of `data_modified` requires.
# NOTE(review): assumes the 'Ambiguous due to Named Entities' annotation is the
# intended "identifying proper nouns" flag -- confirm against the annotation guide.
data_modified = data[data[named_entity_col] != 1].copy()

In [4]:
# Original (space-padded) CSV headers mapped to the short names used from here on.
_COLUMN_RENAMES = {'Tweet ':'tweet','Country ':'country'}


def _tweet_and_country(frame):
    """Keep only the tweet and country columns of `frame`, renamed, on a fresh 0..n-1 index."""
    kept = frame.filter(['Tweet ','Country '], axis=1)
    renamed = kept.rename(columns=_COLUMN_RENAMES)
    return renamed.reset_index().drop(columns='index')


data = _tweet_and_country(data)
data_modified = _tweet_and_country(data_modified)

data.head()

Unnamed: 0,tweet,country
0,Bed,NL
1,I felt my first flash of violence at some fool...,US
2,Ladies drink and get in free till 10:30,US
3,@Melanynijholtxo ahhahahahah dm!,NL
4,Fuck,US


In [5]:
# Collapse every country with fewer than 100 tweets into a single "OTHER" class.
tweets_per_country = data['country'].value_counts()
tweets_for_row = tweets_per_country[data['country']].values  # one count per tweet row
is_rare_country = tweets_for_row < 100
data.loc[is_rare_country, 'country'] = "OTHER"

# Data Processing for NN Using Unmodified Data

In [6]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [7]:
# Give every country label an integer id, in the order value_counts() lists them
# (most frequent label first), then replace the labels in place with those ids.
countries_by_frequency = data.country.value_counts().index.tolist()
numbering = {label: code for code, label in enumerate(countries_by_frequency)}
print(numbering)
data['country'] = data['country'].apply(lambda label: numbering[label])
data.head()

{'US ': 0, 'OTHER': 1, 'GB ': 2, 'ID ': 3, 'TR ': 4, 'BR ': 5, 'MY ': 6, 'PH ': 7, 'JP ': 8, 'CA ': 9}


Unnamed: 0,tweet,country
0,Bed,1
1,I felt my first flash of violence at some fool...,0
2,Ladies drink and get in free till 10:30,0
3,@Melanynijholtxo ahhahahahah dm!,1
4,Fuck,0


In [8]:
tok = spacy.load("en_core_web_sm")

# Both patterns are fixed, so they are compiled once here rather than on each call.
_NON_ASCII_RUN = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_CTRL = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers


def tokenize(text):
    """Split `text` into lower-cased tokens with the spaCy tokenizer.

    Runs of non-ASCII characters are replaced by one space; every punctuation
    character, digit, carriage return, tab and newline is replaced by a space;
    the result is lower-cased before being handed to ``tok.tokenizer``.

    Returns:
        list of token strings.
    """
    ascii_only = _NON_ASCII_RUN.sub(" ", text)
    cleaned = _PUNCT_DIGIT_CTRL.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(cleaned)]


In [9]:
# Tally how often each token occurs across all tweets.
counts = Counter()
for tweet in data['tweet']:
    counts.update(tokenize(tweet))

In [10]:
# Drop every token that was seen fewer than two times, reporting the
# vocabulary size before and after.

print("num_words before:",len(counts.keys()))
singletons = [token for token, freq in counts.items() if freq < 2]
for token in singletons:
    del counts[token]
print("num_words after:",len(counts.keys()))

num_words before: 18608
num_words after: 4287


In [11]:
# Build the vocabulary: index 0 is the empty (padding) entry, index 1 is "UNK",
# and the remaining tokens follow in the iteration order of `counts`.

words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [12]:
def encode_sentence(text, vocab2index, N=70):
    """Turn `text` into a fixed-length vector of vocabulary ids.

    The text is tokenized with ``tokenize``; each token is looked up in
    `vocab2index`, falling back to the "UNK" entry. The ids are truncated to
    `N` items and the remaining positions are left at 0.

    Returns:
        (encoded, length): an integer array of size `N` and the number of
        positions actually filled (at most `N`).
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    length = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = token_ids[:length]
    return encoded, length

# Encode every tweet; each cell of the new column holds the (padded ids, length)
# pair returned by encode_sentence, wrapped in an object array.
data['encoded'] = data['tweet'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
data.head()

Unnamed: 0,tweet,country,encoded
0,Bed,1,"[[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,I felt my first flash of violence at some fool...,0,"[[3, 4, 5, 6, 7, 8, 1, 9, 10, 11, 12, 1, 13, 1..."
2,Ladies drink and get in free till 10:30,0,"[[18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, ..."
3,@Melanynijholtxo ahhahahahah dm!,1,"[[17, 1, 1, 26, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,Fuck,0,"[[27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [13]:
# Number of tweets per value of the `country` column.
Counter(data['country']) # the dataset is not very balanced, this may be a problem later

Counter({1: 927,
         0: 2884,
         2: 463,
         9: 114,
         6: 221,
         3: 392,
         5: 250,
         4: 320,
         8: 138,
         7: 147})

In [14]:
X = np.array(data['encoded'], dtype=object)
y = np.array(data['country'], dtype=object)
# Hold out 20% of the tweets for validation. The split is seeded so that reruns
# give the same partition, and stratified on the country id so that the small
# classes keep the same share in both subsets (the classes are heavily imbalanced).
# train_test_split is already imported at the top of the notebook.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y.astype(int)
)

In [15]:
from torch.utils.data import Dataset, DataLoader
class TweetsDataset(Dataset):
    """Dataset over encoded tweets and their labels.

    `X` is indexed per sample and each sample is itself indexed with 0 (the
    array of token ids) and 1 (the stored length); `Y` holds one label per
    sample.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_ids as an int32 tensor, label, length)`` for sample `idx`."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]
    
# Wrap the two halves of the train/validation split as datasets for the loaders.
train_set = TweetsDataset(X_train, y_train)
val_set = TweetsDataset(X_valid, y_valid)

In [16]:
def train(model, epochs, lr):
    """Train `model` with Adam on the global `train_dl`, validating on the global `val_dl`.

    Args:
        model: module called as ``model(x, l)``; its output is passed to
            ``F.cross_entropy`` together with the batch labels.
        epochs: number of passes over ``train_dl``.
        lr: learning rate handed to ``torch.optim.Adam``.

    Only parameters with ``requires_grad=True`` are optimised. Validation
    metrics are computed after every epoch with ``metrics``; a progress line is
    printed on epochs where ``i % 5 == 1``.

    A batch for which ``F.cross_entropy`` raises ``IndexError`` is skipped, as
    before, but skipped batches are now counted and reported instead of being
    dropped silently, so a mismatch between labels and model outputs is visible.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        skipped = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            # Clear stale gradients before the forward/backward pass of this batch.
            optimizer.zero_grad()
            y_pred = model(x, l)
            try:
                loss = F.cross_entropy(y_pred, y)
            except IndexError:
                # Kept as a best-effort skip, but recorded so it can be reported below.
                skipped += 1
                continue
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so the epoch figure is a per-sample mean.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = metrics(model, val_dl)
        if i % 5 == 1:
            # inf marks an epoch in which no batch contributed to the loss.
            k = float('inf')
            if total != 0:
                k = round(sum_loss/total, 3)
            print("train loss", k, "val loss", round(float(val_loss), 3), "val acc", round(float(val_acc), 3), "val rmse", round(float(val_rmse), 3))
            if skipped:
                print("warning:", skipped, "training batches were skipped this epoch because F.cross_entropy raised IndexError")
            

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_23504/988211076.py, line 28)

In [None]:
from sklearn.metrics import mean_squared_error
def metrics (model, valid_dl):
    """Evaluate `model` on `valid_dl`.

    Args:
        model: module called as ``model(x, l)``; put into eval mode here.
        valid_dl: iterable yielding ``(x, y, l)`` batches.

    Returns:
        ``(loss, accuracy, rmse)``, each averaged per sample over the batches
        that were scored. The RMSE is computed per batch between the predicted
        and true class indices and then weighted by batch size. If no sample
        was scored, all three values are ``float('inf')``.

    A batch for which ``F.cross_entropy`` raises ``IndexError`` is skipped.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # Evaluation never calls backward(), so gradient tracking is switched off to
    # avoid building an autograd graph for every forward pass.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            try:
                loss = F.cross_entropy(y_hat, y)
            except IndexError:
                continue
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            # Same value as sqrt(mean_squared_error(pred, y)), computed in torch so the
            # tensors need no implicit conversion to NumPy (which fails off-CPU).
            batch_rmse = torch.sqrt(((pred - y).double() ** 2).mean()).item()
            sum_rmse += batch_rmse*y.shape[0]
    try:
        return sum_loss/total, correct/total, sum_rmse/total
    except ZeroDivisionError:
        i = float('inf')
        return i, i, i

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 10
# Embedding-table size: `words` holds the two reserved entries plus every kept token.
vocab_size = len(words)
# Only the training loader shuffles; the validation loader keeps the default order.
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_set, batch_size=batch_size)

In [None]:
def load_glove(glove_file="glove.6B/glove.6B.50d.txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is used as the
    word and the remaining fields are parsed as floats.

    Args:
        glove_file: path to the UTF-8 encoded vectors file.

    Returns:
        dict mapping each word to a 1-D NumPy array of floats.
    """
    word_vectors = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            # A blank line (e.g. an extra newline at the end of the file) has no
            # word field; indexing it used to raise IndexError.
            if not fields:
                continue
            word, *values = fields
            word_vectors[word] = np.array([float(value) for value in values])
    return word_vectors

In [None]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """ Creates embedding matrix from word vectors

    Args:
        pretrained: mapping from word to its pretrained vector of length `emb_size`.
        word_counts: iterable of the words to include (only the keys are used
            when a dict/Counter is passed), in the order they should be indexed.
        emb_size: dimensionality of every embedding row.

    Returns:
        (W, vocab, vocab_to_idx):
        W is a float32 matrix of shape ``(len(word_counts) + 2, emb_size)``;
        vocab is a NumPy array of the words in row order;
        vocab_to_idx maps each word to its row.

    Row 0 is the all-zero padding vector and row 1 is a random vector for
    unknown words. A word missing from `pretrained` gets a random vector drawn
    uniformly from [-0.25, 0.25).
    """
    vocab_size = len(word_counts) + 2
    # The padding entry is registered as well, so the mapping covers every row of W.
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    # np.zeros already leaves row 0 (padding) as an all-zero vector.
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look words up in the `pretrained` argument; the previous version read the
        # notebook-level `word_vecs` global and ignored this parameter.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

# Load the GloVe vectors from the default path.
word_vecs = load_glove()
# Build the embedding matrix for the kept tokens. Note that this rebinds
# `vocab2index` to the mapping returned by get_emb_matrix.
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [None]:
class ClassifierModel(torch.nn.Module) :
    """LSTM classifier over frozen, pretrained word embeddings.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        glove_weights: NumPy array copied into the embedding table; the
            embedding weights are then frozen.
        num_classes: number of output logits. Defaults to 10, the number of
            country labels this notebook produces; the previous hard-coded 5
            made ``F.cross_entropy`` reject every batch containing a label of
            5 or higher.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights, num_classes=10) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return one row of ``num_classes`` logits per sequence in `x`.

        `l` is accepted so the model can be called as ``model(x, l)``, but it is
        not used: the LSTM runs over the full padded sequence and the logits
        are computed from the final hidden state of its last layer.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        _, (ht, _) = self.lstm(x)
        return self.linear(ht[-1])

In [None]:
# 50-dimensional embeddings (the size of the GloVe vectors loaded by default) and a 50-unit LSTM.
model = ClassifierModel(vocab_size, 50, 50, pretrained_weights)
# NOTE(review): lr=0.1 is far above Adam's default of 1e-3 -- confirm this is intentional.
train(model, epochs=100, lr=0.1)