In [24]:
import pandas as pd
import regex as re
from gensim.models.word2vec import Word2Vec
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

In [2]:
labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Extract the DataFrames
train = pd.read_csv('train.csv', delimiter=';')
train['label'] = train['label_description'].map({label: i for i, label in enumerate(labels)})

val = pd.read_csv('val.csv', delimiter=';')
val['label'] = val['label_description'].map({label: i for i, label in enumerate(labels)})

test = pd.read_csv('test.csv', delimiter=';')
test['label'] = test['label_description'].map({label: i for i, label in enumerate(labels)})
    
print(f'Loaded {len(train):,} train samples, {len(val):,} validation samples and {len(test):,} test samples.')
test.head()

Loaded 16,000 train samples, 2,000 validation samples and 2,000 test samples.


Unnamed: 0,text,label_description,label
0,im feeling rather rotten so im not very ambiti...,sadness,0
1,im updating my blog because i feel shitty,sadness,0
2,i never make her separate from me because i do...,sadness,0
3,i left with my bouquet of red and yellow tulip...,joy,1
4,i was feeling a little vain when i did this one,sadness,0


In [7]:
def fit_word2vec(corpus):
    '''Fit Word2Vec vectors on a corpus.
    
    Args:
        corpus (list of str):
            The corpus to fit the embeddings on.
            
    Returns:
        Word2Vec:
            The object containing the fitted embeddings.
    '''
    # Tokenise the corpus
    corpus = [re.split(r' |(?=[\.\,\-\"\'\!])', doc) for doc in corpus]
    
    # Fit the embeddings
    word2vec = Word2Vec(sentences=corpus, sg=1, workers=-1)

    # Return the object containing the fitted embeddings
    return word2vec


word2vec = fit_word2vec(train.text)
word2vec.wv['test']

array([ 0.00665323,  0.00854131,  0.00724919, -0.00223264,  0.00482249,
       -0.00306156, -0.00369475,  0.00128755, -0.00010604,  0.00781875,
       -0.00451267, -0.00236597, -0.00560054,  0.00181707, -0.00884478,
        0.00453514, -0.00466111,  0.00815185,  0.00152302,  0.00194126,
        0.00612788, -0.00019193, -0.00678271, -0.00600964,  0.00153245,
        0.00805468,  0.00631805,  0.0028777 , -0.00890056, -0.00811832,
        0.00629267,  0.00459466, -0.00502497, -0.00138311,  0.00128885,
       -0.00920435, -0.00092909,  0.00592631, -0.0020238 ,  0.00522415,
        0.00463007,  0.00577351,  0.00379759,  0.00147334, -0.00101058,
        0.00967818, -0.00058655,  0.00477082, -0.00327153, -0.00760753,
        0.00574748, -0.00324057, -0.0046496 , -0.00991903,  0.00228867,
       -0.00611513,  0.00263402,  0.00010916,  0.00763065,  0.00171987,
        0.00270949, -0.00473063,  0.00780364,  0.00924661,  0.00286369,
       -0.00265587,  0.00505278,  0.00070206,  0.00039433, -0.00

In [20]:
def word2vec_embed_term(term):
    '''Embeds a term using the fitted Word2vec model.
    
    Args:
        term (str):
            The term to be embedded.
            
    Returns:
        NumPy array:
            The embedding of the term.
    '''
    # We do a `try-except` here to deal with out-of-vocabulary terms,
    try:
        embedding = word2vec.wv[term]
        
    # If the word is not in our vocabulary, then we just embed it as the zero vector
    except KeyError:
        embedding = np.zeros(100)
    
    # Return the embedding
    return embedding

word2vec_embed_term('sad')

array([ 0.00370106, -0.00683988,  0.00919236, -0.00453693, -0.0072382 ,
       -0.00157117,  0.00474488,  0.00060411,  0.00406659,  0.00976933,
       -0.00401155,  0.00908427,  0.00087159,  0.00553001,  0.00729116,
        0.00309779,  0.008886  , -0.0090503 , -0.00106582,  0.00544355,
        0.00938782,  0.00696341,  0.00021974, -0.00021558,  0.00823147,
        0.00375668,  0.00520053, -0.00592092, -0.00020426, -0.00908623,
       -0.0010275 , -0.00879216, -0.00116161,  0.00899828, -0.00754742,
        0.00814291, -0.00663539,  0.00587001,  0.00436277, -0.00695457,
       -0.0077118 , -0.00545783, -0.0088209 , -0.00540636,  0.00124033,
       -0.00850474, -0.00982233,  0.00746922,  0.00382913, -0.00382764,
        0.0042289 , -0.00193098,  0.00914407,  0.0028104 ,  0.00404169,
       -0.00942   , -0.00812456, -0.00245567,  0.0063874 , -0.00817601,
        0.00257316, -0.00077865, -0.0037706 ,  0.00490162, -0.00452152,
        0.00812759, -0.00820326,  0.00282187, -0.00199904, -0.00

In [22]:
def word2vec_embed_doc(doc, aggregation_fn=np.mean):
    '''Embeds a document using the fitted Word2vec model.
    
    Args:
        doc (str):
            The document to be embedded.
        aggregation_fn (callable, optional):
            The function used to aggregate the term embeddings
            in the document. Must be a NumPy function. Defaults to 
            `numpy.mean`, meaning that the average of the term 
            embeddings is returned.
            
    Returns:
        NumPy array:
            The embedding of the document.
    '''
    # Split up the document into a list of terms
    terms = re.split(r' |(?=[\.\,\-\"\'\!])', doc)
    
    # Embed each term using the fitted `word2vec` model.
    embeddings = [word2vec_embed_term(term) for term in terms]
    
    # Aggregate the embeddings according to `aggregation_fn`
    embedding = aggregation_fn(embeddings, axis=0)
    
    # Return the aggregated embedding
    return embedding

word2vec_embed_doc('this is a test')

array([ 4.5711163e-04,  4.5147138e-03,  3.4552026e-03,  1.7170995e-03,
       -2.4658772e-03,  1.5286887e-03, -1.5400040e-03,  1.2311387e-03,
        1.2559954e-04,  1.7002800e-03, -4.2510824e-04, -1.8027722e-04,
        4.7017343e-04, -9.4091170e-04, -3.7611471e-03, -3.8669235e-03,
       -2.2440739e-03,  3.8658944e-04, -2.3945253e-03,  1.0323343e-03,
       -1.4382165e-03, -8.6477457e-04, -2.3490316e-03,  2.9262356e-03,
       -1.2347619e-03,  1.2851731e-03,  2.0692684e-03, -1.3876378e-03,
       -1.9872310e-03, -9.7388460e-04,  4.2622131e-03, -4.4900523e-03,
       -7.0473459e-04, -3.6706894e-03,  4.0059979e-04, -1.6730311e-03,
        1.2301751e-03,  2.9771584e-03,  7.1987446e-04, -3.4074578e-04,
        3.2544734e-03, -6.2678452e-04, -2.3427657e-03,  1.4261820e-03,
       -2.9835878e-03, -2.2870649e-03, -1.7279612e-03,  1.0918805e-03,
       -2.3612874e-03, -1.2422171e-03,  5.3027896e-03, -3.0977703e-03,
       -1.5055535e-03, -2.5990722e-04, -4.6229786e-03, -1.4673702e-03,
      

In [25]:
def word2vec_train_model(train_dataset: pd.DataFrame,
                         val_dataset: pd.DataFrame) -> LogisticRegression:
    '''Trains a logistic regression model on a Word2vec embedded corpus.
    
    Args:
        train_dataset (Pandas DataFrame):
            The dataset on which to train the logistic regression 
            model. Must have a 'text' and 'label' column.
        val_dataset (Pandas DataFrame):
            The dataset on which to evaluate the logistic regression 
            model. Must have a 'text' and 'label' column.
            
    Returns:
        LogisticRegression:
            The trained model.
    '''    
    # Embded datasets
    word2vec_embeddings_train = [word2vec_embed_doc(doc) for doc in train_dataset.text]
    word2vec_embeddings_val = [word2vec_embed_doc(doc) for doc in val_dataset.text]
        
    # Convert the lists of embeddings to one big matrix
    word2vec_embeddings_train = np.stack(word2vec_embeddings_train)
    word2vec_embeddings_val = np.stack(word2vec_embeddings_val)
    
    # Define the classification model
    model = LogisticRegression(max_iter=1_000)
    
    # Train the model
    model.fit(word2vec_embeddings_train, train_dataset.label)
    
    # Evaluate the model on the validation set.
    predictions = model.predict(word2vec_embeddings_val)
    val_balanced_acc = balanced_accuracy_score(val_dataset.label, predictions)
    print(f'The model achieved a {100 * val_balanced_acc:.2f}% '
          f'balanced accuracy on the validation set.')
    
    # Return the trained model
    return model

model = word2vec_train_model(train_dataset=train, val_dataset=val)

The model achieved a 16.67% balanced accuracy on the validation set.
