In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score

In [2]:
labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# The integer id of each emotion is its position in `labels`
label_to_id = {label: i for i, label in enumerate(labels)}


def load_split(csv_path):
    '''Load one dataset split and attach its integer labels.

    Args:
        csv_path (str):
            Path to a semicolon-delimited CSV file containing a
            'label_description' column.

    Returns:
        Pandas DataFrame:
            The loaded split with an added integer 'label' column.

    Raises:
        ValueError:
            If a label description is missing or is not in `labels`.
    '''
    split = pd.read_csv(csv_path, delimiter=';')
    split['label'] = split['label_description'].map(label_to_id)

    # Series.map turns descriptions that are absent from the mapping into
    # NaN, so fail here instead of passing NaN labels on to the model
    unmapped = split['label'].isna()
    if unmapped.any():
        unknown = split.loc[unmapped, 'label_description'].unique().tolist()
        raise ValueError(f'Unknown label descriptions in {csv_path}: {unknown}')

    return split


# Extract the DataFrames
train = load_split('train.csv')
val = load_split('val.csv')
test = load_split('test.csv')

print(f'Loaded {len(train):,} train samples, {len(val):,} validation samples and {len(test):,} test samples.')
test.head()

Loaded 16,000 train samples, 2,000 validation samples and 2,000 test samples.


Unnamed: 0,text,label_description,label
0,im feeling rather rotten so im not very ambiti...,sadness,0
1,im updating my blog because i feel shitty,sadness,0
2,i never make her separate from me because i do...,sadness,0
3,i left with my bouquet of red and yellow tulip...,joy,1
4,i was feeling a little vain when i did this one,sadness,0


In [12]:
def fit_tfidf(corpus, **tfidf_params):
    '''Build a TF-IDF vectoriser and fit it to a corpus.

    Fitting learns the vocabulary and the document frequencies from
    the corpus; both are kept on the returned vectoriser.

    Args:
        corpus (iterable of str):
            The documents on which the TF-IDF model is fitted.
        **tfidf_params:
            Keyword arguments forwarded unchanged to the
            TfidfVectorizer constructor.

    Returns:
        TfidfVectorizer:
            The fitted TF-IDF model.
    '''
    # fit() hands back the vectoriser itself, so construction and
    # fitting can be chained into a single expression
    return TfidfVectorizer(**tfidf_params).fit(corpus)

# norm=None leaves the TF-IDF vectors un-normalised, and max_df=0.1
# ignores every word occurring in more than 10% of the documents
tfidf = fit_tfidf(corpus=train.text, max_df=0.1, norm=None)

In [13]:
def tfidf_train_model(tfidf_model, train_dataset, val_dataset):
    '''Fit a logistic regression classifier on TF-IDF features.

    Both datasets are embedded with the given TF-IDF model. The
    classifier is trained on the training embeddings, and its balanced
    accuracy on the validation set is printed before it is returned.

    Args:
        tfidf_model (TfidfVectorizer):
            The fitted TF-IDF model used to embed the texts.
        train_dataset (Pandas DataFrame):
            The training data. Must have a 'text' and 'label' column.
        val_dataset (Pandas DataFrame):
            The evaluation data. Must have a 'text' and 'label' column.

    Returns:
        LogisticRegression:
            The trained model.
    '''
    # Embed the texts of both splits with the same vectoriser
    train_features = tfidf_model.transform(train_dataset.text)
    val_features = tfidf_model.transform(val_dataset.text)

    # Set up the classifier and train it on the training split
    classifier = LogisticRegression(max_iter=1_000)
    classifier.fit(train_features, train_dataset.label)

    # Report the balanced accuracy on the validation split as a percentage
    val_predictions = classifier.predict(val_features)
    val_score = balanced_accuracy_score(val_dataset.label, val_predictions)
    val_percentage = 100 * val_score
    print(f'The model achieved a {val_percentage:.2f}% '
          f'balanced accuracy on the validation set.')

    return classifier

# Train on the training split; the validation score is printed as a side effect
model = tfidf_train_model(
    tfidf_model=tfidf,
    train_dataset=train,
    val_dataset=val,
)

The model achieved a 83.70% balanced accuracy on the validation set.
