In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [63]:
def split_text_data(f):
    corpus = []
    labels = []
    with open(f, 'r') as file:
        for line in file:
            contents = line.split(';')
            corpus.append(contents[0].strip())
            labels.append(contents[1].strip())
    return corpus, labels

In [95]:
def get_data(corpus, labels):
    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    
    # Transform the text data into TF-IDF embeddings
    X_tfidf = vectorizer.fit_transform(corpus)

    # get y values that the model will train on
    y = []

    # enumerate the emotion values so that the logistic regression model can
    # train on these labels
    for label in labels:
        if label == 'anger':
            y.append(0)
        elif label == 'sadness':
            y.append(1)
        elif label == 'love':
            y.append(2)
        elif label == 'joy':
            y.append(3)
        elif label == 'surprise':
            y.append(4)
        elif label == 'fear':
            y.append(5)
            
    # split data into a training set and a temporary set
    X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
    
    # split temporary set into a validation set and testing set
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    
    return X_train, y_train, X_val, y_val, X_test, y_test, vectorizer

In [96]:
# split the text data into class labels and features to be transferred into numerical values to be trained on
corpus_train, labels_train = split_text_data('./emotions-dataset-for-nlp/train.txt')

# split the text data into class labels and features to be transferred into numerical values to be validated on
corpus_val, labels_val = split_text_data('./emotions-dataset-for-nlp/val.txt')

# split the text data into class labels and features to be transferred into numerical values to be tested on
corpus_test, labels_test = split_text_data('./emotions-dataset-for-nlp/test.txt')

# combine the sets together in order to vectorize all of them together then split them off back into their separate sets
corpus = corpus_train + corpus_val + corpus_test
labels = labels_train + labels_val + labels_test
X_train, y_train, X_val, y_val, X_test, y_test, vectorizer = get_data(corpus, labels)

In [97]:
# get Logistic Regression Model
model = LogisticRegression(max_iter=500)

# Define hyperparameters to tune
param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}

# Use GridSearchCV for cross-validation on the training set
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'C': 10, 'solver': 'liblinear'}


In [98]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

val_accuracy = best_model.score(X_val, y_val)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.893


In [99]:
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8905


In [100]:
def convert_pred(prediction):
    if prediction == 0:
            return 'anger'
    elif prediction == 1:
            return 'sadness'
    elif prediction == 2:
            return 'love'
    elif prediction == 3:
            return 'joy'
    elif prediction == 4:
            return 'surpise'
    elif prediction == 5:
            return 'fear'

In [109]:
# test a phrase
phrase = ['I celebrated my graduation.']

phrase_tfidf = vectorizer.transform(phrase)

y_pred = best_model.predict(phrase_tfidf)
print(convert_pred(y_pred))

joy
