In [None]:
# !pip install imbalanced-learn
# !pip install nltk

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn.metrics as metrics
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(seed=42)

## Load data

In [None]:
# Read the lyrics dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv(
    './data/eminem-madonna-500.csv',
    index_col=0,
)
df.head()

In [None]:
# Separate the lyric text (features) from the artist name (target).
X = df['lyrics']
y = df['artist']

# Give every distinct artist an integer code, numbered in the order returned
# by unique(), and replace the artist names in the target with those codes.
artists = y.unique()
artists_map = {name: code for code, name in enumerate(artists)}
y = y.map(artists_map)
X.shape, y.shape

In [None]:
# Vectorize the lyrics into TF-IDF features (English stop words removed).
# The vectorizer reads from df['lyrics'] rather than from X so this cell can be
# re-run safely: X is rebound to a dense numeric array below, and that array
# cannot be fed back into fit_transform.
# NOTE(review): the vocabulary and IDF weights are fitted on the whole corpus
# before the train/test split, so test-set text influences the features the
# models are trained on. Consider splitting the raw text first and fitting the
# vectorizer on the training portion only.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['lyrics']).toarray()

In [None]:
# Share of each class in the target, expressed as a percentage.
y.value_counts(normalize=True).mul(100)

In [None]:
# Absolute number of samples per class in the target.
y.value_counts(normalize=False)

In [None]:
# Imbalance ratio: number of samples labelled 0 per sample labelled 1,
# rounded to a whole number.
class_counts = y.value_counts()
ratio = int(round(class_counts.loc[0] / class_counts.loc[1], 0))
ratio

## Split Data


In [None]:
# Hold out a test set, stratifying on the target so both partitions keep the
# same class proportions. random_state pins the shuffle so that re-running this
# cell on its own reproduces the same split; without it the split depends on
# how much of NumPy's global RNG state has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Evaluation function

In [None]:
def print_evaluations(ytrue, ypred, model, artists):
    """Print classification metrics for a model and plot its confusion matrix.

    Accuracy, precision, recall and F1 are printed rounded to three decimals,
    followed by the raw confusion matrix, which is also drawn as an annotated
    heatmap.

    Parameters
    ----------
    ytrue : array-like
        True class labels.
    ypred : array-like
        Predicted class labels, aligned with ``ytrue``.
    model : str
        Display name of the model, used in the printed header.
    artists : sequence
        Tick labels for the heatmap axes, applied in order to the
        rows/columns of the confusion matrix.
    """
    print(f'How does model {model} score:')

    # zero_division=0 makes an undefined score come out as 0, e.g. when a
    # model never predicts the positive class.
    scorers = (
        ('accuracy', metrics.accuracy_score, {}),
        ('precision', metrics.precision_score, {'zero_division': 0}),
        ('recall', metrics.recall_score, {'zero_division': 0}),
        ('f1-score', metrics.f1_score, {'zero_division': 0}),
    )
    for label, scorer, kwargs in scorers:
        print(f'The {label} of the model is: {round(scorer(ytrue, ypred, **kwargs), 3)}')

    # Print the raw confusion matrix.
    cm = metrics.confusion_matrix(ytrue, ypred)
    print(cm)

    # Draw the heatmap on an explicitly created axes. fmt='d' keeps the cell
    # annotations as plain integers; seaborn's default format ('.2g') would
    # render counts of 100 or more in scientific notation.
    _, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)

    # Labels, title and ticks.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(artists)
    ax.yaxis.set_ticklabels(artists)

#### Naive Model

In [None]:
# Baseline predictor: answers class 0 for every sample in the test set.
ypred = [0 for _ in range(y_test.shape[0])]

In [None]:
print_evaluations(
    ytrue=y_test,
    ypred=ypred,
    model='Baseline (always first result)',
    artists=artists,
)

#### Random Forest Model

In [None]:
# Fit a random forest with default hyperparameters and evaluate it on the
# test set. random_state pins the forest's internal randomness so that
# re-running this cell yields the same model and the same scores.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
ypred_rf = rf.predict(X_test)
print_evaluations(y_test, ypred_rf, 'Random Forest', artists)

#### Logistic regression

In [None]:
# Logistic regression with class_weight='balanced' to handle the class
# imbalance during training: fit the model, predict on the test set, evaluate.
lr = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
ypred_lr = lr.predict(X_test)
print_evaluations(y_test, ypred_lr, 'Logistic Regression', artists)

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training
# features and evaluated on the test set.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)

# The model label is spelled "Naive Bayes" (previously misspelled "Beyes").
print_evaluations(y_test, y_pred_nb, 'Naive Bayes', artists)

## Cross validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the Naive Bayes `alpha` hyperparameter.
parameters = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2]}

# 5-fold cross-validated grid search scored by F1; training-fold scores are
# recorded alongside the validation scores.
g = GridSearchCV(
    estimator=model_nb,
    param_grid=parameters,
    cv=5,
    scoring='f1',
    return_train_score=True,
)
g.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results, highest mean validation score first.
cv_table = pd.DataFrame(g.cv_results_)
res = cv_table.sort_values('mean_test_score', ascending=False)
res.head()

In [None]:
# Take the best estimator found by the grid search and predict on the test set.
model_nb_best = g.best_estimator_
y_pred_nb_best = model_nb_best.predict(X=X_test)

In [None]:
print_evaluations(
    ytrue=y_test,
    ypred=y_pred_nb_best,
    model='Naive Bayes (optimised)',
    artists=artists,
)