# Class imbalance

In [None]:
# One-time setup: uncomment the lines below, run this cell once, then comment them out again.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install imbalanced-learn
# %pip install nltk

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn.metrics as metrics
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Single seed shared by the train/test split and the seeded models below, so runs are reproducible.
RANDOM_STATE = 42

## Load data

In [None]:
# read the lyrics dataset; the first CSV column is used as the index
lyrics_csv = './data/lyrics_data.csv'
df = pd.read_csv(lyrics_csv, index_col=0)
df.head()

In [None]:
# features are the raw lyric strings
X = df['lyrics']

# encode each artist name as an integer, numbered in order of first appearance
artists = df['artists'].unique()
artists_map = {name: code for code, name in enumerate(artists)}
y = df['artists'].map(artists_map)
X.shape, y.shape

In [None]:
# vectorize data
# Read the lyrics from `df` instead of from `X`: `X` is overwritten with the
# tf-idf matrix here, so this keeps the cell safe to re-run.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['lyrics']).toarray()

In [None]:
# share of each encoded artist label in the target, in percent
y.value_counts(normalize=True).mul(100)

In [None]:
# absolute number of samples per encoded artist label in the target
y.value_counts()

In [None]:
# ratio of imbalance: size of the largest class relative to the smallest one
# (max/min does not depend on which artist happened to be encoded as 0 or 1)
class_counts = y.value_counts()
ratio = int(round(class_counts.max() / class_counts.min()))
ratio

## Split Data


In [None]:
# Leftover from the credit-card version of this notebook (kept for reference only);
# X and y are already built from the lyrics data above.
# X = df.drop('Class', axis=1) #all features minus Class
# y = df['Class'] #just the Class columns

In [None]:
# stratified train/test split: both parts keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### Evaluation function

In [None]:
def print_evaluations(ytrue, ypred, model, artists):
    """Print headline classification metrics and plot the confusion matrix.

    Parameters
    ----------
    ytrue : array-like
        True encoded class labels.
    ypred : array-like
        Predicted encoded class labels, same length as ``ytrue``.
    model : str
        Name shown in the printed header.
    artists : sequence
        Class names used as tick labels. Entry ``i`` is assumed to name the
        encoded label ``i`` (labels ``0 .. len(artists) - 1``).

    Returns
    -------
    None
        The scores and the raw confusion matrix are printed; the heatmap is
        drawn on a new 6x6 inch figure.
    """
    labels = list(range(len(artists)))
    # scikit-learn's default average='binary' raises ValueError for more than
    # two classes; keep it for the two-class case and macro-average otherwise.
    average = 'binary' if len(labels) <= 2 else 'macro'

    print(f'How does model {model} score:')
    scores = {
        'accuracy': metrics.accuracy_score(ytrue, ypred),
        'precision': metrics.precision_score(ytrue, ypred, average=average, zero_division=0),
        'recall': metrics.recall_score(ytrue, ypred, average=average, zero_division=0),
        'f1-score': metrics.f1_score(ytrue, ypred, average=average, zero_division=0),
    }
    for name, value in scores.items():
        print(f'The {name} of the model is: {round(value, 3)}')

    # Fix the label set so the matrix always has one row/column per artist,
    # even if a class is absent from both ytrue and ypred.
    cm = metrics.confusion_matrix(ytrue, ypred, labels=labels)
    print(cm)

    # plot the heatmap; fmt='d' shows the counts as plain integers
    _, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(artists)
    ax.yaxis.set_ticklabels(artists)

#### Naive Model

In [None]:
# baseline: predict encoded class 0 for every test sample
ypred = [0 for _ in range(len(y_test))]

In [None]:
# score the constant baseline; the label names what it actually predicts (encoded class 0)
print_evaluations(y_test, ypred, 'baseline (always class 0)', artists)

#### Random Forest Model

In [None]:
# train a random forest with default hyper-parameters and score it on the test split
rf = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
ypred_rf = rf.predict(X_test)
print_evaluations(ytrue=y_test, ypred=ypred_rf, model='RandomForest', artists=artists)

#### Logistic regression

In [None]:
# class_weight='balanced' handles the imbalance during training by re-weighting the classes
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)

# fit on the (unresampled) training split and predict the test split
ypred_lr = lr.fit(X_train, y_train).predict(X_test)

# evaluate
print_evaluations(ytrue=y_test, ypred=ypred_lr, model='Logistic Regression', artists=artists)

#### Naive Bayes

In [None]:
# multinomial naive Bayes on the tf-idf features
# (MultinomialNB is imported with the other dependencies in the import cell at the top)
nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)

# evaluate
print_evaluations(y_test, y_pred_nb, 'Naive Bayes', artists)

## Post training

In [None]:
# precision and recall vs thresholds
def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequence
        Precision / recall values; the last element of each is dropped so the
        remaining values line up one-to-one with ``thresholds``.
    thresholds : sequence
        Decision thresholds used as the x-axis.

    The curves are drawn on the current pyplot figure; nothing is returned.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in Matplotlib 3.5 and `b=` fails on newer releases.
    plt.grid(True, which="both", axis="both", color='gray', linestyle='-', linewidth=1)

### Random Forest

In [None]:
# predicted class probabilities for the test split; keep only the second column
# NOTE(review): using a single column as "the positive class" assumes a two-class target - confirm
proba_rf = rf.predict_proba(X_test)
ytest_proba_rf = proba_rf[:, 1]
ytest_proba_rf

In [None]:
# precision / recall pairs for every candidate threshold of the random forest scores
pr_curve_rf = metrics.precision_recall_curve(y_test, ytest_proba_rf)
precisions_rf, recalls_rf, thresholds_rf = pr_curve_rf

In [None]:
# precision and recall against the decision threshold (random forest)
plot_precision_recall_vs_thresholds(
    precisions=precisions_rf,
    recalls=recalls_rf,
    thresholds=thresholds_rf,
)
plt.show()

In [None]:
# plot precision and recall curve
# (`metrics.plot_precision_recall_curve` was removed in scikit-learn 1.2;
#  `PrecisionRecallDisplay.from_estimator` is its replacement)
_ = metrics.PrecisionRecallDisplay.from_estimator(rf, X_test, y_test)

In [None]:
# area under the random forest precision-recall curve (recall on x, precision on y)
pr_area = metrics.auc(x=recalls_rf, y=precisions_rf)
pr_area

### Logistic Regression

In [None]:
# predicted class probabilities for the test split; keep only the second column
# NOTE(review): using a single column as "the positive class" assumes a two-class target - confirm
proba_lr = lr.predict_proba(X_test)
ytest_proba_lr = proba_lr[:, 1]

# precision / recall pairs for every candidate threshold
pr_curve_lr = metrics.precision_recall_curve(y_test, ytest_proba_lr)
precisions_lr, recalls_lr, thresholds_lr = pr_curve_lr

plot_precision_recall_vs_thresholds(
    precisions=precisions_lr,
    recalls=recalls_lr,
    thresholds=thresholds_lr,
)
plt.show()

In [None]:
# plot precision and recall curve
# (`metrics.plot_precision_recall_curve` was removed in scikit-learn 1.2;
#  `PrecisionRecallDisplay.from_estimator` is its replacement)
_ = metrics.PrecisionRecallDisplay.from_estimator(lr, X_test, y_test)

In [None]:
# area under the logistic regression precision-recall curve (recall on x, precision on y)
pr_area = metrics.auc(x=recalls_lr, y=precisions_lr)
pr_area


## How to use this?

How to use this in this week's project:
- still need to vectorize your data (i.e. CountVectorizer / TfIdf)
- still need to train/test split!
- you can apply class balancing techniques on the vectorized dataframe.
   - ONLY do this on TRAINING!!!! Not testing.
- do the predictive modeling HERE (e.g. LogReg, NB, RF, etc.)
- then at the very end, validate your model's performance on the testing data.

Keep in mind that if you want to combine SMOTE with a classifier (e.g. LogReg) as part of a pipeline, you need to use the pipeline from imblearn, not sklearn!

In [None]:
from imblearn.pipeline import make_pipeline

In [None]:
# SMOTE resampling followed by a random forest, chained in an imblearn pipeline;
# both steps use RANDOM_STATE so the result is reproducible
model = make_pipeline(
    SMOTE(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
)

In [None]:
# placeholder: fit the pipeline on the TRAINING split only (see the notes above)
# model.fit(...)