![](https://i.imgur.com/EHwHRCJ.jpeg)

**Goal:** Correctly classify each Taylor Swift lyric into its album based on text data alone

In [2]:
# various libraries
# calculation packages
import numpy as np 
import pandas as pd 
from collections import Counter

# textual analysis packages 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

# machine learning packages
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# visualization
import matplotlib.pyplot as plt

In [3]:
# data
# read the lyric table, then discard the 'Unnamed: 0' column
data = pd.read_csv(
    '/Users/emma/Desktop/oldschool/ml_perspectives/project/data/thisone.csv'
).drop(columns = 'Unnamed: 0')

In [4]:
# feature engineering
def looking_at_top(X, features, top_n = 20):
    '''
    Ranks features by their summed tf-idf weight and returns the
    highest-weighted ones.
    Inputs:
        X: (numpy array or scipy sparse matrix) tf-idf vectorized features,
            one column per feature
        features: (numpy array) an array of the names of the features,
            aligned with the columns of X
        top_n: (int) how many features to return, defaults to 20
    Returns:
        (pandas DataFrame) one row per selected feature, largest weight
        first; column 0 is the feature name, column 1 its summed weight.
        Empty when there is nothing to rank.
    '''

    # summing over axis 0 gives one total per feature; a sparse-matrix sum
    # comes back 2-D, so it is flattened to a plain 1-D array
    feature_np = np.asarray(np.sum(X, axis = 0)).reshape(-1)

    # argsort is ascending, so reverse it before keeping the first top_n
    top_idx = np.argsort(feature_np)[::-1][:top_n]

    top = [(features[x], feature_np[x]) for x in top_idx]

    # built once, after the selection, so an empty input still yields a frame
    return pd.DataFrame(top)

# custom stop word list handed to the vectorizer below
stop_words = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for',
    'if', 'in', 'into', 'is', 'it', 'not', 'of', 'on', 'or', 'such',
    'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was',
    'will', 'with', 're', 'don', 've', 'll', 'isn',
]

# isolating lyrics
lyric_text = data['lyrics']

# tf-idf over unigrams, bigrams and trigrams -- NGRAM(1, 3)
vectorizer = TfidfVectorizer(
    strip_accents = 'ascii',
    stop_words = stop_words,
    ngram_range = (1, 3),
)

# learn the vocabulary and vectorize the lyrics in one pass
X_13 = vectorizer.fit_transform(lyric_text)

# checking shape
print(X_13.shape)

# names of the features produced by the fitted vectorizer
feature_names_13 = vectorizer.get_feature_names_out()

looking_at_top(X_13, feature_names_13)


(4584, 49457)


Unnamed: 0,0,1
0,you,148.890813
1,me,73.944305
2,my,63.418646
3,oh,59.750744
4,we,52.2669
5,your,52.208628
6,all,47.852716
7,like,46.305066
8,know,42.28858
9,so,37.412549


![](https://i.imgur.com/8wiA92h.png)

# Decision Trees

- I am using the (1,3) n-gram range because that is what has traditionally performed the best

In [5]:
# target labels for the classifiers
album_names = data['album_name']

# non strat version, using (1,3): 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_13,
    album_names,
    test_size = 0.2,
    random_state = 13,
)

In [6]:
# default decision tree, seeded; fit() hands back the fitted estimator
dtc = tree.DecisionTreeClassifier(random_state = 13).fit(X_train, y_train)

# accuracy on the held-out split
round(dtc.score(X_test, y_test), 3)

0.371

In [7]:
# poor performance: inspect how large the fitted tree is before attempting to visualize it
tree_depth = dtc.get_depth()
leaf_count = dtc.get_n_leaves()

print('depth:', tree_depth)
print('number of leaves:', leaf_count)

depth: 308
number of leaves: 1624


In [45]:
# addressing the overfitting issues; the tree is still too large to be worth visualizing
dtc = tree.DecisionTreeClassifier(
    max_depth = 300,
    splitter = 'random',
    min_samples_split = 4,
    random_state = 13,
).fit(X_train, y_train)

# accuracy on the held-out split
round(dtc.score(X_test, y_test), 3)

0.395

In [46]:
# size of the tree that is currently bound to dtc
tree_depth = dtc.get_depth()
leaf_count = dtc.get_n_leaves()
print('depth:', tree_depth)
print('number of leaves:', leaf_count)

depth: 300
number of leaves: 1490


**Stratified DTC**

In [32]:
# stratified data: same 80/20 split and seed, stratified on the album labels
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_13,
    album_names,
    test_size = 0.2,
    random_state = 13,
    stratify = album_names,
)

In [33]:
# default decision tree again, this time fitted on the stratified training split
dtc = tree.DecisionTreeClassifier(random_state = 13).fit(X_train_s, y_train_s)

# accuracy on the stratified held-out split
round(dtc.score(X_test_s, y_test_s), 3)

0.36

In [29]:
# using strat data on best performing tree
# NOTE(review): max_depth=200 / splitter='best' are not the settings of the tree
# labelled "best performing" elsewhere in this notebook (max_depth=300,
# splitter='random') -- confirm which configuration is intended here
dtc = tree.DecisionTreeClassifier(
    max_depth = 200,
    splitter = 'best',
    min_samples_split = 4,
    random_state = 13,
).fit(X_train_s, y_train_s)

# accuracy on the stratified held-out split
round(dtc.score(X_test_s, y_test_s), 3)

0.364

In [30]:
# size of the tree that is currently bound to dtc
tree_depth = dtc.get_depth()
leaf_count = dtc.get_n_leaves()
print('depth:', tree_depth)
print('number of leaves:', leaf_count)

depth: 200
number of leaves: 1284


- When the test size is decreased, the unstratified data performs better

In [50]:
# best performing tree: rebuild the unstratified split, then refit on it
X_train, X_test, y_train, y_test = train_test_split(
    X_13,
    album_names,
    test_size = 0.2,
    random_state = 13,
)
dtc = tree.DecisionTreeClassifier(
    max_depth = 300,
    splitter = 'random',
    min_samples_split = 4,
    random_state = 13,
).fit(X_train, y_train)

# accuracy on the held-out split
round(dtc.score(X_test, y_test), 3)

0.395

In [51]:
# looking at the confusion matrix and classification report to get a better idea of model performance
# predictions of the current tree on the held-out split
y_pred = dtc.predict(X_test)

# confusion matrix, true labels passed first
confusion_m = confusion_matrix(y_test, y_pred)

confusion_m

array([[43,  5,  7,  6,  8,  5, 15,  3,  9,  4],
       [12, 21,  3, 11,  5,  9,  7,  0,  7,  1],
       [ 7,  3, 48,  5,  8,  5, 16,  3,  7,  3],
       [11,  1,  5, 22,  5,  0,  8,  4,  0,  4],
       [ 8,  2,  5,  5, 27,  6, 10,  8, 13,  3],
       [ 7,  9,  3,  2,  5, 35,  8,  4,  8,  2],
       [10,  6,  9,  9,  6,  7, 65,  8,  8,  7],
       [ 8,  1,  2,  7,  6,  0,  6, 35,  5,  4],
       [15,  6, 14,  9,  6, 12, 11,  3, 44,  3],
       [ 7,  1,  4,  8,  2,  5,  6,  6,  8, 22]])

In [53]:
# per-album precision / recall / f1 for the predictions above
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

        1989       0.34      0.41      0.37       105
    evermore       0.38      0.28      0.32        76
    fearless       0.48      0.46      0.47       105
    folklore       0.26      0.37      0.31        60
       lover       0.35      0.31      0.33        87
   midnights       0.42      0.42      0.42        83
         red       0.43      0.48      0.45       135
  reputation       0.47      0.47      0.47        74
   speak now       0.40      0.36      0.38       123
taylor swift       0.42      0.32      0.36        69

    accuracy                           0.39       917
   macro avg       0.39      0.39      0.39       917
weighted avg       0.40      0.39      0.39       917



- The classification report and the confusion matrix confirm the model is not performing well

In [54]:
# interested in seeing the feature importance of each feature in the DTC

# importances from the fitted tree, rounded to 4 decimals
importance = np.round(dtc.feature_importances_, 4)

# pair every feature name with its importance, most important first,
# and renumber the rows from 0
df = (
    pd.DataFrame({'feature': feature_names_13, 'importance': importance})
    .sort_values(by = 'importance', ascending = False)
    .reset_index(drop = True)
)

df.head(10)

Unnamed: 0,feature,importance
0,your,0.0097
1,love,0.0096
2,my,0.0086
3,you,0.0082
4,me,0.0078
5,baby,0.0065
6,we,0.0065
7,all,0.0062
8,now,0.0057
9,oh,0.0051


# Random Forest

- Want to see if performance can be increased by using a more robust model

In [55]:
# non strat: default random forest, seeded, fitted on the unstratified split
rfc = RandomForestClassifier(random_state = 13).fit(X_train, y_train)

# accuracy on the unstratified held-out split
round(rfc.score(X_test, y_test), 3)

0.547

In [56]:
# strat: default random forest, seeded, fitted on the stratified split
rfc = RandomForestClassifier(random_state = 13).fit(X_train_s, y_train_s)

# accuracy on the stratified held-out split
round(rfc.score(X_test_s, y_test_s), 3)

0.551

In [59]:
from sklearn.model_selection import KFold

# 6-fold cross-validation of the default random forest on the full matrix
scores = []
X = X_13
y = data['album_name']

kf = KFold(n_splits=6, shuffle = True, random_state = 13)

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test or rfc inside this loop would silently replace the hold-out split and
# the fitted forest that other cells of the notebook read.
for train, test in kf.split(X, y):
    X_fold_train, X_fold_test = X[train], X[test]
    # split() yields positional indices, so select the labels by position
    y_fold_train, y_fold_test = y.iloc[train], y.iloc[test]

    fold_rfc = RandomForestClassifier(random_state = 13).fit(X_fold_train, y_fold_train)
    score = round(fold_rfc.score(X_fold_test, y_fold_test),3)
    scores.append(score)

print('scores:', scores)
mean_score = np.mean(scores)
print('average score:', np.round(mean_score, 3))

scores: [0.571, 0.53, 0.52, 0.564, 0.525, 0.545]
average score: 0.542


In [58]:
from sklearn.model_selection import StratifiedKFold

# 6-fold stratified cross-validation of the default random forest on the full matrix
scores = []
X = X_13
y = data['album_name']

strat_kf = StratifiedKFold(n_splits=6, shuffle = True, random_state = 13)

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test or rfc inside this loop would silently replace the hold-out split and
# the fitted forest that other cells of the notebook read.
for train, test in strat_kf.split(X, y):
    X_fold_train, X_fold_test = X[train], X[test]
    # split() yields positional indices, so select the labels by position
    y_fold_train, y_fold_test = y.iloc[train], y.iloc[test]

    fold_rfc = RandomForestClassifier(random_state = 13).fit(X_fold_train, y_fold_train)
    score = round(fold_rfc.score(X_fold_test, y_fold_test),3)
    scores.append(score)

print('scores:', scores)
mean_score = np.mean(scores)
print('average score:', np.round(mean_score, 3))

scores: [0.556, 0.522, 0.529, 0.573, 0.562, 0.556]
average score: 0.55


- Interestingly enough, the RF performs slightly better with stratified data, but it's not a big enough difference for me to be convinced. 
- The k-fold cross-validation runs above (plain and stratified) check whether it truly is performing better

In [37]:
# looking at the confusion matrix and classification report to get a better idea of model performance
# predictions of the current forest on the stratified held-out split
y_pred = rfc.predict(X_test_s)

# confusion matrix, true labels passed first
confusion_m = confusion_matrix(y_test_s, y_pred)

confusion_m

array([[ 96,   4,  10,   1,   3,   4,  21,   7,  10,   5],
       [ 15,  57,   1,   3,   4,   3,  19,  11,  10,   2],
       [ 13,   3,  77,   3,   3,   3,  24,   2,  19,   7],
       [ 11,   4,   6,  40,   0,   7,  15,   8,  10,   0],
       [ 11,   3,   3,   0,  68,   3,  19,  13,   4,   0],
       [ 12,   3,   6,   4,   1,  58,  17,  10,  11,   3],
       [ 17,   5,  12,   2,   3,   8, 126,   7,  14,   4],
       [  6,   5,   2,   0,   3,   2,  20,  78,   7,   4],
       [ 13,   5,   9,   3,   2,   5,  26,   9,  83,   5],
       [  4,   2,  17,   0,   3,   2,  19,   9,   6,  39]])

In [38]:
# per-album precision / recall / f1 for the predictions above
report = classification_report(y_test_s, y_pred)
print(report)

              precision    recall  f1-score   support

        1989       0.48      0.60      0.53       161
    evermore       0.63      0.46      0.53       125
    fearless       0.54      0.50      0.52       154
    folklore       0.71      0.40      0.51       101
       lover       0.76      0.55      0.64       124
   midnights       0.61      0.46      0.53       125
         red       0.41      0.64      0.50       198
  reputation       0.51      0.61      0.56       127
   speak now       0.48      0.52      0.50       160
taylor swift       0.57      0.39      0.46       101

    accuracy                           0.52      1376
   macro avg       0.57      0.51      0.53      1376
weighted avg       0.55      0.52      0.53      1376



In [57]:
# interested in seeing the feature importance of each feature for the RF

# importances from the fitted forest, rounded to 4 decimals
importance = np.round(rfc.feature_importances_, 4)

# pair every feature name with its importance, most important first,
# and renumber the rows from 0
df_rfc = (
    pd.DataFrame({'feature': feature_names_13, 'importance': importance})
    .sort_values(by = 'importance', ascending = False)
    .reset_index(drop = True)
)

df_rfc.head(10)


Unnamed: 0,feature,importance
0,you,0.0059
1,my,0.0035
2,we,0.0034
3,me,0.003
4,oh,0.0027
5,your,0.0027
6,when,0.0026
7,like,0.0023
8,now,0.0021
9,all,0.0021


- Almost all of the same features are present, but with a different order, again confirming that the DTC issue was overfitting.

**Randomized Search**

- I will be using randomized search over grid search here, due to the time/computational complexity of RF.

In [60]:
from sklearn.model_selection import RandomizedSearchCV

In [61]:
# candidate hyper-parameter values for the search, keyed by estimator argument name
parameters = dict(
    min_samples_split = (2, 4),
    max_features = ('sqrt', 'log2', None),
)

In [62]:
# rfc object
rfc_s = RandomForestClassifier(random_state = 13)

# random search obj
# random_state seeds the candidate sampling with the same seed used by every
# other stochastic step in this notebook, so a rerun repeats the same search.
# The space only holds 2 x 3 = 6 combinations, fewer than the default
# n_iter=10, so every combination ends up being evaluated.
random_search = RandomizedSearchCV(
    estimator=rfc_s,
    param_distributions=parameters,
    cv=3,
    random_state=13,
)

# fitting
random_search.fit(X_train_s, y_train_s)



In [63]:
# parameter combination the search scored highest
best = random_search.best_params_

best

{'min_samples_split': 4, 'max_features': 'log2'}

In [64]:
# refit on the stratified training split with the parameters the search picked;
# they are read from the search object rather than retyped, so this cell cannot
# drift away from the search result when the search is rerun
rfc = RandomForestClassifier(random_state = 13, **random_search.best_params_)

rfc.fit(X_train_s, y_train_s)

# accuracy on the stratified held-out split
round(rfc.score(X_test_s, y_test_s), 3)

0.605