# STEP0: Goal

# STEP1: Libraries and Get Data

In [1]:
import numpy as np
import re
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model    import LogisticRegression
from sklearn.tree            import plot_tree, DecisionTreeClassifier
from sklearn.ensemble        import RandomForestClassifier

from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import nltk.data

import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model      import Lasso, Ridge, ElasticNet, LinearRegression, PoissonRegressor
from sklearn.model_selection   import train_test_split, GridSearchCV
from sklearn.model_selection   import train_test_split, cross_validate, cross_val_score, KFold, ShuffleSplit
from sklearn.compose           import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline          import Pipeline, make_pipeline
from sklearn.preprocessing     import FunctionTransformer, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute            import SimpleImputer 
from sklearn                   import metrics
from sklearn.metrics           import mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_error
from sklearn.feature_selection import RFE

In [2]:
# Read Data from csv file
artist1 = 'manowar'
artist2 = 'hammerfall'

main_df1 = pd.read_csv(f'{artist1}.csv')
main_df2 = pd.read_csv(f'{artist2}.csv')

In [3]:
# Artists data fairly in balance (do not consider duplications)
df1 = main_df1.copy
df2 = main_df2.copy
print(main_df1.shape, main_df2.shape)

size1 = main_df1.shape[0]
size2 = main_df2.shape[0]
print(f'df1:     {size1}\ndf2:     {size2}\ndf1+df2: {size1+size2}')

(12890, 3) (14072, 3)
df1:     12890
df2:     14072
df1+df2: 26962


In [4]:
#set 1st artist as [0], and 2nd as [1]
ytrain = pd.Series([0]*size1+[1]*size2)
ytrain = pd.DataFrame({'ytrain' : ytrain})
print(ytrain.shape)
ytrain

(26962, 1)


Unnamed: 0,ytrain
0,0
1,0
2,0
3,0
4,0
...,...
26957,1
26958,1
26959,1
26960,1


# STEP2 : Preprocessing & EDA

In [5]:
corpus = pd.concat([main_df1, main_df2],ignore_index=True) 
print(corpus.shape)
corpus2 = pd.concat([corpus, pd.DataFrame(ytrain)], ignore_index=False ,axis=1)
corpus2

(26962, 3)


Unnamed: 0.1,Unnamed: 0,song_names,sentences,ytrain
0,Fighting the World,Fighting the World,fight fight fight,0
1,Fighting the World,Fighting the World,fighting the world every single day,0
2,Fighting the World,Fighting the World,fighting the world for the right to play,0
3,Fighting the World,Fighting the World,heavy metal in my brain,0
4,Fighting the World,Fighting the World,im fighting for metal cause its here to stay,0
...,...,...,...,...
26957,Destined for Glory [2020 Remix],Destined for Glory [2020 Remix],but fight without honor and youre destined to,1
26958,Destined for Glory [2020 Remix],Destined for Glory [2020 Remix],fall,1
26959,Destined for Glory [2020 Remix],Destined for Glory [2020 Remix],fight with your heart and youre destined for,1
26960,Destined for Glory [2020 Remix],Destined for Glory [2020 Remix],glory,1


In [6]:
#Drop Nan values otherwise countvectorizer do not work with NAN values
print(corpus2.isna().sum())
corpus3 = corpus2.dropna()
print(f'\nAfter drop NaN:\n{corpus3.isna().sum()}')

Unnamed: 0     0
song_names     0
sentences     51
ytrain         0
dtype: int64

After drop NaN:
Unnamed: 0    0
song_names    0
sentences     0
ytrain        0
dtype: int64


In [7]:
corpus3 = corpus3[['sentences','ytrain']]
corpus3.drop_duplicates(subset=['sentences'], inplace=True)
corpus3

Unnamed: 0,sentences,ytrain
0,fight fight fight,0
1,fighting the world every single day,0
2,fighting the world for the right to play,0
3,heavy metal in my brain,0
4,im fighting for metal cause its here to stay,0
...,...,...
26943,skilled with steel black as the night,1
26944,normally veiled from the mortal eyes,1
26945,his hammer arose like a magic force,1
26946,with a gaze that turned everything into stone,1


# STEP3: Train/Test Split

In [8]:
X,y = corpus3["sentences"], corpus3["ytrain"]

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.75, random_state=25)

# STEP4: Pipelines

In [194]:
from sklearn import set_config
set_config(display="diagram")

def build_tfid_model(model_abv, train_test_dict, vec_abv):
    '''
    Input: 
      model_abv       : model name to call model code
      train_test_dict : train and test data in dict
    Output:  
      fitted model
      
      
      Build given model with Tfid vectorizer.
      
    ''' 
    
    x_trn = train_test_dict['train'][0][0]
    y_trn = train_test_dict['train'][0][1]
    model = model_dict[model_abv][0]
    
    
    vectorizer = vec_dict[vec_abv][0]
    tfid_pipe  = Pipeline([('vectorizer', vectorizer), 
                     ('model', model)])
    
    
    return tfid_pipe.fit(x_trn, y_trn)

In [195]:
cv_ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
def grid(model, train_test_dict, param_grid):
    '''
    Input: 
      model_abv       : model name to call model code
      train_test_dict : train and test data in dict
      param_grid      : Hyperparameters to optimize by gridsearch
    Output:  
      optimized model
      
      
      Optimite hyperparameters for given model.
      
    '''
    
    x_trn = train_test_dict['train'][0][0]
    y_trn = train_test_dict['train'][0][1]
    
    grid_model = GridSearchCV(model, 
                          param_grid, 
                          cv=cv_ss,
                          scoring='accuracy',
                          n_jobs  = -1, 
                          verbose = 0)      
        
    grid_model.fit(x_trn, y_trn)
    grid_model.scorer_

    
    print(f'\nBest hyperparameters: {grid_model.best_params_}')
    return grid_model

In [196]:
cv_ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
def ss_cross_val(x, y, model):
    '''
    Get Cross Validation score by shufflesplit.
    Output train and test validate scores as STR
    '''

    scores_ss = cross_validate(estimator = model, # model to evaluate
                               X = x,
                               y = y,
                               cv = cv_ss,    # no. of cross-validation split
                               scoring ='r2', # evaluation metric
                               return_train_score=True
                             )
    
    return scores_ss

In [197]:
def get_scores(train_test_dict, model, grid_model, model_abv):
    df = pd.DataFrame()
    for label, dfs in train_test_dict.items():
        model_name = f'{model_dict[model_abv][1]}_{label}'
        
        x = dfs[0][0]
        y = dfs[0][1]
        
        score         = round(model.score(x,y), 2)
        score_grid    = round(grid_model.score(x,y), 2)
        
        y_pred     = grid_model.best_estimator_.predict(x)
        accuracy      = round(metrics.accuracy_score (y, y_pred), 2)
        precision     = round(metrics.precision_score(y, y_pred, pos_label=1), 2)
        recall        = round(metrics.precision_score(y, y_pred, pos_label=1), 2)
        f1            = round(metrics.f1_score       (y, y_pred), 2)
        cv_score      = ss_cross_val(x, y, grid_model)
        
        
        df_temp = pd.DataFrame([[ model_name, score,    score_grid, accuracy  , precision  ,recall   ,f1  , round(cv_score['train_score'].mean(),2), round(cv_score['train_score'].std(),2), round(cv_score['test_score'].mean(),2), round(cv_score['test_score'].std(),2)]],
                             columns=['Model_Name','Score', 'Score Grid','Accuracy', 'Precision', 'Recall', 'F1', 'Cross-Val-train-mean',        'Cross-Val-train-std',          'Cross-Val-test-mean',         'Cross-Val-test-std'])
        df = pd.concat([df, df_temp], axis=0)
        
    return df

In [198]:
def run_models(train_test_dict, param_grid, model_name, vec_abv):

    # Build Pipeline and fit
    model      = build_tfid_model(model_name, train_test_dict, vec_abv) 
    
    # Grid search build model
    grid_model = grid(model, train_test_dict, param_grid)
    
    # Get scores for test and train data
    return get_scores(train_test_dict, model, grid_model, model_name) 

In [199]:
model_dict = {
    "logr": [LogisticRegression(),'Logistic Reg.'],
    "dtc" : [DecisionTreeClassifier(), 'Decision Tree'] ,
    "rfc" : [RandomForestClassifier(), 'Random Forest']
}

vec_dict = {
    "cv"  : [CountVectorizer(lowercase=True, stop_words='english', token_pattern='[A-Za-z]+', ngram_range=(1,1)), 'Count Vectorizer'],
    "tfi" : [TfidfVectorizer(stop_words="english"), 'TFIDF Vectorizer']
}


train_test_dict = {"train": [[Xtrain, ytrain],'train'], "test": [[Xtest, ytest],'test']}

In [200]:
def main():
    
    model_scores_df=pd.DataFrame()
    param_grid    = {'model__max_depth'    : [60, 70, 80],
                     'model__random_state' : [42, 64, 86]
                 }
    model_scores_df = pd.concat([run_models(train_test_dict, param_grid, 'dtc', 'tfi'), model_scores_df])

    
    param_grid    = {'model__max_iter'     : [45, 50, 60],
                     'model__random_state' : [1, 5, 11]
                     }
    
    model_scores_df = pd.concat([run_models(train_test_dict, param_grid, 'logr', 'tfi'), model_scores_df])
    
    

    param_grid    = {'model__max_depth'    : [50, 60], 
                     'model__n_estimators' : [150],
                     'model__random_state' : [42]
                    }
    
    model_scores_df = pd.concat([run_models(train_test_dict, param_grid, 'rfc', 'tfi'), model_scores_df])
    
    return model_scores_df

In [47]:
main()


Best hyperparameters: {'model__max_depth': 70, 'model__random_state': 64}

Best hyperparameters: {'model__max_iter': 45, 'model__random_state': 1}

Best hyperparameters: {'model__max_depth': 60, 'model__n_estimators': 150, 'model__random_state': 42}


Unnamed: 0,Model_Name,Score,Score Grid,Accuracy,Precision,Recall,F1,Cross-Val-train-mean,Cross-Val-train-std,Cross-Val-test-mean,Cross-Val-test-std
0,Random Forest_train,0.99,0.77,0.77,0.71,0.71,0.83,0.07,0.04,-0.43,0.05
0,Random Forest_test,0.69,0.67,0.67,0.65,0.65,0.77,0.15,0.03,-0.57,0.08
0,Logistic Reg._train,0.85,0.85,0.85,0.83,0.83,0.88,0.42,0.02,-0.31,0.04
0,Logistic Reg._test,0.71,0.71,0.71,0.7,0.7,0.77,0.58,0.04,-0.37,0.13
0,Decision Tree_train,0.99,0.76,0.76,0.7,0.7,0.82,0.02,0.03,-0.45,0.07
0,Decision Tree_test,0.65,0.65,0.65,0.64,0.64,0.75,0.38,0.04,-0.47,0.05


# Others

In [225]:
corpus4   = corpus3.copy()
X         = list(corpus4['sentences'])
#y         = corpus4.drop(columns=['sentences'])
y         = corpus4['ytrain']

## Sklearn CountVectorizer

In [226]:
vectorizer     = CountVectorizer(lowercase=True, stop_words='english', token_pattern='[A-Za-z]+', ngram_range=(1,1))
x_cv           = vectorizer.fit_transform(X) #note here that vectorizer.fit alywse take arry or list not DataFram
df_bow_sklearn = pd.DataFrame(x_cv.toarray(), columns=vectorizer.get_feature_names_out())

### I got rid of duplicates, it is now safe to train-split. 
### We are sure there is no data leakage to test data because of duplicates


# STEP3: Train/Test Split

In [227]:
print(type(df_bow_sklearn), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [228]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df_bow_sklearn, y, train_size=0.75, random_state=25)

In [229]:
print(type(Xtrain), type(ytrain))
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
(3329, 3108) (3329,) (1110, 3108) (1110,)


In [230]:
model = make_pipeline(CountVectorizer(stop_words="english",  token_pattern=r"\b[a-zA-Z]{3,}\b", ngram_range=(1,2)),
                      LogisticRegression(max_iter=1000))

In [231]:
Xtrain

Unnamed: 0,abandon,abandoned,abode,absence,abstain,abyss,acabar,accept,ace,achilles,...,z,zeal,zeigt,zero,zeus,ziel,zone,zu,zum,zur
75,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2934,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
Xtrain.iloc[318].sum()

3

In [233]:
Xtrain.shape

(3329, 3108)

In [234]:
ytrain.shape

(3329,)

In [235]:
print(type(Xtrain), type(ytrain))
print(Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
(3329, 3108) (3329,) (1110, 3108) (1110,)


In [239]:
#Xtrain=Xtrain.transpose()
ytrain.astype(str)
type(ytrain[0])

numpy.int64

In [224]:
(ytrain)

'108      0\n14202    1\n21531    1\n24945    1\n14169    1\n        ..\n2349     0\n348      0\n15781    1\n13286    1\n411      0\nName: ytrain, Length: 3329, dtype: int64'

In [221]:
model.fit(Xtrain, ytrain)

ValueError: y should be a 1d array, got an array of shape () instead.

In [74]:
ytrain

[0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,


In [75]:
Xtrain

Unnamed: 0,75,2580,3586,3857,2550,2118,2015,330,4115,3793,...,3325,1881,1924,1160,1970,1175,255,2934,2191,318
abandon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abandoned,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abode,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absence,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abstain,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ziel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zum,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
Xtest

Unnamed: 0,abandon,abandoned,abode,absence,abstain,abyss,acabar,accept,ace,achilles,...,z,zeal,zeigt,zero,zeus,ziel,zone,zu,zum,zur
3219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3427,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
ytest

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
