In [60]:
import s3fs

import os
import json
import time
import pickle
import requests
import traceback
import time
from datetime import datetime
import warnings
# Ignore warnings from scikit-learn to make this notebook a bit nicer
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

import pandas as pd
from pandas import DataFrame
from pandas import plotting
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="progress-bar", leave=False)
import string

import unicodedata  # might need to pip install unicodedate2 on aws sagemaker
import contractions
from contractions import contractions_dict ## pip installed this
from wordcloud import WordCloud, STOPWORDS #pip install
from textblob import TextBlob
!python -m textblob.download_corpora

import nltk
import nltk.corpus 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import word2vec
import multiprocessing as mp

import sklearn
from sklearn.utils import resample # Covert too much Rock! to just enough
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

%matplotlib inline
sns.set(style='darkgrid',palette='Dark2',rc={'figure.figsize':(9,6),'figure.dpi':90})

# Increase screen size.
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline
sns.set(style='darkgrid',palette='Dark2', rc={'figure.figsize':(9,6), 'figure.dpi':100})
# Set the default figure size for matplotlib
plt.rcParams['figure.figsize'] = (9, 6)

# Visual analysis of model performance
from yellowbrick.classifier import confusion_matrix
from yellowbrick.classifier import classification_report
from yellowbrick.regressor import prediction_error, ResidualsPlot
from yellowbrick.target import ClassBalance

#Pipeline toolset
# Used to divide our dataseets into train/test splits
# Data will be randomly shuffled so running this notebook multiple times may lead to different results
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import RobustScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

#Model toolset
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Ridge

#Evaluation toolset
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.features import FeatureImportances

[nltk_data] Downloading package brown to /Users/Gretzky/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Gretzky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Gretzky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [2]:
import io
import boto3

from dotenv import load_dotenv
load_dotenv(verbose=True)

def aws_session(region_name='us-east-1'):
    return boto3.session.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), #looks for any .env file
                                aws_secret_access_key=os.getenv('AWS_ACCESS_KEY_SECRET'), #Has to be in same directory
                                region_name=region_name) #from above

def make_bucket(name, acl): 
    session = aws_session()
    s3_resource = session.resource('s3')
    return s3_resource.create_bucket(Bucket=name, ACL=acl)

def upload_file_to_bucket(bucket_name, file_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    file_dir, file_name = os.path.split(file_path)

    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(
      Filename=file_path,
      Key=file_name,
      ExtraArgs={'ACL': 'private'}
    )

    s3_url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    return s3_url

fs = s3fs.S3FileSystem(anon=False,key='###',secret='###')

g_df = pd.read_csv('g2_df')

In [5]:
#Drop first, useless column.
g_df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [6]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86290 entries, 0 to 86289
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   genre                   86290 non-null  object 
 1   song_name               86290 non-null  object 
 2   lyrics                  86290 non-null  object 
 3   full_word_count         86290 non-null  int64  
 4   full_character_count    86290 non-null  int64  
 5   full_avg_word_length    86290 non-null  float64
 6   med_lyrics              86290 non-null  object 
 7   med_word_count          86290 non-null  int64  
 8   med_character_count     86290 non-null  int64  
 9   med_avg_word_length     86290 non-null  float64
 10  med_content_affin       86290 non-null  float64
 11  med_sent_label          86290 non-null  object 
 12  med_sent_score          86290 non-null  float64
 13  med_vector              86290 non-null  object 
 14  med_rock_genre_count    86290 non-null

In [7]:
g_df.describe(include='all')

Unnamed: 0,genre,song_name,lyrics,full_word_count,full_character_count,full_avg_word_length,med_lyrics,med_word_count,med_character_count,med_avg_word_length,med_content_affin,med_sent_label,med_sent_score,med_vector,med_rock_genre_count,med_rock_bool,med_hiphop_genre_count,med_hiphop_bool,med_pop_genre_count,med_pop_bool,med_genre_count,sml_lyrics,sml_word_count,sml_character_count,sml_avg_word_length,sml_content_affin,sml_sent_label,sml_sent_score,sml_vector,sml_rock_genre_count,sml_rock_bool,sml_hiphop_genre_count,sml_hiphop_bool,sml_pop_genre_count,sml_pop_bool,sml_genre_count
count,86290,86290,86290,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0
unique,3,66799,86203,,,,85378,,,,,3,,85355,,,,,,,,85286,,,,,3,,85264,,,,,,,
top,Rock,Intro,so so you think you can tell heaven from hel...,,,,merry little christmas let heart light trouble...,,,,,positive,,dreaming white christmas like one know treetop...,,,,,,,,dreaming white christmas like ones know gliste...,,,,,positive,,feel like home feel like feel like young feel ...,,,,,,,
freq,47406,50,3,,,,4,,,,,58208,,4,,,,,,,,4,,,,,58255,,4,,,,,,,
mean,,,,355.093638,1067.958303,3.032924,,108.455858,552.463715,5.120146,0.01088,,0.096838,,0.006332,0.224591,91.105574,0.162128,0.358106,0.099977,91.470013,,102.333666,509.263924,5.011926,0.011444,,0.097358,,0.000238,0.010036,5.672731,0.018067,0.014162,0.000568,5.687131
std,,,,218.656149,651.998326,0.300625,,74.646193,379.487446,0.467359,0.046185,,0.213209,,0.021912,0.417315,351.449939,0.36857,3.012329,0.299971,351.424018,,68.819909,338.19807,0.429539,0.048637,,0.213692,,0.006379,0.099676,110.012447,0.133194,0.774038,0.023823,110.014459
min,,,,1.0,8.0,0.051852,,1.0,4.0,3.0,-0.493151,,-1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,4.0,3.0,-0.503979,,-1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,209.0,638.0,2.853833,,61.0,312.0,4.8125,-0.012953,,-0.035,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,58.0,291.0,4.733333,-0.013661,,-0.034707,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,,299.0,908.0,3.026846,,88.0,450.0,5.083333,0.008451,,0.092857,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,84.0,421.0,4.982759,0.009302,,0.093636,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,,436.0,1303.0,3.206452,,129.0,661.0,5.384615,0.032754,,0.234662,,0.0,0.0,0.0,0.0,0.0,0.0,0.12,,123.0,615.0,5.25685,0.0348,,0.235714,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Code to normalize / standardize / MinMax various numeric columns, as desired.

In [8]:
# Scaling various counts.
# Leaving sent_score (-1 to +1, already), content_affin (-.5 to .5, already), genre_bool (0 or 1, already)
# and avg_word_length (no outliers) alone. 
#Will be dropping [size]_genre_count, (was never a good idea).
# I am doing it now, vice in the pipeline, because I am not doing the same thing to all Float64 d.types()

scaler = MinMaxScaler()

column_to_normalize = ['full_word_count','full_character_count',
                       'med_word_count','med_character_count',
                       'med_rock_genre_count','med_hiphop_genre_count','med_pop_genre_count',
                       'sml_word_count','sml_character_count',
                       'sml_rock_genre_count','sml_hiphop_genre_count','sml_pop_genre_count']

# "Scaled_DataFrame" 
s_df = g_df.copy()
s2_df = pd.DataFrame((s_df), columns = column_to_normalize)


x = s2_df.values
x_scaled = scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns = column_to_normalize, index = s_df.index)

s_df[column_to_normalize] = df_temp

In [9]:
s_df.describe(include='all')

Unnamed: 0,genre,song_name,lyrics,full_word_count,full_character_count,full_avg_word_length,med_lyrics,med_word_count,med_character_count,med_avg_word_length,med_content_affin,med_sent_label,med_sent_score,med_vector,med_rock_genre_count,med_rock_bool,med_hiphop_genre_count,med_hiphop_bool,med_pop_genre_count,med_pop_bool,med_genre_count,sml_lyrics,sml_word_count,sml_character_count,sml_avg_word_length,sml_content_affin,sml_sent_label,sml_sent_score,sml_vector,sml_rock_genre_count,sml_rock_bool,sml_hiphop_genre_count,sml_hiphop_bool,sml_pop_genre_count,sml_pop_bool,sml_genre_count
count,86290,86290,86290,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0,86290.0
unique,3,66799,86203,,,,85378,,,,,3,,85355,,,,,,,,85286,,,,,3,,85264,,,,,,,
top,Rock,Intro,so so you think you can tell heaven from hel...,,,,merry little christmas let heart light trouble...,,,,,positive,,dreaming white christmas like one know treetop...,,,,,,,,dreaming white christmas like ones know gliste...,,,,,positive,,feel like home feel like feel like young feel ...,,,,,,,
freq,47406,50,3,,,,4,,,,,58208,,4,,,,,,,,4,,,,,58255,,4,,,,,,,
mean,,,,0.074988,0.079107,3.032924,,0.077923,0.076091,5.120146,0.01088,,0.096838,,0.006528,0.224591,0.005033,0.162128,0.001818,0.099977,91.470013,,0.079353,0.077949,5.011926,0.011444,,0.097358,,0.000361,0.010036,0.00032,0.018067,0.000156,0.000568,5.687131
std,,,,0.046306,0.04866,0.300625,,0.054131,0.052648,0.467359,0.046185,,0.213209,,0.02259,0.417315,0.019417,0.36857,0.015291,0.299971,351.424018,,0.053892,0.052175,0.429539,0.048637,,0.213692,,0.009666,0.099676,0.006215,0.133194,0.008506,0.023823,110.014459
min,,,,0.0,0.0,0.051852,,0.0,0.0,3.0,-0.493151,,-1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,3.0,-0.503979,,-1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.044049,0.047018,2.853833,,0.04351,0.04273,4.8125,-0.012953,,-0.035,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.044636,0.044276,4.733333,-0.013661,,-0.034707,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,,0.063109,0.067169,3.026846,,0.063089,0.061876,5.083333,0.008451,,0.092857,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.064996,0.064332,4.982759,0.009302,,0.093636,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,,0.092122,0.096649,3.206452,,0.092821,0.091149,5.384615,0.032754,,0.234662,,0.0,0.0,0.0,0.0,0.0,0.0,0.12,,0.095536,0.094261,5.25685,0.0348,,0.235714,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
s_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86290 entries, 0 to 86289
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   genre                   86290 non-null  object 
 1   song_name               86290 non-null  object 
 2   lyrics                  86290 non-null  object 
 3   full_word_count         86290 non-null  float64
 4   full_character_count    86290 non-null  float64
 5   full_avg_word_length    86290 non-null  float64
 6   med_lyrics              86290 non-null  object 
 7   med_word_count          86290 non-null  float64
 8   med_character_count     86290 non-null  float64
 9   med_avg_word_length     86290 non-null  float64
 10  med_content_affin       86290 non-null  float64
 11  med_sent_label          86290 non-null  object 
 12  med_sent_score          86290 non-null  float64
 13  med_vector              86290 non-null  object 
 14  med_rock_genre_count    86290 non-null

In [11]:
#Decision on what features to include, borne of EDA and visual steering.

df = pd.DataFrame((s_df), columns=['genre','full_word_count','full_character_count',
                                  'med_rock_bool','med_hiphop_bool','med_pop_bool',
                                  'sml_word_count','sml_character_count',
                                   'sml_sent_label','sml_sent_score','sml_vector'])

In [12]:
#Quick code to count the counts, so to speak.
#Returns a .info() looking print with the number of '0' in each column.  Subtracting
#that from total number gives the number of med_pop/rock/hiphop_scores that there are.
df[df == 0].count(axis=0)

genre                       0
full_word_count             1
full_character_count        2
med_rock_bool           66910
med_hiphop_bool         72300
med_pop_bool            77663
sml_word_count             16
sml_character_count         3
sml_sent_label              0
sml_sent_score           1473
sml_vector                  0
dtype: int64

Code to downsample and set up test train splits. 
Your df may vary.

In [13]:
seed = 99

#Separate majority and minority classes, twice.
majority = df[df.genre=='Rock']
minority = df[df.genre=='Hip Hop']

# Downsample majority class
majority_rock_downsampled = resample(majority, 
                                replace=False, # sample without replacement
                                n_samples=len(minority), # to match minority class
                                random_state=seed) # reproducible results

seed = 99

#Separate majority and minority classes, again.
majority = df[df.genre=='Pop']
minority = df[df.genre=='Hip Hop']

# Downsample majority class
majority_pop_downsampled = resample(majority, 
                                replace=False, # sample without replacement
                                n_samples=len(minority), # to match minority class
                                random_state=seed) # reproducible results

# Combine minority class with downsampled majority class
dfd = pd.concat([majority_rock_downsampled, majority_pop_downsampled, minority])

# Display new class counts
dfd['genre'].value_counts()
# dfd = 'data frame downsampled'

Hip Hop    13560
Pop        13560
Rock       13560
Name: genre, dtype: int64

In [14]:
#create a small df to use to get the pipeline working...
s_dfd = dfd.sample(frac=.2)

In [15]:
s_dfd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8136 entries, 59095 to 7801
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genre                 8136 non-null   object 
 1   full_word_count       8136 non-null   float64
 2   full_character_count  8136 non-null   float64
 3   med_rock_bool         8136 non-null   int64  
 4   med_hiphop_bool       8136 non-null   int64  
 5   med_pop_bool          8136 non-null   int64  
 6   sml_word_count        8136 non-null   float64
 7   sml_character_count   8136 non-null   float64
 8   sml_sent_label        8136 non-null   object 
 9   sml_sent_score        8136 non-null   float64
 10  sml_vector            8136 non-null   object 
dtypes: float64(5), int64(3), object(3)
memory usage: 762.8+ KB


In [19]:
s_dfd.genre.value_counts()

Rock       2719
Pop        2717
Hip Hop    2700
Name: genre, dtype: int64

In [21]:
s_dfd.sml_sent_label.value_counts()

positive    5465
negative    2550
neutral      121
Name: sml_sent_label, dtype: int64

In [16]:
# Create X and y.  Obviously, your df name may vary...
X = s_dfd.loc[ : , dfd.columns != 'genre']
y = s_dfd['genre']

In [17]:
# tts.
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6508, 10) (6508,)
(1628, 10) (1628,)


This is the pipeline code that worked, just.  

Issues:
1) visualize_model function tests on the same data that it trained upon.  Bagging classifiers overfit.
2) lst_label, cat_label.  Same thing, called differently in functions.
3) tf-IDF added
4) one hot encoding of sent_label.
5) has an imputer, don't need it.


In [47]:
#make label lists
cat_labels = list(set(s_dfd['genre']))
label_lst = list(set(s_dfd['genre']))
sent_label = ['positive','negative','neutral'] #It hates this, for some reason.

In [67]:
#Merging pipeline, columntransformer and feature union...breaking everywhere...

categorical = make_column_selector(pattern='sml_sent_label')
numeric = make_column_selector(pattern='sml_sent_score')
vector = make_column_selector(pattern='sml_vector')

model = Pipeline([
    ("columns", ColumnTransformer([
        ('ordinal', OrdinalEncoder(categories = 'auto', handle_unknown='use_encoded_value', 
                                  unknown_value=-1), categorical),
        ('scalar', MinMaxScaler(feature_range=(0,2)), numeric),
        ('tfidf', FeatureUnion([
            ('vectorize', Pipeline([
                ('counts', CountVectorizer(), vector),
                ('finally', TfidfTransformer(),vector)
            ])),
    ]), remainder='drop'),
    ], remainder='passthrough'))    
    ('clf', LinearSVC())
])

model.fit(X_train, y_train)

viz = ClassificationReport(model, is_fitted = True)
viz.score(X_test, y_test)
viz.show()

SyntaxError: invalid syntax (<ipython-input-67-a376631021b8>, line 15)

In [75]:
categorical = make_column_selector(pattern='sml_sent_label')
numeric = make_column_selector(pattern='sml_sent_score')
vector = make_column_selector(pattern='sml_vector')

model = Pipeline([
    ('tfidf', FeatureUnion(
        transformer_list = [
            ('vectorize', Pipeline([
                ('columns_1', ColumnTransformer([
                ('counts', CountVectorizer(), vector),
                ('finally', TfidfTransformer(),vector),
                ('really', TruncatedSVD(n_components=5, n_iter=7, random_state=42), vector),
            ], remainder='drop')
            )])),
        ('columns_2', ColumnTransformer([
        ('label', OrdinalEncoder(categories='auto', handle_unknown='use_encoded_value', 
                                  unknown_value=-1), categorical),
        ('scalar', MinMaxScaler(feature_range=(0,2)), numeric),
        ('skip_it', OrdinalEncoder(categories = 'auto', handle_unknown='use_encoded_value', 
                                   unknown_value=-1), vector)
    ], remainder='passthrough'))
    ],
        transformer_weights= {
            'vectorize': 0.2,
            'columns_2': 0.8,
        }
    )),
    ('clf', LinearSVC())
])

model.fit(X_train, y_train)

viz = ClassificationReport(model, is_fitted = True)
viz.score(X_test, y_test)
viz.show()

ValueError: could not convert string to float: 'midnight sky shining face feel wave embracing peace mind time tide wait wait yeah hate love trying reality dream hate love afraid need like hate love hate love faded photograph picture perfect love locked inside frame forgiving truth trust betrayed time heals left left hate love trying reality dream hate love afraid need like hate love hate love loneliness innocence heart learned hate truth love tearing apart hate love trying reality dream hate love afraid need like hate love hate love'

In [50]:
X_test.iloc[0]['sml_vector']

'tupac death row motherfucker dear mama caught sickness robbed slipped left witness wonder catch snitch shoot rich want commit murder damn got trapped walkin talkin kind win life wheel fortune chance spin got time cop trip try catch fuckin trigger happy let sucker snatch niggaz gettin jealous jealous tryin stash whip dive pump as peter picked pepper pick punk snatched like threw trunk punk thought bluffin swear nothin nice life wrestle mics listen scream tray deee went insane guess little finally brain new pull sentenced pen remember little bird told friend trouble mind old fiveoh blaow blaow turn fortyniners chorus eminem gone carry mourn rejoice time hear sound voice know lookin smilin feel thing baby feel pain smile tupac dear mama cop understand turned life crime came broken family uncle touch told scared hold kept deep inside let fuel anger homies mercy stranger brother cell hard black trapped livin hell shouldnta let catch instead livin sad jail coulda died free happy raped norm 

In [29]:
# Base pipeline, and first F1 score.

categorical = make_column_selector(pattern='sml_sent_label')
numeric = make_column_selector(pattern='sml_sent_score')
vector = make_column_selector(pattern='sml_vector')

models = [
    SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300)
]


def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
    ("columns", ColumnTransformer([
        ('ordinal', OrdinalEncoder(categories = sent_label, handle_unknown='use_encoded_value', 
                                  unknown_value=10), categorical),
        ('scalar', MinMaxScaler(feature_range=(0,2)), numeric),
        ('tfidf', TfidfTransformer(),vector),
    ], remainder='passthrough')),
        ("imputer",SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('estimator', estimator)
     )])

    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)

    expected  = y
    predicted = model.predict(X)

    # Compute and return F1 (harmonic mean of precision and recall)
    print("F1 SCORE {}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted,average='micro')))

In [38]:
for model in models:
    try:
        score_model(X, y, model)
        #visualize_model(X, y, model,cat_labels)
        #a_visualize_model(X, y, model,cat_labels)
        #conf_matrix(X, y, model,cat_labels)
    except Exception as exc:
        print(model, traceback.format_exc(), exc)

SVC(gamma='auto') Traceback (most recent call last):
  File "<ipython-input-38-639f895201b5>", line 3, in <module>
    score_model(X, y, model)
  File "<ipython-input-29-05a4600a6ded>", line 21, in score_model
    model = Pipeline([
  File "/opt/anaconda3/envs/lyricsenv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/opt/anaconda3/envs/lyricsenv/lib/python3.9/site-packages/sklearn/pipeline.py", line 118, in __init__
    self._validate_steps()
  File "/opt/anaconda3/envs/lyricsenv/lib/python3.9/site-packages/sklearn/pipeline.py", line 154, in _validate_steps
    names, estimators = zip(*self.steps)
ValueError: too many values to unpack (expected 2)
 too many values to unpack (expected 2)
NuSVC(gamma='auto') Traceback (most recent call last):
  File "<ipython-input-38-639f895201b5>", line 3, in <module>
    score_model(X, y, model)
  File "<ipython-input-29-05a4600a6ded>", line 21, in score_model
    model = Pipeline([


Need to add TF-IDF to the mix.

notes from Carter 

  ('title', TfidfVectorizer(max_features = 6000, stop_words = 'english', ngram_range=(1,1)), 'title'),
     ('content', TfidfVectorizer(max_features = 6000, stop_words = 'english', ngram_range=(1,1)), 'content')], n_jobs=3, verbose=True)
     
     https://github.com/georgetown-analytics/From-Russia-With-Love-fake-news-/blob/mast[…]tep_4_feature_vectorization_and_model_evaluation_nodomain.ipynb
     
     https://www.scikit-yb.org/en/latest/api/text/index.html

#sklearn text feature engineering

In [None]:
# This one is the problem.

def visualize_model(X, y, estimator,label_lst, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
    ("columns", ColumnTransformer([
        #('onehot', OneHotEncoder(), categorical),
        ('scalar', RobustScaler(), numeric),
        ('scalar2', RobustScaler(), numeric2),
    ], remainder='drop')),
        ("imputer",SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('estimator', estimator)
])

    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(
        model, classes=label_lst,
        cmap="YlGn", size=(600, 360), **kwargs
    )
    visualizer.fit(X, y)
    visualizer.score(X, y)
    visualizer.show()

In [None]:
def a_visualize_model(X, y, estimator,label_lst, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
    ("columns", ColumnTransformer([
        #('onehot', OneHotEncoder(), categorical),
        ('scalar', RobustScaler(), numeric),
        ('scalar2', RobustScaler(), numeric2),
    ], remainder='drop')),
        ("imputer",SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('estimator', estimator)
])

    #Create the train and test data
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

    
    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(
        model, classes=label_lst,
        cmap="YlGn", size=(600, 360), **kwargs
    )
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()

In [None]:
def conf_matrix(X,y,estimator,label_lst):
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
    ("columns", ColumnTransformer([
        #('onehot', OneHotEncoder(), categorical),
        ('scalar', RobustScaler(), numeric),
        ('scalar2', RobustScaler(), numeric2),
    ], remainder='drop')),
        ("imputer",SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('estimator', estimator)
])

    #Create the train and test data
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

    # Instantiate the visualizer with the classification model
    confusion_matrix(
        model,
        X_train, y_train, X_test, y_test,
        classes=label_lst
    )
    plt.tight_layout()

In [None]:
print("Current Time =", datetime.now())

In [None]:
# Run it.

for model in models:
    try:
        score_model(X, y, model)
        visualize_model(X, y, model,cat_labels)
        a_visualize_model(X, y, model,cat_labels)
        conf_matrix(X, y, model,cat_labels)
    except Exception as exc:
        print(model, 
              traceback.format_exc(), 
              exc)