### Pitchfork Content Sandbox

#### Vectorization
this section turns a collection of Pitchfork music reviews into numerical feature vectors, including tokenization, counting and normalization. Bag of Words representation where reviews are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [136]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import psycopg2
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from pitchfork_content_functions import vectorize

pd.option_context('display.max_colwidth', -1)
pd.options.display.max_rows = 1000
pd.options.display.max_seq_items = 5000

In [2]:
# create connection
conn = psycopg2.connect("dbname=pitchfork_reviews")
cur = conn.cursor()

# query
cur.execute("""
SELECT genres.genre, content.reviewid, content.content 
FROM content
INNER JOIN genres on content.reviewid = genres.reviewid;
""")

# cast to dataframe
df = pd.DataFrame(cur.fetchall())
df.columns = [i[0] for i in cur.description]

In [3]:
df.head(), len(df), df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22690 entries, 0 to 22689
Data columns (total 3 columns):
genre       20319 non-null object
reviewid    22690 non-null int64
content     22690 non-null object
dtypes: int64(1), object(2)
memory usage: 531.9+ KB


(        genre  reviewid                                            content
 0  electronic     22703  “Trip-hop” eventually became a ’90s punchline,...
 1       metal     22721  Eight years, five albums, and two EPs in, the ...
 2        rock     22659  Minneapolis’ Uranium Club seem to revel in bei...
 3        rock     22661  Kleenex began with a crash. It transpired one ...
 4  electronic     22725  It is impossible to consider a given release b...,
 22690,
 None)

In [4]:
data = df['content'].astype(str)
data.head(5)

0    “Trip-hop” eventually became a ’90s punchline,...
1    Eight years, five albums, and two EPs in, the ...
2    Minneapolis’ Uranium Club seem to revel in bei...
3    Kleenex began with a crash. It transpired one ...
4    It is impossible to consider a given release b...
Name: content, dtype: object

In [187]:
df['genre'].value_counts()

rock            9438
electronic      3874
experimental    1815
rap             1559
pop/r&b         1432
metal            862
folk/country     685
jazz             435
global           219
Name: genre, dtype: int64

In [156]:
feature_names = df['genre'].astype(str)
feature_names[:5]

0    electronic
1         metal
2          rock
3          rock
4    electronic
Name: genre, dtype: object

In [188]:
label_encoder = LabelEncoder()
feature_names_arr = label_encoder.fit_transform(feature_names)
feature_names_arr.shape

(22690,)

In [190]:
len(feature_names), type(feature_names)

(22690, pandas.core.series.Series)

In [194]:
np.vstack((feature_names_arr[:10], feature_names[:10]))

array([[1, 6, 9, 9, 1, 0, 8, 1, 2, 8],
       ['electronic', 'metal', 'rock', 'rock', 'electronic', 'None',
        'rap', 'electronic', 'experimental', 'rap']], dtype=object)

In [21]:
# ten classes #TODO (Lee) - drop Nones
label_encoder.classes_, len(label_encoder.classes_)

(array(['None', 'electronic', 'experimental', 'folk/country', 'global',
        'jazz', 'metal', 'pop/r&b', 'rap', 'rock'], dtype=object), 10)

In [17]:
len(data), len(feature_names_arr)

(22690, 22690)

In [22]:
feature_name_arr[0], data[0]

(1,
 "“Trip-hop” eventually became a ’90s punchline, a music-press shorthand for “overhyped hotel lounge music.” But today, the much-maligned subgenre almost feels like a secret precedent. Listen to any of the canonical Bristol-scene albums of the mid-late ’90s, when the genre was starting to chafe against its boundaries, and you’d think the claustrophobic, anxious 21st century started a few years ahead of schedule. Looked at from the right angle, trip-hop\xa0is part of an unbroken chain that runs from the abrasion of ’80s post-punk to the ruminative pop-R&B-dance fusion of the moment.\xa0The best of it has aged far more gracefully (and forcefully) than anything recorded in the waning days of the record industry’s pre-filesharing monomania has any right to. Tricky rebelled against being attached at the hip to a scene he was already looking to shed and decamped for Jamaica to record a more aggressive, bristling-energy mutation of his style in ’96; the name\xa0Pre-Millennium Tension is t

In [None]:
def replace_punctuation_with_spaces(documents):
    """Return text wth all punctuation turned into spaces"""
    output = ''
    for doc in documents:
        for char in doc:
            if char in punctuation:
                char = ' '
            output += char
    print(doc)
    return documents

In [None]:
replace_punctuation_with_spaces(documents)

#### Partition Data

In [24]:
# splits data into train and test sets

X_train, X_test, y_train, y_test = train_test_split(data, feature_name_arr, test_size=0.30, random_state=3)

In [66]:
len(X_train), len(X_test), len(data) * .7

(15883, 6807, 15882.999999999998)

In [26]:
len(X_train), len(X_test), len(y_train), len(y_test)

(15883, 6807, 15883, 6807)

#### Vectorize data

In [None]:
# def vectorize(corpus):
#    """learns vocab dictionary and returns feature names and term-document matrix"""
    
# count_vect = CountVectorizer()X_train_counts = count_vect.fit_transform(twenty_train.data)
  #  X_train_counts.shape

In [None]:
# count_vectorizer = CountVectorizer(lowercase=True)
# X_train_count = count_vectorizer.fit_transform(X_train)
# vectorizer.get_feature_names(), X.toarray()

In [111]:
# returns sparse matrices for every document
count_vect = CountVectorizer(max_features=1000, stop_words='english')
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(15883, 1000)

In [168]:
X_train_counts.item(1)

AttributeError: item not found

In [166]:
X_train_counts[0]

<1x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 120 stored elements in Compressed Sparse Row format>

In [174]:
for i in range(1000):
    print(X_train_counts[0,i])

0
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
3
0
0
0
2
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
1
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
2
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
4
0
0
0
0
0
0
2
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
4
1
1
0
0
0
0
1
4
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
3
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
5
1
0
0
0
0
0
0
0
0
0


In [163]:
X_train_counts.item[0]

AttributeError: item not found

In [161]:
X_train_counts[1]

<1x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 185 stored elements in Compressed Sparse Row format>

In [112]:
X_train_counts[1]

<1x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 185 stored elements in Compressed Sparse Row format>

In [175]:
count_vect.vocabulary_

{'despite': 203,
 'record': 708,
 'hip': 402,
 'hop': 410,
 'music': 573,
 'truth': 914,
 'likely': 490,
 'artist': 48,
 'case': 120,
 'year': 994,
 'old': 603,
 'chicago': 133,
 'rapper': 695,
 'piece': 635,
 'new': 586,
 'day': 192,
 'half': 378,
 'duo': 240,
 'white': 970,
 'alongside': 34,
 'career': 119,
 'second': 763,
 'release': 719,
 'follow': 319,
 'debut': 197,
 'tape': 875,
 'recorded': 709,
 'result': 729,
 'feel': 298,
 'time': 892,
 'just': 453,
 'good': 363,
 'times': 893,
 'great': 366,
 'project': 672,
 'image': 422,
 'feels': 300,
 'fresh': 329,
 'unique': 928,
 'straightforward': 843,
 'members': 540,
 'recent': 706,
 'producer': 668,
 'west': 969,
 'dj': 220,
 'audience': 54,
 'handful': 380,
 'familiar': 286,
 'building': 111,
 'sounds': 817,
 'like': 489,
 'house': 416,
 'rap': 694,
 'lot': 512,
 'best': 81,
 'example': 274,
 'lead': 477,
 'single': 793,
 'return': 731,
 'produced': 667,
 'track': 907,
 'makes': 525,
 'minutes': 554,
 'sound': 814,
 'simple': 788

In [159]:
# after fitting, the vectorizer, "count_vect" has built a dictionary of feature indices
# The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.
len(count_vect.vocabulary_)

1000

#### Term frequencies (TF) and  term frequency inverse document frequency (TF-IDF) using `TfidfTransformer`
- TF divides the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

- TF-IDF downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

#### tf

In [115]:
# TODO (Lee) - correct use_idf true here, I think, if tf
# computes TF-IDF using TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts) # fits/learns idf vector (global term weights)

In [116]:
tf_transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [117]:
X_train_tf = tf_transformer.transform(X_train_counts) # transform count matrix to a tf representation

In [183]:
# TODO (Lee) (I think) X_train_tf[i,j] is the occurrence of word j in the text i.

X_train_tf[0,0]

0.0

In [180]:
# TODO (Lee) - complete this comment to clarify what we are seeing here
X_train_tf.sum(axis=0).shape, X_train_tf.sum(axis=0)

((1, 1000),
 matrix([[ 137.51015775,  103.135721  ,   96.80358197,   77.37748482,
            78.2205983 ,   79.59455705,   84.3065565 ,   78.03532453,
            78.77535199,   77.4716182 ,   84.96659469,  105.4172408 ,
           120.10951872,  133.49878702,   89.15189344,  105.88899019,
            79.09068884,   92.29527996,  243.55817672,  125.76477213,
           117.33546065,   97.05648937,  234.08323994,   99.79695599,
            80.22277187,   78.0521271 ,   93.07985249,  137.50217039,
           158.45257084,  130.22479728,   78.95446988,  145.43400612,
          1042.41887232,  300.48625761,   77.43882624,  155.59280758,
            90.6535535 ,  186.01823481,   80.84424303,   83.02194401,
           102.21079333,   86.06711324,   77.56557062,  186.49768797,
           163.67911366,   77.11437244,  147.77292728,  208.44706703,
           174.13598211,  191.19473957,  102.08475186,  101.7008138 ,
            95.66464846,  143.39225399,  119.08242407,   89.23218518,
        

In [118]:
# TODO (Lee) - I'm not sure what this did, why it is 35788
X_train_tf.shape
(2257, 35788)

(2257, 35788)

#### tf-idf

In [119]:
# computes TF-IDF using TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts) # fits/learns idf vector (global term weights)

In [120]:
tf_transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [121]:
X_train_tfidf = tf_transformer.transform(X_train_counts) # transform count matrix to a tf-idf representation

In [122]:
X_train_tfidf.shape

(15883, 1000)

### Train naive bayes classifier to predict genres on test music reviews

In [123]:
# train Naive Bayes Classifier on training features (X_train_tfidf) and training targets (y_train)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [126]:
# vectorizes X_test set, similar to above for train set EXCEPT call transform, NOT fit_transform, since fit on train set
# since they have already been fit to the training set:

X_test_counts = count_vect.transform(X_test)

In [143]:
X_test_counts

<6807x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 788147 stored elements in Compressed Sparse Row format>

In [144]:
X_test_tfidf = tf_transformer.transform(X_test_counts)

In [129]:
preds = clf.predict(X_test_tfidf)
probas = clf.predict_proba(X_test_tfidf)

In [137]:
tf_vect = TfidfVectorizer()
tf_vect.fit(X_train)
X_train_tf = tf_vect.transform(X_train)

In [138]:
model = MultinomialNB()
model.fit(X_train_tf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [139]:
X_test_tfidf = tf_vect.transform(X_test)

In [150]:
preds = model.predict(X_test_tfidf)
probas = model.predict_proba(X_test_tfidf)

In [151]:
X_test.shape, X_test_tfidf.shape

((6807,), (6807, 1000))

In [152]:
from collections import Counter
Counter(preds)

Counter({9: 5634, 1: 756, 2: 16, 8: 379, 0: 9, 7: 1, 6: 12})

In [153]:
Counter(y_test)

Counter({9: 2857,
         3: 192,
         1: 1134,
         2: 544,
         7: 431,
         0: 720,
         6: 276,
         4: 72,
         8: 459,
         5: 122})

In [154]:
Counter(probas)

TypeError: unhashable type: 'numpy.ndarray'

In [132]:
# command to look up
np.vstack((y_test[:20], preds[:20]))

array([[9, 3, 1, 1, 2, 7, 1, 9, 9, 9, 1, 0, 6, 0, 0, 9, 9, 4, 8, 9],
       [9, 9, 9, 1, 1, 9, 1, 9, 9, 9, 9, 9, 9, 9, 1, 9, 9, 2, 8, 9]])

In [184]:
for doc, category in zip(X_test[:20], predicted[:20]):
    print('%r => %s' % (doc[:20], category))

'So many bands have r' => [4.86136731e-05 2.32277598e-03 2.78910882e-05 8.58862600e-09
 1.86598698e-11 5.97124146e-10 5.39447446e-08 1.44967186e-06
 2.23686710e-06 9.97596970e-01]
'To this point in the' => [4.42808636e-04 4.37290380e-03 1.24089048e-04 2.18955817e-06
 2.17771975e-09 5.33267279e-08 9.28196161e-07 1.85594843e-05
 1.64447993e-05 9.95022021e-01]
'When Iggy Pop comman' => [5.11845014e-05 1.08177411e-03 4.58659235e-06 4.92078359e-09
 1.37305240e-11 6.18764448e-10 2.93508005e-08 5.70115762e-07
 1.12514015e-06 9.98860725e-01]
"John Roberts isn't m" => [4.52951610e-04 2.20200676e-02 1.01694221e-04 3.47234841e-07
 6.39373934e-10 3.43587162e-08 3.93735917e-07 1.42280179e-05
 1.70691915e-05 9.77393213e-01]
'Everyone who reads a' => [1.01779702e-05 2.60649796e-03 3.96898503e-06 1.19399253e-10
 4.27709584e-14 5.93069416e-12 5.29060039e-10 8.05002214e-08
 1.38594034e-07 9.97379135e-01]
'For such a simple wo' => [3.80820405e-04 4.21960154e-03 5.83771942e-05 4.99608357e-07
 4.40265495e-

#### Evaluation Metrics for Naive Bayes Classifier

In [None]:
np.mean(predicted == twenty_test.target)      

In [None]:
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [None]:
#scikit learn naive bayes
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))



In [None]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
SANDBOX

In [None]:
# WHERE genres.genre = 'metal' OR genres.genre = 'jazz

In [None]:
import numpy as np
import sklearn.datasets

examples = []
examples.append('some text')
examples.append('another example text')
examples.append('example 3')

target = np.zeros((3,), dtype=np.int64)
target[0] = 0
target[1] = 1
target[2] = 0
dataset = sklearn.datasets.base.Bunch(data=examples, target=target)

In [76]:
len(X_test), len(y_test)

(6807, 6807)

In [74]:
y_test

array([9, 3, 1, 1, 2, 7, 1, 9, 9, 9, 1, 0, 6, 0, 0, 9, 9, 4, 8, 9, 4, 9,
       2, 9, 2, 7, 9, 9, 3, 1, 9, 9, 1, 3, 9, 8, 7, 7, 1, 1, 9, 0, 9, 1,
       0, 1, 9, 2, 1, 7, 9, 8, 9, 8, 9, 1, 9, 9, 9, 1, 7, 9, 2, 1, 2, 9,
       7, 1, 7, 9, 9, 2, 9, 9, 9, 1, 1, 2, 5, 2, 2, 0, 3, 8, 0, 9, 9, 9,
       9, 9, 7, 9, 2, 0, 1, 8, 8, 9, 0, 9, 9, 7, 1, 9, 1, 0, 1, 2, 9, 0,
       2, 0, 1, 9, 9, 2, 9, 9, 1, 9, 9, 2, 9, 9, 9, 1, 1, 1, 0, 9, 9, 4,
       1, 9, 6, 8, 7, 9, 8, 6, 9, 9, 9, 9, 9, 0, 9, 9, 7, 1, 1, 9, 9, 1,
       9, 1, 9, 1, 9, 9, 1, 9, 9, 9, 9, 2, 1, 8, 9, 3, 1, 9, 8, 0, 2, 9,
       9, 7, 1, 2, 9, 1, 9, 0, 8, 9, 9, 9, 1, 9, 6, 9, 6, 3, 5, 1, 0, 9,
       1, 3, 0, 9, 2, 9, 2, 6, 1, 1, 2, 1, 8, 8, 9, 8, 2, 9, 0, 9, 1, 1,
       7, 9, 9, 9, 4, 9, 6, 0, 5, 9, 9, 9, 0, 8, 6, 9, 9, 9, 2, 9, 1, 3,
       9, 9, 9, 1, 1, 9, 9, 9, 6, 1, 9, 8, 9, 0, 9, 7, 1, 1, 7, 9, 1, 1,
       9, 6, 9, 1, 9, 8, 9, 9, 9, 3, 0, 9, 9, 9, 9, 8, 9, 9, 0, 1, 1, 6,
       9, 9, 9, 9, 7, 4, 7, 0, 9, 8, 9, 8, 9, 8, 9,

In [155]:
predicted

array([[4.86136731e-05, 2.32277598e-03, 2.78910882e-05, 8.58862600e-09,
        1.86598698e-11, 5.97124146e-10, 5.39447446e-08, 1.44967186e-06,
        2.23686710e-06, 9.97596970e-01],
       [4.42808636e-04, 4.37290380e-03, 1.24089048e-04, 2.18955817e-06,
        2.17771975e-09, 5.33267279e-08, 9.28196161e-07, 1.85594843e-05,
        1.64447993e-05, 9.95022021e-01],
       [5.11845014e-05, 1.08177411e-03, 4.58659235e-06, 4.92078359e-09,
        1.37305240e-11, 6.18764448e-10, 2.93508005e-08, 5.70115762e-07,
        1.12514015e-06, 9.98860725e-01],
       [4.52951610e-04, 2.20200676e-02, 1.01694221e-04, 3.47234841e-07,
        6.39373934e-10, 3.43587162e-08, 3.93735917e-07, 1.42280179e-05,
        1.70691915e-05, 9.77393213e-01],
       [1.01779702e-05, 2.60649796e-03, 3.96898503e-06, 1.19399253e-10,
        4.27709584e-14, 5.93069416e-12, 5.29060039e-10, 8.05002214e-08,
        1.38594034e-07, 9.97379135e-01],
       [3.80820405e-04, 4.21960154e-03, 5.83771942e-05, 4.99608357e-07,
   