### Pitchfork Content Sandbox

#### Vectorization
This section turns a collection of Pitchfork music reviews into numerical feature vectors through tokenization, counting, and normalization. The result is a bag-of-words representation: each review is described by its word occurrences, while the relative position of the words in the document is ignored entirely.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import psycopg2
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from pitchfork_content_functions import vectorize

# Display configuration for the notebook.
# `pd.option_context(...)` only builds a context manager; unless it is used in a
# `with` block it changes nothing, so the column-width setting has to go through
# `pd.set_option`. `None` is the supported spelling for "no truncation"
# (the legacy `-1` value is rejected by current pandas).
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1000
pd.options.display.max_seq_items = 5000

In [2]:
# Load the metal and jazz reviews from the local `pitchfork_reviews` database.
# The connection and cursor are released as soon as the rows are in memory, so a
# failed query or a re-run of this cell does not leave connections open.
conn = psycopg2.connect("dbname=pitchfork_reviews")
try:
    with conn.cursor() as cur:
        # NOTE(review): LIMIT without ORDER BY does not guarantee which rows
        # (or which order) come back if the table ever exceeds the limit.
        cur.execute("""
SELECT genres.genre, content.reviewid, content.content 
FROM content
INNER JOIN genres on content.reviewid = genres.reviewid
WHERE genres.genre = 'metal' OR genres.genre = 'jazz'
LIMIT 100000;
""")
        rows = cur.fetchall()
        # cursor.description holds one entry per result column; item 0 is its name.
        column_names = [column[0] for column in cur.description]
finally:
    conn.close()

# Passing the column names at construction time also works for an empty result
# set, where assigning `df.columns` afterwards would raise a length mismatch.
df = pd.DataFrame(rows, columns=column_names)

In [5]:
# Keep at most the first 10,000 rows of the query result as the working corpus,
# then preview the top of it.
corpus = df.iloc[:10000]
corpus.head()

Unnamed: 0,genre,reviewid,content
0,metal,22721,"Eight years, five albums, and two EPs in, the ..."
1,metal,22702,The Bay Area metal lifers who comprise Worm Ou...
2,jazz,22707,All is not well with Ray Charles’ catalog nowa...
3,metal,22559,"“Fuck your magazine,” growls Pantera frontman ..."
4,jazz,22664,What does the concept “slight freedom” mean wh...


In [7]:
# Sanity check: selecting the 'content' column yields a pandas Series.
type(corpus['content'].head())

pandas.core.series.Series

In [4]:
# Plain Python list of review texts; every entry is coerced to `str`.
documents = list(map(str, corpus['content']))

In [None]:
# Show the full text of the first review as a spot check.
documents[0]

In [None]:
def replace_punctuation_with_spaces(documents):
    """Return the documents with every punctuation character turned into a space.

    Parameters
    ----------
    documents : iterable of str
        The texts to clean. The input itself is left unmodified.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order. An empty
        input yields an empty list.

    Notes
    -----
    Only the characters in ``string.punctuation`` (ASCII punctuation) are
    replaced; typographic characters such as curly quotes are kept as-is.
    """
    # A single translation table maps each punctuation character to one space,
    # so document length and character offsets are preserved.
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return [doc.translate(to_space) for doc in documents]

In [None]:
# Keep the cleaned texts instead of discarding them, and preview only the start
# of the first one rather than echoing the whole corpus into the cell output.
documents_clean = replace_punctuation_with_spaces(documents)
documents_clean[0][:500]

#### Partition Data

In [None]:
from sklearn.model_selection import train_test_split
# Split the reviews into train and test sets: the review text is the feature and
# the genre is the label. `random_state` is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(corpus['content'], corpus['genre'], random_state=0)

In [None]:
# NOTE(review): this definition shadows the `vectorize` imported from
# pitchfork_content_functions at the top of the notebook — confirm which one is intended.
def vectorize(corpus):
    """Learn a vocabulary from `corpus` and return its feature names and term-document matrix.

    Parameters
    ----------
    corpus : iterable of str, or pandas.DataFrame
        The documents to vectorize. A DataFrame is accepted for convenience, in
        which case the texts are taken from its 'content' column.

    Returns
    -------
    tuple
        ``(feature_names, counts)`` where ``feature_names`` is a list of the
        vocabulary terms and ``counts`` is the dense document-by-term array.
    """
    # Iterating a DataFrame yields its column labels, not its rows, so pull the
    # review text out explicitly when the whole frame is passed in.
    if isinstance(corpus, pd.DataFrame):
        corpus = [str(text) for text in corpus['content']]

    vectorizer = CountVectorizer(lowercase=True)
    X = vectorizer.fit_transform(corpus)

    # `get_feature_names()` no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    # The dense array is kept for the cells below that index it directly; it can
    # be very large for a big vocabulary.
    return feature_names, X.toarray()

In [None]:
# Build the bag-of-words representation; the vectorize() defined in this notebook
# returns a (feature names, count matrix) pair.
vectorized = vectorize(corpus)

In [None]:
# Number of items in the value returned by vectorize().
len(vectorized)

In [None]:
# Inspect the container type and the length/shape of the first row of the count matrix.
type(vectorized), len(vectorized[1][0]), vectorized[1][0].shape

In [None]:
# Count vector of the first document (row 0 of the term-document matrix).
# With the numpy print threshold set to infinity above, the whole row is printed.
vectorized[1][0]

In [None]:
# The learned vocabulary (feature names), i.e. the first item returned by vectorize().
vectorized[0]

In [None]:
# Fit a transformer on the raw counts; use_idf=False disables the inverse-document-frequency
# reweighting, so only term frequencies are used.
tf_transformer = TfidfTransformer(use_idf=False).fit(vectorized[1])

In [None]:
# Convert the raw counts into term-frequency features with the fitted transformer
# and display the resulting matrix shape.
X_train_tf = tf_transformer.transform(vectorized[1])
X_train_tf.shape

In [None]:
# Shape of the transformed feature matrix.
X_train_tf.shape

In [None]:
# Train a multinomial Naive Bayes classifier on the term-frequency features,
# using each review's genre as its label. The feature rows were built from the
# same `corpus` rows, so features and labels line up one-to-one.
clf = MultinomialNB().fit(X_train_tf, corpus['genre'])

In [None]:
# Classify a couple of unseen snippets with the fitted Naive Bayes model.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# vectorize() does not hand back its fitted CountVectorizer, so build one pinned
# to the learned vocabulary. Passing the feature names in order makes the column
# indices match the matrix the transformer was fitted on.
count_vect = CountVectorizer(lowercase=True, vocabulary=list(vectorized[0]))
X_new_counts = count_vect.transform(docs_new)
# Reuse the transformer fitted above so new documents are scaled the same way.
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Assumes `clf` was trained on label values directly (e.g. genre names), so each
# prediction can be printed as-is.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))



In [None]:
# Bundle the three stages (token counts -> TfidfTransformer -> multinomial Naive Bayes)
# into a single estimator. The pipeline is only constructed here; it is not fitted
# in this cell.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])