# ArtistReviewAnalyzer Class Tester

Because I'm too lazy to run the file itself so I'm just forcefully doing it through jupyter lol

In [32]:
from pprint import pprint
import numpy as np

## helper_functions.py

In [1]:
import json

import pandas as pd
import sqlite3

import re
import unidecode as ud


def clean_str(input_artist, cap_code=0):
    """
    Takes in string, returns a unicode-friendly and stripped version of the string.
    """
    return_artist_str = input_artist

    # === REGEX REPLACE ===
    repl_tuples = [(r'(^\s+)|(\s+$)', ''),  # whitespace at beg/end of string
                   (r'\s+', ' '),  # Remove double spaces
                   (r'[\n|\r|\t|\0]+', ' ')
                   ]
    for ptn, repl_str in repl_tuples:
        return_artist_str = re.sub(ptn, repl_str, return_artist_str)

    # === UNICODE HANDLING ===
    return_artist_str = ud.unidecode(return_artist_str)

    if cap_code == -1:
        return_artist_str = return_artist_str.lower()
    elif cap_code == 1:
        return_artist_str = return_artist_str.upper()

    return return_artist_str


def read_mard_json(input_filename):
    """
    Takes in file name of JSON, returns a list of dictionaries in which
    each element is a row with columns (as the keys).
    """
    loaded_data_ = []
    with open(input_filename, 'r') as file_:
        loaded_string_ = file_.read()
        loaded_data_ = [json.loads(s) for s in loaded_string_.split('\n') if s is not None and len(s) > 0]
        file_.close()
    return loaded_data_


def read_mard_json_as_df(input_filename):
    """
    Takes in file name of JSON, returns a list of dictionaries in which
    each element is a row with columns (as the keys).
    """
    loaded_data_ = []
    with open(input_filename, 'r') as file_:
        loaded_string_ = file_.read()
        loaded_data_ = [json.loads(s) for s in loaded_string_.split('\n') if s is not None and len(s) > 0]
        file_.close()
    return pd.DataFrame(loaded_data_)


def run_query_on_sqlite_db(input_query, input_filename):
    """

    Returns a Pandas DataFrame object containing the query results,
    given the user's query and the filename for the sqlite database.

    Input:
     - input_query: string representation of the SQL query to run on the sqlite db
     - input_filename: the file location of the sqlite database

    """
    conn_ = sqlite3.connect(input_filename)
    df_ = pd.read_sql_query(input_query, conn_)
    conn_.close()
    return df_


## ArtistReviewAnalyzer.py

In [62]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import spacy
import itertools


class ArtistReviewAnalyzer:
    """
    Analyzes artist reviews. Forms into useful data structures for NLP and topic modeling.
    """
    def __init__(self):
        # Basic data
        self.file_loc = None
        self.raw = None
        self.artists_list = None
        self.all_tokens = None

        # Stop-Words for tokenization
        self.stop_words = stopwords.words('english')  # Basic stop words
        self.update_stopwords(['album', 'music', 'cd', 'track', 'song', 'sound'])

        # Vectorizers
        self.tfidf_vectorizer = None
        self.count_vectorizer = None
        self.tfidf_matrix = None
        self.count_matrix = None
        self.__tokenizer = None
        self.tokenized_reviews = None

        # NLP tools
        self.lda_model = None
        self.bigrams = None
        self.lemmatized_text = None
        self.nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])  # Spacy tool


    def load_data(self, file_loc):
        """
        Loads reviews data (JSON file) into a dictionary.
        """
        # Save file location
        self.file_loc = file_loc

        with open(file_loc, 'r') as file:
            d = json.load(file)
            file.close()
        self.raw = d  # Save raw dictionary

        self.artists_list = list(self.raw.keys())  # Save artists list

        self.build_count_vectorizer(2, 0.8)  # Build count vec so we have a tokenizer

        # Handling tokenized list of reviews
        self.tokenized_reviews = self.__build_reviews_list(do_tokenize=True)
        self.remove_stopwords_from_tokenized_list()
        self.set_all_tokens()  # Get tokenized corpus

        return self.raw

    def refresh(self):
        self.build_count_vectorizer(2, 0.8)
        self.tokenized_reviews = self.__build_reviews_list(do_tokenize=True)
        self.remove_stopwords_from_tokenized_list()
        self.set_all_tokens()

    def build_count_vectorizer(self, min_df, max_df):
        if self.count_vectorizer is not None:
            print("WARNING: Count vectorizer has already been built.")
            return self.count_vectorizer
        self.count_vectorizer = CountVectorizer(analyzer='word', stop_words=self.stop_words, min_df=min_df, max_df=max_df)
        self.__tokenizer = self.count_vectorizer.build_tokenizer()
        return self.count_vectorizer

    def get_count_matrix(self):
        if self.count_vectorizer is None:
            return None

        if self.count_matrix is None:
            # TODO: self.raw is not truly reflective of the data since we clean it.
            self.count_matrix = self.count_vectorizer.fit_transform(self.all_tokens)
        return self.count_matrix

    def build_tfidf_vectorizer(self, min_df, max_df):
        self.tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=self.stop_words, min_df=min_df, max_df=max_df)
        return self.tfidf_vectorizer

    def get_tfidf_matrix(self):
        if self.tfidf_vectorizer is None:
            return None

        if self.tfidf_matrix is None:
            # TODO: self.raw is not truly reflective of the data since we clean it.
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.all_tokens)
        return self.tfidf_matrix

    def tokenize(self, input_string):
        if self.__tokenizer is None:
            print("WARNING: Tokenizer not set.")
            return None
        return self.__tokenizer(input_string)

    def update_stopwords(self, word_list):
        self.stop_words.extend(word_list)
        return None

    def remove_stopwords_from_tokenized_list(self):
        self.tokenized_reviews = \
            [[w for w in artist_review if w not in self.stop_words] for artist_review in self.tokenized_reviews]

    def build_lda_model(self, num_topics=20):
        self.bigrams = self.make_bigrams()
        self.lemmatized_text = self.lemmatize()
        id2word = corpora.Dictionary(self.lemmatized_text)
        corpus = [id2word.doc2bow(w) for w in self.lemmatized_text]
        self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
        return self.lda_model

    def make_bigrams(self):
        bg = gensim.models.Phrases(self.tokenized_reviews, min_count=5, threshold=100)
        bg_mod = gensim.models.phrases.Phraser(bg)
        return [bg_mod[d] for d in self.tokenized_reviews]

    def lemmatize(self, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        texts_out = []
        for review_words in self.tokenized_reviews:
            joined_words = self.nlp(" ".join(review_words))
            texts_out.append([token.lemma_ for token in joined_words if token.pos_ in allowed_postags])
        return texts_out

    def set_all_tokens(self):
        self.all_tokens = list(itertools.chain.from_iterable(self.tokenized_reviews))
        return self.all_tokens

    def get_all_tokens(self):
        return self.all_tokens

    def __build_reviews_list(self, do_tokenize=False):
        """
        Builds a list of reviews for each artist.
        """
        consolidated_reviews = []
        for a in self.artists_list:
            if do_tokenize:
                consolidated_reviews.append(self.tokenize(clean_str(" ".join(self.raw[a]))))
            else:
                consolidated_reviews.append(clean_str(" ".join(self.raw[a])))
        return consolidated_reviews

## Test runs

In [4]:
json_file_location = "../data/processed/artist_reviews.json"

In [8]:
analyzer = ArtistReviewAnalyzer()

In [10]:
analyzer.load_data(json_file_location);

In [12]:
analyzer.build_lda_model();

In [15]:
pprint(analyzer.lda_model.print_topics())

[(0,
  '0.012*"quot" + 0.008*"good" + 0.007*"make" + 0.006*"song" + 0.006*"well" + '
  '0.005*"great" + 0.005*"even" + 0.005*"hear" + 0.005*"work" + 0.004*"first"'),
 (1,
  '0.008*"get" + 0.008*"good" + 0.008*"great" + 0.007*"song" + 0.006*"well" + '
  '0.006*"record" + 0.005*"first" + 0.005*"love" + 0.005*"make" + '
  '0.005*"band"'),
 (2,
  '0.010*"quot" + 0.008*"song" + 0.007*"great" + 0.007*"good" + 0.006*"well" + '
  '0.006*"get" + 0.006*"record" + 0.005*"time" + 0.005*"make" + 0.005*"band"'),
 (3,
  '0.011*"song" + 0.009*"good" + 0.008*"love" + 0.008*"make" + 0.007*"get" + '
  '0.006*"time" + 0.006*"great" + 0.005*"cd" + 0.005*"well" + 0.005*"hear"'),
 (4,
  '0.008*"make" + 0.006*"record" + 0.006*"band" + 0.006*"good" + 0.005*"well" '
  '+ 0.005*"go" + 0.005*"time" + 0.004*"get" + 0.004*"song" + 0.004*"first"'),
 (5,
  '0.008*"make" + 0.007*"song" + 0.006*"get" + 0.006*"well" + 0.006*"good" + '
  '0.006*"quot" + 0.006*"band" + 0.006*"time" + 0.005*"record" + 0.004*"say"'),
 (6,
 

In [18]:
analyzer = ArtistReviewAnalyzer()
analyzer.update_stopwords(['good', 'band', 'record', 'make', 'great', 'hear'])
analyzer.load_data(json_file_location);

In [19]:
analyzer.build_lda_model(num_topics=40);

In [20]:
pprint(analyzer.lda_model.print_topics())

[(10,
  '0.006*"even" + 0.005*"song" + 0.005*"get" + 0.005*"time" + 0.004*"well" + '
  '0.004*"work" + 0.004*"make" + 0.004*"sound" + 0.004*"come" + 0.004*"go"'),
 (22,
  '0.008*"get" + 0.006*"time" + 0.006*"song" + 0.006*"go" + 0.005*"quot" + '
  '0.005*"first" + 0.005*"well" + 0.004*"love" + 0.004*"make" + 0.004*"good"'),
 (39,
  '0.007*"song" + 0.006*"get" + 0.005*"come" + 0.005*"time" + 0.005*"new" + '
  '0.004*"play" + 0.004*"first" + 0.004*"love" + 0.004*"work" + 0.004*"good"'),
 (1,
  '0.010*"song" + 0.008*"quot" + 0.007*"well" + 0.006*"time" + 0.006*"get" + '
  '0.005*"go" + 0.005*"love" + 0.005*"first" + 0.004*"come" + 0.004*"year"'),
 (38,
  '0.008*"song" + 0.006*"go" + 0.005*"even" + 0.005*"first" + 0.005*"make" + '
  '0.004*"well" + 0.004*"work" + 0.004*"get" + 0.004*"be" + 0.004*"voice"'),
 (11,
  '0.008*"get" + 0.007*"song" + 0.006*"love" + 0.006*"quot" + 0.005*"make" + '
  '0.005*"well" + 0.005*"time" + 0.004*"find" + 0.004*"go" + 0.004*"work"'),
 (12,
  '0.008*"quot" + 

In [28]:
a2 = ArtistReviewAnalyzer()
a2.load_data(json_file_location)
a2.build_count_vectorizer(min_df=2,max_df=0.8)
a2.build_tfidf_vectorizer(min_df=2,max_df=0.8)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.8, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [30]:
a2_tfidf = a2.get_tfidf_matrix()

In [33]:
np.sum(a2_tfidf)

871.0782534277441

In [34]:
a2_count = a2.get_count_matrix()

In [35]:
np.sum(a2_count)

1008

In [37]:
a2.tfidf_vectorizer.get_feature_names()

['aaron', 'adrian', 'aguilar', 'alberto', 'alejandro', 'alex', 'alexander', 'alexandra', 'alexandre', 'ali', 'alice', 'america', 'ana', 'anderson', 'andre', 'andrew', 'andrews', 'andy', 'antonio', 'band', 'banda', 'bang', 'beach', 'bear', 'beck', 'belinda', 'ben', 'benjamin', 'benny', 'big', 'bill', 'billy', 'black', 'blanca', 'blind', 'blue', 'blues', 'bob', 'bobby', 'boy', 'brian', 'bronco', 'brown', 'bruce', 'carl', 'carlos', 'cartel', 'cecilia', 'chain', 'charlie', 'chico', 'chris', 'christina', 'church', 'circle', 'city', 'claudio', 'club', 'cold', 'cole', 'collective', 'conjunto', 'country', 'crazy', 'cruz', 'culture', 'da', 'daddy', 'dan', 'daniel', 'dave', 'david', 'day', 'de', 'dead', 'death', 'del', 'di', 'die', 'diego', 'dj', 'dog', 'donna', 'dr', 'dragon', 'duncan', 'durango', 'earth', 'eddie', 'edgar', 'el', 'electric', 'elliott', 'elvis', 'emilio', 'english', 'enrique', 'eric', 'erik', 'faith', 'fernandez', 'fernando', 'ferreira', 'fiona', 'francisco', 'frankie', 'franz',

In [38]:
a2.count_vectorizer.get_feature_names()

['aaron', 'adrian', 'aguilar', 'alberto', 'alejandro', 'alex', 'alexander', 'alexandra', 'alexandre', 'ali', 'alice', 'america', 'ana', 'anderson', 'andre', 'andrew', 'andrews', 'andy', 'antonio', 'band', 'banda', 'bang', 'beach', 'bear', 'beck', 'belinda', 'ben', 'benjamin', 'benny', 'big', 'bill', 'billy', 'black', 'blanca', 'blind', 'blue', 'blues', 'bob', 'bobby', 'boy', 'brian', 'bronco', 'brown', 'bruce', 'carl', 'carlos', 'cartel', 'cecilia', 'chain', 'charlie', 'chico', 'chris', 'christina', 'church', 'circle', 'city', 'claudio', 'club', 'cold', 'cole', 'collective', 'conjunto', 'country', 'crazy', 'cruz', 'culture', 'da', 'daddy', 'dan', 'daniel', 'dave', 'david', 'day', 'de', 'dead', 'death', 'del', 'di', 'die', 'diego', 'dj', 'dog', 'donna', 'dr', 'dragon', 'duncan', 'durango', 'earth', 'eddie', 'edgar', 'el', 'electric', 'elliott', 'elvis', 'emilio', 'english', 'enrique', 'eric', 'erik', 'faith', 'fernandez', 'fernando', 'ferreira', 'fiona', 'francisco', 'frankie', 'franz',

In [39]:
print(len(a2.count_vectorizer.get_feature_names()))

329


In [40]:
a2_count.shape

(1784, 329)

In [56]:
w_freq = np.array(np.sum(a2_count, axis=0))

In [46]:
i_to_w = a2.count_vectorizer.get_feature_names()
w_to_i = {w:i for i,w in enumerate(a2.count_vectorizer.get_feature_names())}

In [47]:
i_to_w[20]

'banda'

In [63]:
a3 = ArtistReviewAnalyzer()
a3.load_data(json_file_location)
a3.build_count_vectorizer(min_df=2,max_df=0.8)
a3.build_tfidf_vectorizer(min_df=2,max_df=0.8)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.8, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [64]:
a3_tfidf = a3.get_tfidf_matrix()
a3_count = a3.get_count_matrix()

In [66]:
a3.count_vectorizer.get_feature_names();

In [107]:
def get_top_n(input_analyzer, n):
    top_n = []
    feats = input_analyzer.count_vectorizer.get_feature_names()
    w_counts = np.array(np.sum(input_analyzer.get_count_matrix(), axis=0))[0]
    sorted_indices = np.argsort(w_counts)[::-1][:n]
    
    for i in sorted_indices:
        top_n.append((feats[i], w_counts[i]))
    return top_n

def get_top_n_words(input_analyzer, n):
    top_n = []
    feats = input_analyzer.count_vectorizer.get_feature_names()
    w_counts = np.array(np.sum(input_analyzer.get_count_matrix(), axis=0))[0]
    sorted_indices = np.argsort(w_counts)[::-1][:n]
    
    for i in sorted_indices:
        top_n.append(feats[i])
    return top_n

def get_bottom_n_words(input_analyzer, n):
    top_n = []
    feats = input_analyzer.count_vectorizer.get_feature_names()
    w_counts = np.array(np.sum(input_analyzer.get_count_matrix(), axis=0))[0]
    sorted_indices = np.argsort(w_counts)[:n]
    
    for i in sorted_indices:
        top_n.append(feats[i])
    return top_n

In [100]:
pprint(get_top_n(a3, 50))

[('like', 13878),
 ('one', 12812),
 ('songs', 9463),
 ('quot', 7580),
 ('great', 6798),
 ('love', 6094),
 ('band', 5880),
 ('good', 5873),
 ('first', 5803),
 ('time', 5604),
 ('best', 5572),
 ('even', 5272),
 ('de', 4862),
 ('new', 4819),
 ('much', 4652),
 ('would', 4491),
 ('well', 4426),
 ('get', 4336),
 ('also', 4168),
 ('still', 4035),
 ('two', 3994),
 ('rock', 3792),
 ('tracks', 3716),
 ('really', 3672),
 ('way', 3529),
 ('record', 3344),
 ('pop', 3328),
 ('years', 3326),
 ('voice', 3256),
 ('could', 3192),
 ('many', 3064),
 ('recording', 3057),
 ('work', 3052),
 ('back', 3039),
 ('never', 3024),
 ('sounds', 2974),
 ('la', 2930),
 ('live', 2918),
 ('better', 2876),
 ('make', 2762),
 ('albums', 2677),
 ('though', 2553),
 ('ever', 2522),
 ('guitar', 2521),
 ('little', 2511),
 ('version', 2472),
 ('heard', 2470),
 ('know', 2464),
 ('listen', 2457),
 ('every', 2443)]


In [102]:
a3.update_stopwords(get_top_n_words(a3, 50))

In [103]:
a3.refresh()



In [105]:
a3.build_lda_model();

In [106]:
pprint(a3.lda_model.print_topics())

[(0,
  '0.005*"come" + 0.004*"take" + 0.004*"play" + 0.004*"say" + 0.004*"make" + '
  '0.004*"go" + 0.004*"give" + 0.004*"get" + 0.003*"feel" + 0.003*"seem"'),
 (1,
  '0.004*"be" + 0.004*"come" + 0.003*"release" + 0.003*"think" + 0.003*"make" '
  '+ 0.003*"feel" + 0.003*"cd" + 0.003*"sing" + 0.003*"go" + 0.003*"find"'),
 (2,
  '0.007*"go" + 0.006*"cd" + 0.006*"come" + 0.005*"make" + 0.005*"play" + '
  '0.004*"find" + 0.004*"say" + 0.004*"fan" + 0.003*"want" + 0.003*"single"'),
 (3,
  '0.006*"make" + 0.006*"come" + 0.006*"go" + 0.005*"feel" + 0.005*"play" + '
  '0.005*"take" + 0.004*"cd" + 0.004*"get" + 0.004*"say" + 0.004*"be"'),
 (4,
  '0.005*"make" + 0.005*"play" + 0.005*"cd" + 0.004*"go" + 0.004*"say" + '
  '0.004*"get" + 0.003*"feel" + 0.003*"release" + 0.003*"see" + 0.003*"come"'),
 (5,
  '0.006*"make" + 0.005*"go" + 0.004*"take" + 0.004*"release" + 0.004*"say" + '
  '0.004*"get" + 0.004*"come" + 0.004*"give" + 0.003*"feel" + 0.003*"cd"'),
 (6,
  '0.005*"make" + 0.004*"cd" + 0.004

In [108]:
a3.update_stopwords(get_top_n_words(a3, int(0.3*len(a3.count_vectorizer.get_feature_names()))))

In [110]:
a3.refresh()
a3.build_lda_model();



In [113]:
from gensim.test.utils import datapath
import pickle

In [117]:
with open("test_data.pk", "wb") as f:
    pickle.dump(a3.lda_model, f)

In [119]:
test_lda = pickle.load(open("test_data.pk", "rb"))

In [121]:
pprint(test_lda.print_topics())

[(0,
  '0.019*"cd" + 0.008*"be" + 0.006*"here" + 0.006*"there" + 0.005*"have" + '
  '0.004*"now" + 0.004*"so" + 0.004*"do" + 0.004*"even" + 0.003*"make"'),
 (1,
  '0.020*"be" + 0.014*"cd" + 0.009*"there" + 0.006*"so" + 0.004*"when" + '
  '0.004*"have" + 0.003*"hit" + 0.003*"do" + 0.003*"let" + 0.003*"go"'),
 (2,
  '0.018*"be" + 0.010*"cd" + 0.006*"there" + 0.005*"when" + 0.005*"where" + '
  '0.004*"even" + 0.004*"so" + 0.003*"have" + 0.003*"just" + 0.003*"find"'),
 (3,
  '0.015*"there" + 0.013*"cd" + 0.009*"be" + 0.007*"when" + 0.005*"so" + '
  '0.004*"get" + 0.004*"even" + 0.004*"now" + 0.003*"have" + 0.003*"make"'),
 (4,
  '0.012*"cd" + 0.010*"there" + 0.008*"be" + 0.005*"even" + 0.004*"so" + '
  '0.004*"when" + 0.003*"come" + 0.003*"still" + 0.003*"now" + 0.003*"have"'),
 (5,
  '0.020*"be" + 0.015*"cd" + 0.015*"there" + 0.007*"when" + 0.005*"go" + '
  '0.005*"so" + 0.004*"here" + 0.004*"even" + 0.004*"live" + 0.003*"however"'),
 (6,
  '0.013*"be" + 0.009*"cd" + 0.006*"there" + 0.004

In [125]:
get_bottom_n_words(a3, 100)

['canibus', 'sorprender', 'sorprendente', 'paquin', 'coasters', 'sorgo', 'sorcerer', 'fubert', 'parables', 'fruta', 'paradice', 'sophistipop', 'paradiso', 'paradisum', 'paradoja', 'cobalt', 'frumpy', 'cobbling', 'fuckers', 'coarseness', 'fuckery', 'papydave', 'fuese', 'papercuts', 'sostenuto', 'papered', 'cluttering', 'cmg', 'papitour', 'cnicamente', 'paragons', 'papoose', 'pappalardi', 'coaches', 'coaching', 'sorprendio', 'coalesced', 'pappou', 'coalescing', 'coalition', 'sorriso', 'fugee', 'parakato', 'cobrastyle', 'frontwomen', 'param', 'frontrunner', 'sonne', 'parameter', 'sonidito', 'soni', 'songwritter', 'codify', 'songwritery', 'frontloads', 'coe', 'frontload', 'coexisted', 'songs1', 'songon', 'coffeemaker', 'cocorosie', 'cocooned', 'frosting', 'cocoband', 'coburn', 'frugal', 'paralelas', 'froze', 'cocciante', 'coche', 'frowned', 'cocked', 'cobrasnake', 'sooooooooooooo', 'sooooooooo', 'cockney', 'cocks', 'sooooooo', 'frown', 'cocktails', 'frothing', 'frothier', 'cockeyed', 'song

In [126]:
def get_bottom_n_words_tfidf(input_analyzer, n):
    top_n = []
    feats = input_analyzer.tfidf_vectorizer.get_feature_names()
    w_counts = np.array(np.sum(input_analyzer.get_tfidf_matrix(), axis=0))[0]
    sorted_indices = np.argsort(w_counts)[:n]
    
    for i in sorted_indices:
        top_n.append(feats[i])
    return top_n

In [129]:
get_bottom_n_words_tfidf(a3, 10)

['canibus', 'sorprender', 'sorprendente', 'paquin', 'coasters', 'sorgo', 'sorcerer', 'fubert', 'parables', 'fruta']

In [130]:
get_bottom_n_words(a3, 10)

['canibus', 'sorprender', 'sorprendente', 'paquin', 'coasters', 'sorgo', 'sorcerer', 'fubert', 'parables', 'fruta']

In [133]:
test_n = 1000
len(set(get_bottom_n_words_tfidf(a3, test_n)).intersection(set(get_bottom_n_words(a3, test_n))))

1000

# On just pitchfork

In [162]:
pf_file = "../data/processed/artist_reviews_pf_only.json"

In [163]:
pfanalyzer = ArtistReviewAnalyzer()

In [164]:
pfanalyzer.load_data(pf_file);

In [165]:
pfanalyzer.build_lda_model(num_topics=40);

In [166]:
with open("pf_lda_initial.pk", "xb") as f:
    pickle.dump(pfanalyzer.lda_model, f)

In [167]:
pf_lda_model = pfanalyzer.lda_model
pprint(pf_lda_model.print_topics())

[(0,
  '0.008*"band" + 0.008*"song" + 0.006*"record" + 0.006*"make" + 0.005*"get" + '
  '0.005*"even" + 0.004*"go" + 0.004*"sound" + 0.004*"take" + 0.004*"rock"'),
 (7,
  '0.008*"band" + 0.006*"make" + 0.006*"song" + 0.006*"get" + 0.006*"record" + '
  '0.005*"well" + 0.005*"go" + 0.004*"pop" + 0.004*"good" + 0.004*"time"'),
 (15,
  '0.006*"make" + 0.006*"song" + 0.005*"band" + 0.005*"record" + 0.005*"year" '
  '+ 0.005*"even" + 0.005*"get" + 0.004*"time" + 0.004*"feel" + 0.004*"come"'),
 (27,
  '0.010*"band" + 0.007*"make" + 0.006*"song" + 0.005*"record" + 0.005*"sound" '
  '+ 0.005*"get" + 0.005*"even" + 0.004*"feel" + 0.004*"rock" + 0.004*"pop"'),
 (17,
  '0.009*"make" + 0.006*"get" + 0.006*"record" + 0.005*"even" + 0.004*"way" + '
  '0.004*"come" + 0.004*"time" + 0.004*"well" + 0.004*"band" + 0.004*"take"'),
 (5,
  '0.007*"record" + 0.005*"band" + 0.005*"song" + 0.005*"even" + 0.005*"get" + '
  '0.004*"year" + 0.004*"new" + 0.004*"good" + 0.004*"time" + 0.004*"feel"'),
 (24,
  '0.00

In [171]:
pfanalyzer.build_count_vectorizer(min_df=30,max_df=0.8)
pfanalyzer.build_tfidf_vectorizer(min_df=30,max_df=0.8)
pfanalyzer.get_tfidf_matrix()
pfanalyzer.get_count_matrix()



<1212467x34937 sparse matrix of type '<class 'numpy.int64'>'
	with 1118590 stored elements in Compressed Sparse Row format>

In [173]:
get_top_n_words(pfanalyzer, 100)

['like', 'one', 'band', 'songs', 'even', 'new', 'time', 'rock', 'first', 'record', 'pop', 'still', 'much', 'two', 'way', 'love', 'would', 'best', 'years', 'could', 'tracks', 'sounds', 'back', 'get', 'also', 'never', 'though', 'make', 'good', 'well', 'something', 'guitar', 'work', 'year', 'last', 'live', 'little', 'made', 'might', 'life', 'rap', 'long', 'world', 'albums', 'every', 'voice', 'around', 'old', 'many', 'self', 'come', 'single', 'people', 'take', 'ever', 'since', 'know', 'man', 'three', 'better', 'great', 'enough', 'makes', 'always', 'big', 'set', 'less', 'hard', 'really', 'young', 'another', 'feel', 'go', 'thing', 'without', 'yet', 'almost', 'end', 'lyrics', 'point', 'got', 'title', 'often', 'full', 'early', 'release', 'right', 'seems', 'kind', 'sense', 'half', 'records', 'feels', 'things', 'career', 'may', 'part', 'second', 'group', 'debut']

In [174]:
get_bottom_n_words(pfanalyzer, 100)

['preme', 'resequenced', 'ledbetter', 'chocha', 'reservoirs', 'chivalry', 'chitlins', 'chitlin', 'elt', 'reshaped', 'reshapes', 'elucidate', 'reshuffled', 'residual', 'chipmunks', 'residuals', 'elusiveness', 'resiliency', 'resin', 'eluvium', 'lebanese', 'learner', 'resourcefulness', 'respecting', 'chimed', 'leaped', 'chocolates', 'resentful', 'resented', 'leeroy', 'elitism', 'chording', 'leithauser', 'chorales', 'elitists', 'chopsticks', 'chopsquad', 'republicans', 'elixir', 'elizabethtown', 'choppers', 'leigh', 'emailed', 'repulsed', 'ell', 'lego', 'choosin', 'reputed', 'ellipsis', 'rerecorded', 'rerelease', 'leftrightleftrightleft', 'chokeholds', 'chokehold', 'researched', 'resemblances', 'elkland', 'reprisal', 'leant', 'restaging', 'retailing', 'retell', 'retirements', 'retool', 'layla', 'retorts', 'retrace', 'retracing', 'lawless', 'retracting', 'lavelle', 'chesney', 'cheryl', 'retreading', 'retrieving', 'cherries', 'cherrelle', 'laureates', 'launay', 'embraceable', 'cheri', 'cherc

In [175]:
len(pfanalyzer.count_vectorizer.get_feature_names())

34937

In [180]:
from singernlp.ArtistReviewAnalyzer import ArtistReviewAnalyzer

In [176]:
pfanalyzer.update_stopwords(get_top_n_words(pfanalyzer, 1000))

In [177]:
pfanalyzer.refresh()



In [181]:
pfanalyzer.build_lda_model(num_topics=40)
pprint(pfanalyzer.lda_model.print_topics())

[(10,
  '0.005*"be" + 0.002*"there" + 0.002*"even" + 0.001*"so" + 0.001*"when" + '
  '0.001*"love" + 0.001*"get" + 0.001*"keep" + 0.001*"back" + 0.001*"rise"'),
 (22,
  '0.004*"be" + 0.002*"there" + 0.002*"when" + 0.002*"even" + 0.001*"add" + '
  '0.001*"love" + 0.001*"go" + 0.001*"drop" + 0.001*"provide" + 0.001*"let"'),
 (13,
  '0.004*"be" + 0.002*"when" + 0.002*"let" + 0.002*"there" + 0.002*"so" + '
  '0.001*"get" + 0.001*"here" + 0.001*"even" + 0.001*"drop" + 0.001*"grow"'),
 (35,
  '0.006*"be" + 0.002*"there" + 0.002*"when" + 0.002*"even" + 0.002*"love" + '
  '0.001*"speak" + 0.001*"so" + 0.001*"let" + 0.001*"include" + 0.001*"get"'),
 (29,
  '0.005*"be" + 0.002*"when" + 0.002*"even" + 0.001*"get" + 0.001*"so" + '
  '0.001*"add" + 0.001*"there" + 0.001*"speak" + 0.001*"keep" + 0.001*"tell"'),
 (36,
  '0.004*"be" + 0.003*"there" + 0.002*"when" + 0.002*"so" + 0.001*"even" + '
  '0.001*"let" + 0.001*"drop" + 0.001*"add" + 0.001*"provide" + 0.001*"die"'),
 (19,
  '0.003*"be" + 0.002*"

In [183]:
id2word = corpora.Dictionary(pfanalyzer.lemmatized_text)
doc_lda = pfanalyzer.lda_model[[id2word.doc2bow(w) for w in pfanalyzer.lemmatized_text]]

In [187]:
topics = pfanalyzer.lda_model.print_topics()

In [189]:
all_topics = []
for _,s in topics:
    all_topics.append(re.findall(r'\"[a-z]+\"', s))

In [192]:
pprint(all_topics)

[['"be"',
  '"when"',
  '"there"',
  '"add"',
  '"even"',
  '"so"',
  '"let"',
  '"get"',
  '"speak"',
  '"grow"'],
 ['"be"',
  '"when"',
  '"let"',
  '"there"',
  '"so"',
  '"get"',
  '"here"',
  '"even"',
  '"drop"',
  '"grow"'],
 ['"be"',
  '"when"',
  '"there"',
  '"get"',
  '"provide"',
  '"love"',
  '"serve"',
  '"even"',
  '"reveal"',
  '"shift"'],
 ['"be"',
  '"there"',
  '"so"',
  '"when"',
  '"even"',
  '"love"',
  '"drop"',
  '"fall"',
  '"name"',
  '"get"'],
 ['"be"',
  '"there"',
  '"when"',
  '"love"',
  '"let"',
  '"even"',
  '"so"',
  '"drop"',
  '"serve"',
  '"know"'],
 ['"be"',
  '"there"',
  '"even"',
  '"when"',
  '"so"',
  '"get"',
  '"serve"',
  '"happen"',
  '"drop"',
  '"cash"'],
 ['"be"',
  '"there"',
  '"when"',
  '"even"',
  '"get"',
  '"so"',
  '"add"',
  '"serve"',
  '"let"',
  '"speak"'],
 ['"be"',
  '"there"',
  '"when"',
  '"even"',
  '"love"',
  '"speak"',
  '"so"',
  '"let"',
  '"include"',
  '"get"'],
 ['"be"',
  '"when"',
  '"so"',
  '"there"',
  '"e

# Attempt 2 ... with pitchfork

In [193]:
from singernlp.ArtistReviewAnalyzer import ArtistReviewAnalyzer