# ArtistReviewAnalyzer Class Tester

I was too lazy to run the module files directly, so I'm pasting their contents into Jupyter and running them here instead.

In [14]:
from pprint import pprint

## helper_functions.py

In [1]:
import json

import pandas as pd
import sqlite3

import re
import unidecode as ud


def clean_str(input_artist, cap_code=0):
    """
    Takes in string, returns a unicode-friendly and stripped version of the string.

    Control characters (newline, carriage return, tab, NUL) are turned into
    spaces, the text is transliterated with ``unidecode``, runs of whitespace
    are collapsed to a single space and leading/trailing whitespace is removed.

    Input:
     - input_artist: the string to clean
     - cap_code: -1 lower-cases the result, 1 upper-cases it; any other value
       (default 0) leaves the case untouched
    """
    # === REGEX REPLACE ===
    # NUL is not matched by \s, so map the control characters to spaces first.
    # No '|' inside the class: within [...] it is a literal pipe, not "or".
    cleaned = re.sub(r'[\n\r\t\0]+', ' ', input_artist)

    # === UNICODE HANDLING ===
    # Done before whitespace normalization because the transliterated text
    # may itself contain extra spaces.
    cleaned = ud.unidecode(cleaned)

    # === WHITESPACE NORMALIZATION ===
    # Collapse runs of whitespace, then drop whitespace at beg/end of string.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    if cap_code == -1:
        return cleaned.lower()
    if cap_code == 1:
        return cleaned.upper()
    return cleaned


def read_mard_json(input_filename):
    """
    Takes in file name of a JSON-lines file (one JSON document per line),
    returns a list of dictionaries in which each element is a row with
    columns (as the keys).

    Blank and whitespace-only lines are skipped. The file is read as UTF-8.

    Input:
     - input_filename: the file location of the JSON-lines file
    """
    # Iterate line by line instead of reading the whole file into one string;
    # the `with` block closes the file, so no explicit close() is needed.
    with open(input_filename, 'r', encoding='utf-8') as file_:
        return [json.loads(line) for line in file_ if line.strip()]


def read_mard_json_as_df(input_filename):
    """
    Takes in file name of a JSON-lines file (one JSON document per line),
    returns a Pandas DataFrame in which each parsed line is a row and the
    JSON keys are the columns.

    Blank and whitespace-only lines are skipped. The file is read as UTF-8.

    Input:
     - input_filename: the file location of the JSON-lines file
    """
    # The `with` block closes the file, so no explicit close() is needed.
    with open(input_filename, 'r', encoding='utf-8') as file_:
        rows_ = [json.loads(line) for line in file_ if line.strip()]
    return pd.DataFrame(rows_)


def run_query_on_sqlite_db(input_query, input_filename):
    """
    Returns a Pandas DataFrame object containing the query results,
    given the user's query and the filename for the sqlite database.

    The connection is closed even when the query raises.

    Input:
     - input_query: string representation of the SQL query to run on the sqlite db
     - input_filename: the file location of the sqlite database

    """
    conn_ = sqlite3.connect(input_filename)
    try:
        return pd.read_sql_query(input_query, conn_)
    finally:
        # Runs on success and on error, so the connection never leaks.
        conn_.close()


## ArtistReviewAnalyzer.py

In [17]:

import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import spacy


class ArtistReviewAnalyzer:
    """
    Analyzes artist reviews. Forms into useful data structures for NLP and topic modeling.

    Typical use: construct, optionally call ``update_stopwords``, then
    ``load_data`` followed by ``build_lda_model``.
    """
    def __init__(self):
        # Basic data
        self.file_loc = None      # path last passed to load_data
        self.raw = None           # dict loaded from the JSON file: artist -> review strings
        self.artists_list = None  # keys of self.raw, in load order

        # Stop-Words for tokenization
        self.stop_words = stopwords.words('english')  # Basic stop words
        self.update_stopwords(['album', 'music', 'cd', 'track', 'song', 'sound'])

        # Vectorizers
        self.tfidf_vectorizer = None
        self.count_vectorizer = None
        self.tfidf_matrix = None
        self.count_matrix = None
        self.__tokenizer = None
        self.tokenized_reviews = None  # one token list per artist, stop words removed

        # NLP tools
        self.lda_model = None
        self.bigrams = None
        self.lemmatized_text = None
        self.nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])  # Spacy tool

    def load_data(self, file_loc):
        """
        Loads reviews data (JSON file) into a dictionary, then builds the
        tokenized, stop-word-filtered reviews. Returns the loaded dictionary.
        """
        # Save file location
        self.file_loc = file_loc

        # The `with` block closes the file; no explicit close() is needed.
        with open(file_loc, 'r', encoding='utf-8') as file:
            self.raw = json.load(file)  # Save raw dictionary

        self.artists_list = list(self.raw.keys())  # Save artists list

        self.refresh()

        return self.raw

    def refresh(self):
        """
        Rebuilds everything derived from the loaded data and the current stop
        words: the tokenizer (if missing), the tokenized reviews, and the
        cached count/tf-idf matrices (cleared so they are recomputed on demand).
        """
        # Only build when missing, so a refresh does not trigger the
        # "already been built" warning.
        if self.count_vectorizer is None:
            self.build_count_vectorizer(2, 0.8)  # Build count vec so we have a tokenizer

        # Handling tokenized list of reviews
        self.tokenized_reviews = self.__build_reviews_list(do_tokenize=True)
        self.remove_stopwords_from_tokenized_list()

        # Cached matrices were computed from the previous data / stop words.
        self.count_matrix = None
        self.tfidf_matrix = None

    def build_count_vectorizer(self, min_df, max_df):
        """
        Builds the count vectorizer (and the tokenizer derived from it) once.
        If one already exists, prints a warning and returns the existing one.
        """
        if self.count_vectorizer is not None:
            print("WARNING: Count vectorizer has already been built.")
            return self.count_vectorizer
        self.count_vectorizer = CountVectorizer(analyzer='word', stop_words=self.stop_words, min_df=min_df, max_df=max_df)
        self.__tokenizer = self.count_vectorizer.build_tokenizer()
        return self.count_vectorizer

    def get_count_matrix(self):
        """
        Returns the count matrix with one row per artist, fitting it on the
        cleaned reviews on first use. Returns None if the count vectorizer
        has not been built.
        """
        if self.count_vectorizer is None:
            return None

        if self.count_matrix is None:
            # Fit on the cleaned per-artist review text. Passing self.raw (a
            # dict) would iterate its keys, i.e. the artist names.
            self.count_matrix = self.count_vectorizer.fit_transform(self.__build_reviews_list())
        return self.count_matrix

    def build_tfidf_vectorizer(self, min_df, max_df):
        """
        Builds (or replaces) the tf-idf vectorizer and returns it.
        """
        self.tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=self.stop_words, min_df=min_df, max_df=max_df)
        self.tfidf_matrix = None  # any cached matrix belongs to the old vectorizer
        return self.tfidf_vectorizer

    def get_tfidf_matrix(self):
        """
        Returns the tf-idf matrix with one row per artist, fitting it on the
        cleaned reviews on first use. Returns None if the tf-idf vectorizer
        has not been built.
        """
        if self.tfidf_vectorizer is None:
            return None

        if self.tfidf_matrix is None:
            # Stored in tfidf_matrix (not count_matrix) and fitted on the
            # cleaned per-artist review text rather than the dict's keys.
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.__build_reviews_list())
        return self.tfidf_matrix

    def tokenize(self, input_string):
        """
        Splits a string into tokens with the count vectorizer's tokenizer.
        Prints a warning and returns None if the tokenizer is not set yet.
        """
        if self.__tokenizer is None:
            print("WARNING: Tokenizer not set.")
            return None
        return self.__tokenizer(input_string)

    def update_stopwords(self, word_list):
        """
        Appends the given words to the stop-word list (in place).
        Call refresh() afterwards if data has already been loaded.
        """
        self.stop_words.extend(word_list)
        return None

    def remove_stopwords_from_tokenized_list(self):
        """
        Drops stop words from every tokenized review, ignoring case.
        """
        # Tokens keep their original case while the stop words are lower-case,
        # so compare case-insensitively; a set gives O(1) membership tests.
        stop_set = {w.lower() for w in self.stop_words}
        self.tokenized_reviews = \
            [[w for w in artist_review if w.lower() not in stop_set] for artist_review in self.tokenized_reviews]

    def build_lda_model(self, num_topics=20):
        """
        Builds bigrams, lemmatizes them, and fits a gensim LDA model with
        `num_topics` topics. Returns the fitted model.
        """
        self.bigrams = self.make_bigrams()
        # Lemmatize the bigram-merged tokens so the detected phrases actually
        # reach the model instead of being computed and discarded.
        self.lemmatized_text = self.lemmatize(texts=self.bigrams)
        id2word = corpora.Dictionary(self.lemmatized_text)
        corpus = [id2word.doc2bow(w) for w in self.lemmatized_text]
        self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
        return self.lda_model

    def make_bigrams(self):
        """
        Returns the tokenized reviews with detected bigram phrases merged.
        """
        bg = gensim.models.Phrases(self.tokenized_reviews, min_count=5, threshold=100)
        bg_mod = gensim.models.phrases.Phraser(bg)
        return [bg_mod[d] for d in self.tokenized_reviews]

    def lemmatize(self, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), texts=None):
        """
        Returns, for each token list, the lemmas of the tokens whose
        part-of-speech tag is in `allowed_postags`.

        Input:
         - allowed_postags: part-of-speech tags to keep
         - texts: list of token lists to lemmatize; defaults to
           self.tokenized_reviews
        """
        if texts is None:
            texts = self.tokenized_reviews

        texts_out = []
        for review_words in texts:
            joined_words = self.nlp(" ".join(review_words))
            texts_out.append([token.lemma_ for token in joined_words if token.pos_ in allowed_postags])
        return texts_out

    def get_all_words(self, do_tokenize=False):
        """
        Returns the cleaned text of all reviews joined into one string, or
        the list of its tokens when `do_tokenize` is True.
        """
        # Join the review text; joining self.raw directly would join the
        # dict's keys (artist names) instead.
        all_text = " ".join(self.__build_reviews_list())
        return self.tokenize(all_text) if do_tokenize else all_text

    def __build_reviews_list(self, do_tokenize=False):
        """
        Builds a list of reviews for each artist: each artist's reviews are
        joined with spaces and cleaned, and tokenized when `do_tokenize` is True.
        """
        consolidated_reviews = [clean_str(" ".join(self.raw[a])) for a in self.artists_list]
        if do_tokenize:
            return [self.tokenize(text) for text in consolidated_reviews]
        return consolidated_reviews


## Test runs

In [4]:
# Location of the processed artist-reviews JSON, given as a relative path.
json_file_location = "../data/processed/artist_reviews.json"

In [8]:
# First run: analyzer with its built-in stop-word list only.
analyzer = ArtistReviewAnalyzer()

In [10]:
# Load the reviews JSON and build the tokenized reviews; the trailing ';'
# keeps the notebook from echoing the returned dictionary.
analyzer.load_data(json_file_location);

In [12]:
# Fit the LDA model with the default of 20 topics; the trailing ';' hides the
# returned model's repr in the notebook.
analyzer.build_lda_model();

In [15]:
# Pretty-print the weighted terms of the fitted model's topics.
pprint(analyzer.lda_model.print_topics())

[(0,
  '0.012*"quot" + 0.008*"good" + 0.007*"make" + 0.006*"song" + 0.006*"well" + '
  '0.005*"great" + 0.005*"even" + 0.005*"hear" + 0.005*"work" + 0.004*"first"'),
 (1,
  '0.008*"get" + 0.008*"good" + 0.008*"great" + 0.007*"song" + 0.006*"well" + '
  '0.006*"record" + 0.005*"first" + 0.005*"love" + 0.005*"make" + '
  '0.005*"band"'),
 (2,
  '0.010*"quot" + 0.008*"song" + 0.007*"great" + 0.007*"good" + 0.006*"well" + '
  '0.006*"get" + 0.006*"record" + 0.005*"time" + 0.005*"make" + 0.005*"band"'),
 (3,
  '0.011*"song" + 0.009*"good" + 0.008*"love" + 0.008*"make" + 0.007*"get" + '
  '0.006*"time" + 0.006*"great" + 0.005*"cd" + 0.005*"well" + 0.005*"hear"'),
 (4,
  '0.008*"make" + 0.006*"record" + 0.006*"band" + 0.006*"good" + 0.005*"well" '
  '+ 0.005*"go" + 0.005*"time" + 0.004*"get" + 0.004*"song" + 0.004*"first"'),
 (5,
  '0.008*"make" + 0.007*"song" + 0.006*"get" + 0.006*"well" + 0.006*"good" + '
  '0.006*"quot" + 0.006*"band" + 0.006*"time" + 0.005*"record" + 0.004*"say"'),
 (6,
 

In [18]:
# Second run: start from a fresh analyzer and, before loading the data, add
# terms that dominated the topics printed by the first run as stop words.
analyzer = ArtistReviewAnalyzer()
analyzer.update_stopwords(['good', 'band', 'record', 'make', 'great', 'hear'])
analyzer.load_data(json_file_location);

In [19]:
# Refit with 40 topics instead of the default 20; ';' hides the returned model.
analyzer.build_lda_model(num_topics=40);

In [20]:
# Pretty-print the weighted terms of the topics from the second run.
pprint(analyzer.lda_model.print_topics())

[(10,
  '0.006*"even" + 0.005*"song" + 0.005*"get" + 0.005*"time" + 0.004*"well" + '
  '0.004*"work" + 0.004*"make" + 0.004*"sound" + 0.004*"come" + 0.004*"go"'),
 (22,
  '0.008*"get" + 0.006*"time" + 0.006*"song" + 0.006*"go" + 0.005*"quot" + '
  '0.005*"first" + 0.005*"well" + 0.004*"love" + 0.004*"make" + 0.004*"good"'),
 (39,
  '0.007*"song" + 0.006*"get" + 0.005*"come" + 0.005*"time" + 0.005*"new" + '
  '0.004*"play" + 0.004*"first" + 0.004*"love" + 0.004*"work" + 0.004*"good"'),
 (1,
  '0.010*"song" + 0.008*"quot" + 0.007*"well" + 0.006*"time" + 0.006*"get" + '
  '0.005*"go" + 0.005*"love" + 0.005*"first" + 0.004*"come" + 0.004*"year"'),
 (38,
  '0.008*"song" + 0.006*"go" + 0.005*"even" + 0.005*"first" + 0.005*"make" + '
  '0.004*"well" + 0.004*"work" + 0.004*"get" + 0.004*"be" + 0.004*"voice"'),
 (11,
  '0.008*"get" + 0.007*"song" + 0.006*"love" + 0.006*"quot" + 0.005*"make" + '
  '0.005*"well" + 0.005*"time" + 0.004*"find" + 0.004*"go" + 0.004*"work"'),
 (12,
  '0.008*"quot" + 