## 1. Data Collection and Cleaning

### 1.1 Import Libraries and Packages

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob, Word
from imblearn.over_sampling import ADASYN
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from datetime import datetime
from sklearn.svm import SVC
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from tqdm import tqdm

%matplotlib inline

### 1.2 Load Metacritic Album Review Data

__This data set consists of user reviews that were web scraped from the albums listed on www.metacritic.com. Please refer to the "Metacritic Scraper" notebook in this respository for the code used to collect this data.__



In [2]:
df = pd.read_csv('critic_reviews.csv', engine='python')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,label,release_date,metascore,user_score,genre,summary,name,date,rating,review
0,0,\n\nAgricultural Tragic\n\n,Corb Lund,New West,"Jun 26, 2020",80,\ntbd\n,Country,The latest full-length release for the Canadia...,Exclaim,"Jun 30, 2020",90,\n Despite its ...
1,1,\n\nAgricultural Tragic\n\n,Corb Lund,New West,"Jun 26, 2020",80,\ntbd\n,Country,The latest full-length release for the Canadia...,Mojo,"Jul 21, 2020",80,\n Highlights i...
2,2,\n\nAgricultural Tragic\n\n,Corb Lund,New West,"Jun 26, 2020",80,\ntbd\n,Country,The latest full-length release for the Canadia...,Glide Magazine,"Jun 30, 2020",80,\n Agricultural...
3,3,\n\nAgricultural Tragic\n\n,Corb Lund,New West,"Jun 26, 2020",80,\ntbd\n,Country,The latest full-length release for the Canadia...,AllMusic,"Jun 30, 2020",80,\n All of these...
4,4,\n\nAgricultural Tragic\n\n,Corb Lund,New West,"Jun 26, 2020",80,\ntbd\n,Country,The latest full-length release for the Canadia...,PopMatters,"Jul 16, 2020",70,\n At times Agr...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110836 entries, 0 to 110835
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    110836 non-null  int64 
 1   title         110836 non-null  object
 2   artist        110836 non-null  object
 3   label         110423 non-null  object
 4   release_date  110836 non-null  object
 5   metascore     110836 non-null  int64 
 6   user_score    110836 non-null  object
 7   genre         110836 non-null  object
 8   summary       109147 non-null  object
 9   name          110836 non-null  object
 10  date          43892 non-null   object
 11  rating        110836 non-null  int64 
 12  review        110836 non-null  object
dtypes: int64(3), object(10)
memory usage: 11.0+ MB


### 1.3 Data Cleaning

In [5]:
df = df[df['title'].notna()]

In [6]:
df = df.drop(columns = ['Unnamed: 0'])

In [7]:
def dropn(x):
    return x.replace('\n', '')

In [8]:
df['title'] = df['title'].apply(dropn)
df['user_score'] = df['user_score'].apply(dropn)
df['name'] = df['name'].apply(dropn)
df['review'] = df['review'].apply(dropn)

In [9]:
df['release_date'] = pd.to_datetime(df['release_date'])
df['date'] = pd.to_datetime(df['date'])

In [10]:
df = df.sort_values(['genre', 'release_date'], ascending = (True,False)).reset_index(drop = True)

In [11]:
df.head()

Unnamed: 0,title,artist,label,release_date,metascore,user_score,genre,summary,name,date,rating,review
0,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Mojo,2011-09-27,80,Like all the b...
1,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Boston Globe,2011-09-22,80,Jones furthers...
2,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,AllMusic,2011-09-22,80,If Jones' perf...
3,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Drowned In Sound,2011-09-22,80,The Wanting bo...
4,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,PopMatters,2011-09-22,80,This is music ...


In [12]:
def clean_rev(text):
    text = text.str.replace("<br/>", "")
    text = text.str.replace("'", '')
    text = text.str.replace("-", '')
    text = text.str.replace('(<a).*(>).*(</a>)', '')
    text = text.str.replace('&amp', '')
    text = text.str.replace('&gt', '')
    text = text.str.replace('&lt', '')
    text = text.str.replace('\xa0', ' ')
    text = text.str.replace('[^\w\s]', ' ')
    text = text.str.replace('[0-9]', ' ')
    text = text.str.lower() 
    return text


In [13]:
df['clean_review'] = clean_rev(df['review'])

## 2. Preprocessing & Feature Engineering

In [14]:
stopwords_list=stopwords.words('english')+list(string.punctuation)+['album','albums', 'songs', 'song', 'music', 'like', 'one']

In [15]:
df['clean_review'] = df['clean_review'].apply(lambda text_list: ' '.join([x for x in text_list.split() if x not in stopwords_list]))

In [16]:
df['length'] = df['review'].astype(str).apply(len)
df['word_count'] = df['review'].apply(lambda x: len(str(x).split()))

Classifying review sentiment using Vader

In [17]:
analyzer = SentimentIntensityAnalyzer()


In [18]:
df['sentiment'] = [analyzer.polarity_scores(x)['compound'] for x in df['clean_review']]
df['negative'] = [analyzer.polarity_scores(x)['neg'] for x in df['clean_review']]
df['neutral'] = [analyzer.polarity_scores(x)['neu'] for x in df['clean_review']]
df['positive'] = [analyzer.polarity_scores(x)['pos'] for x in df['clean_review']]

In [19]:
df['sent_class'] = np.where(df['sentiment'] >= 0.05, 1, df['sentiment'])
df['sent_class'] = np.where((df['sentiment'] > -0.05) & (df['sentiment'] < 0.05), 0, df['sent_class'])
df['sent_class'] = np.where(df['sentiment'] <= -0.05, -1, df['sent_class'])

In [20]:
df.genre.value_counts()

Indie                29214
Rock                 21466
Electronic           20329
Rap                  13763
Country               5060
Pop                   4718
R&B                   4632
Folk                  3760
Jazz                  2150
Dance                 1437
Experimental           664
Alternative            583
Blues                  530
Alt-Country            328
Bluegrass              315
Reggae                 312
Punk                   267
World                  245
Latin                  222
Soul                   160
Soundtrack             158
Metal                  123
Adult Alternative      102
Comedy                  81
House                   55
Singer-Songwriter       51
Classical               31
Electronica             19
Avant-Garde             16
Psychedelic             12
Adult-Alternative       11
Trip-Hop                10
Live                     8
Singer/Songwriter        4
Name: genre, dtype: int64

Lemmatizing with spaCy

In [21]:
sp = spacy.load('en_core_web_md')
lookups = Lookups()
lemm = Lemmatizer(lookups)

In [22]:
def lem_function(text):
    dummy = []
    for word in sp(text):
        dummy.append(word.lemma_)
    return ' '.join(dummy)

In [23]:
tqdm.pandas()

  from pandas import Panel


In [24]:
df['sp_lm'] = df['clean_review'].progress_apply(lambda x: lem_function(x))

100%|██████████| 110836/110836 [14:20<00:00, 128.74it/s]


In [25]:
df['sp_lm'] = df['sp_lm'].progress_apply(lambda x: x.replace('-PRON-', ' '))

100%|██████████| 110836/110836 [00:00<00:00, 820235.37it/s]


In [26]:
df.head()

Unnamed: 0,title,artist,label,release_date,metascore,user_score,genre,summary,name,date,...,review,clean_review,length,word_count,sentiment,negative,neutral,positive,sent_class,sp_lm
0,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Mojo,2011-09-27,...,Like all the b...,best dreams oct p,103,8,0.7845,0.0,0.225,0.775,1.0,good dream oct p
1,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Boston Globe,2011-09-22,...,Jones furthers...,jones furthers exploratory path hes committed ...,351,43,0.9274,0.078,0.493,0.428,1.0,jones further exploratory path s commit tran...
2,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,AllMusic,2011-09-22,...,If Jones' perf...,jones performances compositions years touched ...,372,56,0.4767,0.0,0.812,0.188,1.0,jones performances composition year touch them...
3,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,Drowned In Sound,2011-09-22,...,The Wanting bo...,wanting boasts technical excellence cosy welco...,197,20,0.8932,0.0,0.457,0.543,1.0,want boast technical excellence cosy welcoming...
4,The Wanting,Glenn Jones,Thrill Jockey,2011-09-13,77,tbd,Adult Alternative,The fourth solo acoustic album for the guitari...,PopMatters,2011-09-22,...,This is music ...,makes space creates landscape invites,151,17,0.2732,0.0,0.656,0.344,1.0,make space create landscape invite
