In [34]:
import pandas as pd
import sqlite3
import pickle as pk
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


### Aim: Predict the rating category of a song review from the review text

- 8.5 to 10 (both inclusive): Amazing
- 5.5 up to, but not including, 8.5: Good
- 0 up to, but not including, 5.5: Bad




In [35]:
class Score:
    '''
    Namespace holding the string labels a review score can be mapped to.

    This is a plain class with string constants rather than an enum.Enum,
    so each member is an ordinary str and compares equal to the same text
    stored in a DataFrame column.
    '''
    # Listed from lowest to highest rating.
    BAD = "BAD"
    # label_score in this notebook never returns ALRIGHT.
    ALRIGHT = "ALRIGHT"
    GOOD = "GOOD"
    AMAZING = "AMAZING"

def label_score(score):
    '''
    Map a numeric review score to its Score label.

        8.5 <= score <= 10   ->  Score.AMAZING
        5.5 <= score <  8.5  ->  Score.GOOD
               score <  5.5  ->  Score.BAD   (no lower bound is enforced)

    Returns None when no band matches, i.e. for a score above 10 or for
    NaN (every comparison with NaN is False).
    '''
    if 8.5 <= score <= 10:
        return Score.AMAZING
    elif 5.5 <= score < 8.5:
        return Score.GOOD
    elif score < 5.5:
        return Score.BAD
    # Out-of-range or NaN score: no label.
    return None

def wordRemoval(text, wordList):
    '''
    Remove every token of a string that appears in a list of words.

    The text is split into tokens with nltk's word_tokenize, tokens found
    in wordList are dropped, and the remaining tokens are joined back
    together with single spaces (so the original spacing is not preserved).
    Matching is exact and case-sensitive.

    Parameters:
        text: the string to clean.
        wordList: iterable of the words (str) to remove.

    Returns:
        A new string built from the tokens that were kept.
    '''
    # Build the lookup once per call: membership in a set is a hash lookup,
    # whereas `in` on a list rescans the whole list for every token.
    wordsToRemove = frozenset(wordList)

    keptWords = (word for word in word_tokenize(text) if word not in wordsToRemove)
    return ' '.join(keptWords)
    

#### Load File and Add Labels

In [36]:
# Open the review database; `finally` closes the connection even if the query raises.
dat = sqlite3.connect('./Music/database.sqlite')
try:
    # One row per (review, genre row, year row, content row) match.
    # NOTE(review): if genres or years hold several rows for one reviewid, these
    # joins repeat the same review text; copies could then end up in both the
    # training and the test split - confirm against the database schema.
    data_file = pd.read_sql_query(
        "SELECT title, artist, score, genre, year, content FROM reviews "
        "JOIN genres ON reviews.reviewid = genres.reviewid "
        "JOIN years ON reviews.reviewid = years.reviewid "
        "JOIN content ON reviews.reviewid = content.reviewid",
        dat,
    )
finally:
    dat.close()

# dropna returns a new DataFrame, so the result has to be assigned to take effect.
# how="all" only drops rows in which every column is missing.
data_file = data_file.dropna(how = "all")

# Categorical target: the Score label derived from the numeric score.
data_file['label'] = data_file['score'].apply(label_score)

#### Make a dataset with an equal number of reviews per label

In [37]:
# Labels to balance, in the order their rows are stacked in data_equalized.
balanced_labels = [Score.BAD, Score.GOOD, Score.AMAZING]

# Rows of the full dataset, grouped per label.
rows_by_label = [data_file.loc[data_file['label'] == label] for label in balanced_labels]

# Size every class to the smallest one instead of hard-coding that class's row count.
samples_per_label = min(len(rows) for rows in rows_by_label)

# Keep the first samples_per_label rows of each label (no shuffling) and stack
# them with a single concat.
data_equalized = pd.concat([rows.iloc[:samples_per_label] for rows in rows_by_label])

### Process "content"

Remove common words

In [68]:
N_COMMON_WORDS = 15  # how many of the most frequent words are stripped from the reviews

# Keep an untouched copy of the review text the first time this cell runs, so that
# re-running the cell starts from the raw reviews again instead of stripping another
# N_COMMON_WORDS words from text that was already processed.
if 'content_raw' not in data_equalized.columns:
    data_equalized['content_raw'] = data_equalized['content']

# Word frequencies are counted on the lower-cased, whitespace-split raw text.
commonWords = (
    pd.Series(' '.join(data_equalized['content_raw']).lower().split())
    .value_counts()
    .head(N_COMMON_WORDS)
)
wordList = commonWords.index.tolist()

# NOTE(review): wordList is lower-case while wordRemoval compares tokens verbatim,
# so capitalised occurrences of these words stay in the text - confirm this is intended.
data_equalized['content'] = data_equalized['content_raw'].apply(wordRemoval, args = (wordList,))


#### Data File Stats

In [63]:

# Per-label review counts: the full dataset first, then the equalized subset.
label_titles = (("Bad", 'BAD'), ("Good", 'GOOD'), ("Amazing", 'AMAZING'))

for frame in (data_file, data_equalized):
    for title, label in label_titles:
        matching_rows = frame[frame['label'] == label]
        print(f"Number of {title} Reviews: ", len(matching_rows))

Number of Bad Reviews:  2494
Number of Good Reviews:  19021
Number of Amazing Reviews:  2118
Number of Bad Reviews:  2118
Number of Good Reviews:  2118
Number of Amazing Reviews:  2118


In [69]:
# Creating features
from sklearn.model_selection import train_test_split

# Columns used as model input and as prediction target.
feature_cols = ['content']
label_cols = ['label']

# Hold out 33% of the balanced data for testing; the fixed random_state
# makes the split repeatable.
training, test = train_test_split(data_equalized, random_state = 42, test_size = 0.33)

# Selecting with a list of columns keeps each result a DataFrame.
train_x, train_y = training[feature_cols], training[label_cols]
test_x, test_y = test[feature_cols], test[label_cols]

Vectorize reviews

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Review text of each split.
train_text = train_x['content']
test_text = test_x['content']

# Turns review text into numeric vectors.
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training reviews only ...
train_x_vectors = vectorizer.fit_transform(train_text)
# ... and merely applied to the test reviews (transform, no fit).
test_x_vectors = vectorizer.transform(test_text)

In [71]:
from sklearn import svm

# Support vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel = 'linear')

# train_y is a one-column DataFrame; flatten it to a 1-D array of labels for fit().
train_labels = train_y.to_numpy().ravel()
clf_svm.fit(train_x_vectors, train_labels)

# Sanity check: predicted label for the first test review.
first_test_review = test_x_vectors[0]
clf_svm.predict(first_test_review)

array(['GOOD'], dtype=object)

#### Tests and Scores

In [77]:
custom_reviews = ["Porter Robinson’s music has often treasured escapism. Whether making ecstatic dance or cinematic pop, the producer has long imagined his work as a portal to other realms, as if, through enough head-spinning drops and intricate synth programming, he could create a better world. His debut full-length album, 2014’s Worlds, was explicitly about the transportive power of fiction, the way that art can take us into an “imaginary universe.” And in the video for his Madeon collaboration “Shelter,” a father creates a bright, beautiful simulation for his daughter to live in while the real world around her crumbles and burns. It’s a sentimental idea, but a powerful one. In a lonely world, who couldn’t use an escape?But seven years of creative blocks, self-doubt, and mental-health struggles have altered Robinson’s perspective. His second album Nurture explores the difficulty of finding fulfillment and plumbs the joyful realization that the world he wanted to create was always right in front of him. The album’s tagline underscores its aims succinctly: “Everything we need is already here.”Robinson sings with a newfound clarity on Nurture, writing directly about his struggles and the ecstatic realizations that have come from hard times. “Look at the Sky,” the first vocal track after an ambient piano-led opener, is a ballad of hard-won optimism. Over the kaleidoscopic chirps of a handful of interlocking synth parts, he sings affirmations, looking ahead to a better future. “Look at the sky, I’m still here,” he sings with quiet confidence. “I’ll be alive next year. I can make something good.”This disposition runs through the record. On “Musician,” Robinson mulls the difficulty of making art during tough times, flickering between despair and reassurance: “I just can't stop, I'm sorry,” he sings, then: “I can feel a new day dawning.” Nurture feels careful to mirror the twists and turns of his headspace. 
There’s murky, tone-setting ambient pieces (“Lifelike”), blissed-out digital shoegaze (the Totally Enormous Extinct Dinosaurs collaboration “Unfold”), and anxious electronic collages (“dullscythe”) all interspersed between euphoric pop songs like “Get Your Wish” and “Something Comforting.” Even when a song is bright and buoyant, there’s a restlessness beneath, a sense that the feeling he’s describing might only last for a moment—that any solid comfort might just slip through your fingers.Across the record, Robinson distorts, pitch-shifts, and otherwise mutates his vocals, which is meant to add a layer of what he calls “corruption and artificiality” to the songs. The technique introduces a note of lingering doubt into songs about beauty and hope. On the closing track “Trying to Feel Alive,” he emerges from the fog with the realization that struggle gives life its color in the first place. “Maybe I don’t really need to feel satisfied,” he sings. “Maybe it's a gift that I spend all this time just trying to feel alive.”"]
# Vectorize the custom review(s) with the existing vectorizer (transform only, no re-fit).
custom_tests = vectorizer.transform(custom_reviews)

# Predict a label for each custom review; as the last expression of the cell it is displayed.
clf_svm.predict(custom_tests)

array(['GOOD'], dtype=object)

In [72]:
# Evaluate the fitted classifier on the held-out test vectors and labels.
test_score = clf_svm.score(test_x_vectors, test_y)
print('SVM Score:', test_score)



SVM Score 0.8426323319027181


In [73]:
from sklearn.metrics import f1_score

# Order of the labels here is the order of the values in the printed array.
f1_labels = [Score.AMAZING, Score.GOOD, Score.BAD]

# average=None asks for one F1 value per label rather than a single aggregate.
test_predictions = clf_svm.predict(test_x_vectors)
per_label_f1 = f1_score(test_y, test_predictions, labels = f1_labels, average = None)

print("SVM Score:", per_label_f1)


SVM Score  [0.83189655 0.82344633 0.         0.87301587]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
