In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from scipy import *
import pandas as pd
import re
import unicodedata
import os
import codecs
import nltk
import multiprocessing

from nltk.corpus import *
from collections import *
from gensim import *
from gensim.models import *
from sklearn import *
from sklearn.metrics.pairwise import *
from nltk.stem import *



# --- Album metadata and review text -----------------------------------------
metadatadf, reviewsdf = (
    pd.read_csv(csv_path, skipinitialspace=True)
    for csv_path in ('data/metadata.csv', 'data/reviews.csv')
)
# subdf is built further down in this notebook instead of being read from
# data/subgenres.csv.

# --- Bag-of-words dictionaries and their Matrix Market corpora --------------
dictionary, originaldict = (
    corpora.Dictionary.load(dict_path)
    for dict_path in ('models/reviewsDict.dict', 'models/orig_text.dict')
)
corpus, originalcorp = (
    corpora.MmCorpus(mm_path)
    for mm_path in ('models/reviewsDict.mm', 'models/orig_text.mm')
)

# --- Previously trained models ----------------------------------------------
wem = Word2Vec.load('models/wem/truedata.wem')
lda = LdaModel.load('models/lda/lda.lda')

# TF-IDF weighting is fitted on the review corpus and applied to it lazily.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Genre words that are looked up in the word-embedding model below.
genres = [
    'rap', 'electronic', 'metal', 'rock', 'experimental', 'pop',
    'r&b', 'folk', 'country', 'jazz', 'global',
]

In [None]:
def generate_models(topic_num, random_state=None):
    """Train one LSI and one LDA topic model with ``topic_num`` topics each.

    The LSI model is fitted on the TF-IDF weighted corpus (``corpus_tfidf``),
    the LDA model on the raw bag-of-words ``corpus``; both use the
    module-level ``dictionary`` for id -> word lookup.

    Parameters
    ----------
    topic_num : int
        Number of topics requested from both models.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``LdaModel`` so an LDA run can be reproduced. The
        default ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    dict
        ``{'lsi': <LsiModel>, 'lda': <LdaModel>}``.
    """
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_num)
    lda = models.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=topic_num,
        random_state=random_state,
    )
    return {'lsi': lsi, 'lda': lda}

In [None]:
def analogy(model,a,b,c):
    """Run an analogy query against the model's word vectors.

    Calls ``model.wv.most_similar`` with ``c`` and ``b`` as positive terms
    and ``a`` as the negative term, and returns whatever that call returns.
    """
    keyed_vectors = model.wv
    pull_towards = [c, b]
    push_away = [a]
    return keyed_vectors.most_similar(positive=pull_towards, negative=push_away)

In [None]:
def similarity(model, a,b):
    """Return ``model.wv.similarity(a, b)`` for the two words ``a`` and ``b``."""
    keyed_vectors = model.wv
    score = keyed_vectors.similarity(a, b)
    return score

# Example analogy query on the loaded embedding: positive terms 'folk' and
# 'hip-hop', negative term 'rap'.
# NOTE(review): if this call shares a cell with the one below, only the last
# expression's value is echoed by the notebook — confirm the cell boundaries.
analogy(wem,'rap','hip-hop','folk')

# Example pairwise similarity between two genre words.
similarity(wem,'rap','country')

In [None]:
def get_similar(model,word,n):
    """Return the ``n`` nearest neighbours of ``word``.

    Thin wrapper around ``model.wv.most_similar(word, topn=n)``; the result
    of that call is returned unchanged.
    """
    keyed_vectors = model.wv
    neighbours = keyed_vectors.most_similar(word, topn=n)
    return neighbours

# For every genre word, keep its 25 nearest neighbours in the embedding
# (whatever get_similar returns for that word).
subgenres = {genre: get_similar(wem, genre, 25) for genre in genres}

# Similarity of every genre word to every genre word (itself included).
# genre_distances maps genre -> list of (other_genre, similarity) tuples,
# sorted from least to most similar.
#
# Only genre keys are stored: a 'rows': genres entry would alias the shared
# `genres` list (sorting it in place) and its string items would break the
# numeric distinctiveness computation below.
genre_distances = {}
for genre in genres:
    # list.append accepts a single argument, so each pair is built as one tuple.
    pairs = [(other, similarity(wem, genre, other)) for other in genres]
    pairs.sort(key=lambda tup: tup[1])
    genre_distances[genre] = pairs

# Every column is sorted independently, so a row position in the frame is a
# rank (0 = least similar), not a fixed genre; the genre each value refers to
# is carried inside the cell's tuple.
distancesdf = pd.DataFrame.from_dict(genre_distances)
distancesdf.to_csv('genre_distances.csv')

# Distinctiveness of a genre = mean of (1 - similarity) over all genre words.
# The genre's comparison with itself is part of that mean.
distinctiveness = [
    (genre, np.mean([1 - score for _, score in pairs]))
    for genre, pairs in genre_distances.items()
]
distinctiveness.sort(key=lambda tup: tup[1])

distinctiveness

# Flatten the genre -> neighbours mapping into two aligned columns, one row
# per (genre, neighbouring word) pair, and persist it.
genrestemp = []
substemp = []
valstemp = []  # never filled in the visible notebook; kept in case a later cell reads it
for genre, neighbours in subgenres.items():
    # Repeat the genre label exactly once per neighbour. Building the labels
    # by repeating "genre " and splitting on ' ' gave n-1 labels plus a
    # trailing '', which filed each genre's last neighbour under an empty genre.
    genrestemp.extend([genre] * len(neighbours))
    # Each neighbour is indexed for its first item (the word itself).
    substemp.extend(neighbour[0] for neighbour in neighbours)

subdf = pd.DataFrame({'genre': genrestemp, 'subgenre': substemp})
subdf.to_csv('subgenres.csv')

# Regroup subdf into a plain dict: genre -> list of its subgenre words,
# preserving the row order of subdf.
subgenres = {}
for genre_label, subgenre_label in zip(subdf['genre'], subdf['subgenre']):
    subgenres.setdefault(genre_label, []).append(subgenre_label)

# subs_corr[genre][word] is the list of similarities between `word` and every
# subgenre word of the same genre (in subgenres[genre] order), each rounded
# to 5 decimal places.
subs_corr = {
    genre: {
        word: [round(similarity(wem, word, other), 5) for other in members]
        for word in members
    }
    for genre, members in subgenres.items()
}


# Write one similarity matrix per genre to '<genre>.csv': columns and rows are
# both that genre's subgenre words.
for genre in genres:
    # NOTE(review): 'global' is skipped on purpose by the original code; the
    # reason is not visible in this notebook.
    if genre == 'global':
        continue
    members = subgenres[genre]
    # The row labels are set at construction. Calling tempdf.set_axis(...) and
    # discarding its result leaves the frame unlabelled on pandas versions
    # where set_axis returns a new object instead of mutating in place.
    tempdf = pd.DataFrame(
        {word: subs_corr[genre][word] for word in members},
        index=members,
    )
    tempdf.to_csv(genre + '.csv')

In [16]:
# Whitespace-tokenise every review. str() is applied first, so cells that are
# not strings are still turned into token lists rather than raising.
reviews = [str(review_text).split() for review_text in reviewsdf.review]

In [47]:
# Apply the `lsa` transformation to the bag-of-words corpus.
# NOTE(review): `lsa` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel (Restart & Run All). Presumably it was one of
# the LSI models trained in another cell — confirm which one and assign it
# explicitly before this line.
vectorized_corpus = lsa[corpus]

In [104]:
# Genre of the metadata rows labelled 0..19554, in label order.
# NOTE(review): 19555 is hard-coded — presumably the row count of
# data/metadata.csv when this was written; confirm it still matches.
genreslist = [metadatadf.loc[row_label].genre for row_label in range(19555)]

In [165]:
# Build one (album, genre, x, y) record per document, where x and y are the
# second items of the first two entries of the document's transformed vector
# (presumably (topic_id, weight) pairs — confirm against the model used).
coordinates = []
for i in range(len(vectorized_corpus)):
    try:
        # Look each source up once: indexing the transformed corpus may redo
        # the projection for that document on every access.
        album_row = metadatadf.loc[i]
        doc_vector = vectorized_corpus[i]
        coordinates.append(
            (album_row.album, album_row.genre, doc_vector[0][1], doc_vector[1][1])
        )
    except (KeyError, IndexError):
        # A missing metadata row or a vector with fewer than two entries:
        # report the document index and carry on. Anything else (including
        # KeyboardInterrupt) is no longer swallowed.
        print(i)

13300
17166


In [166]:
# Name the columns while building the frame, so it is fully labelled no matter
# which cells run afterwards and an empty `coordinates` list still yields the
# four expected (empty) columns.
coordinatesdf = pd.DataFrame.from_records(
    coordinates, columns=['album', 'genre', 'x', 'y']
)

In [167]:
# Label the four record fields.
coordinatesdf.columns = ['album','genre','x','y']
# Preview only: the result of set_index is not assigned, so it is just echoed
# as the cell output and `coordinatesdf` itself keeps its existing index.
coordinatesdf.set_index('album')

Unnamed: 0_level_0,genre,x,y
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.M./Being There,Rock,8.646844,0.493461
No Shame,Rap,3.474052,2.101561
Material Control,Rock,3.498922,0.153278
Weighing of the Heart,Pop/R&B,3.940656,1.029091
The Visitor,Rock,3.268955,-0.191249
Perfect Angel,Pop/R&B,4.268568,0.437881
Everyday Is Christmas,Pop/R&B,2.584147,0.204624
Zaytown Sorority Class of 2017,Rap,2.940709,-0.499059
Songs of Experience,Rock,3.650971,0.377429
Post Self,Metal,1.978547,-0.015314


In [164]:
# Persist the coordinates table as CSV.
# NOTE(review): this cell's execution count (164) is lower than those of the
# cells that build coordinatesdf (165-167), so the file on disk may predate
# the frame shown above — re-run in order before relying on it.
coordinatesdf.to_csv('models/lsa_coords.csv')

In [15]:
def get_nice_topics(model):
    """Render all topics of ``model`` as one printable string.

    The first line is the number of topics (``len(model.get_topics())``);
    it is followed by ``model.print_topic(i)`` for each topic index in
    order, with a blank line between consecutive topics.
    """
    topic_count = len(model.get_topics())
    header = str(topic_count) + '\n'
    descriptions = (model.print_topic(topic_id) for topic_id in range(topic_count))
    return header + '\n\n'.join(descriptions)

# Make sure the output folders exist before the (expensive) training starts,
# so a missing directory cannot waste the whole sweep.
os.makedirs('models/lsa', exist_ok=True)
os.makedirs('models/lda', exist_ok=True)

# Train one LSI/LDA pair for every topic count from 5 to 39 inclusive.
models_list = [generate_models(i) for i in list(range(5,40))]

# Dump the topic listings of all models, one listing per model followed by a
# blank line. The context manager closes both files even if a write fails.
with open('models/lsa.txt', 'w') as lsafile, open('models/lda.txt', 'w') as ldafile:
    for trained in models_list:
        lsafile.write(get_nice_topics(trained['lsi']))
        ldafile.write(get_nice_topics(trained['lda']))
        lsafile.write('\n\n')
        ldafile.write('\n\n')

# Save every model. The file name is the model's position in models_list
# (0, 1, ...), not its topic count: position 0 is the 5-topic model.
for i, trained in enumerate(models_list):
    trained['lsi'].save('models/lsa/'+str(i)+'.lsi')
    trained['lda'].save('models/lda/'+str(i)+'.lda')

# Show the topics of the module-level `lda` model. The `lda` created inside
# generate_models is local to that function, so this is the model bound to
# `lda` at notebook level (loaded from 'models/lda/lda.lda' in the setup cell).
print(get_nice_topics(lda))