In [226]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from scipy import *
import pandas as pd
import re
import unicodedata
import os
import codecs
import nltk
import multiprocessing
import gensim

from nltk.corpus import *
from collections import *
from gensim import *
from gensim.models import *
from sklearn import *
from sklearn.metrics.pairwise import *
from nltk.stem import *



# Tabular inputs read from CSV; skipinitialspace=True drops blanks that follow a delimiter.
metadatadf = pd.read_csv('data/metadata.csv', skipinitialspace=True)
reviewsdf = pd.read_csv('data/reviews.csv', skipinitialspace=True)
# Disabled load: `subdf` is rebuilt further down from the Word2Vec neighbours
# and written to 'subgenres.csv' (note: a different path from the one below).
# subdf = pd.read_csv('data/subgenres.csv',skipinitialspace=True)

# Pre-built gensim dictionary / Matrix Market corpus pairs.
# `corpora` and `models` are presumably provided by `from gensim import *` above.
dictionary = corpora.Dictionary.load('models/reviewsDict.dict')
corpus = corpora.MmCorpus('models/reviewsDict.mm')
# NOTE(review): originaldict / originalcorp are not used anywhere in the visible cells.
originaldict = corpora.Dictionary.load('models/orig_text.dict')
originalcorp = corpora.MmCorpus('models/orig_text.mm')

# Previously trained models loaded from disk (word embeddings and an LDA topic model).
wem = Word2Vec.load('models/wem/truedata.wem')
lda = LdaModel.load('models/lda/lda.lda')
# TF-IDF weighting fitted on `corpus`, then applied to that same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# Top-level genre labels; later cells use them as query words against `wem`.
genres = ['rap','electronic','metal','rock','experimental','pop','r&b','folk','country','jazz','global']

In [None]:
def generate_models(topic_num):
    """Train one LSI and one LDA model with ``topic_num`` topics each.

    The LSI model is fitted on the module-level ``corpus_tfidf`` and the LDA
    model on the raw ``corpus``; both use the module-level ``dictionary`` as
    their id-to-word mapping.

    Args:
        topic_num: number of topics requested from both models.

    Returns:
        dict with the keys ``'lsi'`` and ``'lda'`` holding the trained models.
    """
    shared_options = dict(id2word=dictionary, num_topics=topic_num)
    trained = {}
    # LSI is trained first, then LDA, matching the original evaluation order.
    trained['lsi'] = models.LsiModel(corpus_tfidf, **shared_options)
    trained['lda'] = models.LdaModel(corpus, **shared_options)
    return trained

In [None]:
def analogy(model,a,b,c):
    """Answer the analogy "a is to b as c is to ?" with ``model``.

    Args:
        model: object exposing ``wv.most_similar(positive=..., negative=...)``.
        a: word passed as the single negative term.
        b: word passed as the second positive term.
        c: word passed as the first positive term.

    Returns:
        Whatever ``model.wv.most_similar`` returns for those terms.
    """
    added_terms = [c, b]
    removed_terms = [a]
    keyed_vectors = model.wv
    return keyed_vectors.most_similar(positive=added_terms, negative=removed_terms)

In [None]:
def similarity(model, a,b):
    """Return the similarity that ``model.wv`` reports between words ``a`` and ``b``.

    Args:
        model: object exposing ``wv.similarity(word1, word2)``.
        a: first word.
        b: second word.

    Returns:
        The value returned by ``model.wv.similarity(a, b)``.
    """
    keyed_vectors = model.wv
    score = keyed_vectors.similarity(a, b)
    return score

# Spot checks of the loaded embeddings: "rap is to hip-hop as folk is to ?".
# NOTE(review): if both calls live in one cell, IPython only displays the last
# expression, so this result is computed and then discarded.
analogy(wem,'rap','hip-hop','folk')

# Similarity between the words 'rap' and 'country' according to `wem`.
similarity(wem,'rap','country')

In [None]:
def get_similar(model,word,n):
    """Return the ``n`` entries ``model.wv.most_similar`` ranks closest to ``word``.

    Args:
        model: object exposing ``wv.most_similar(word, topn=...)``.
        word: query word.
        n: number of neighbours requested (forwarded as ``topn``).

    Returns:
        The value returned by ``model.wv.most_similar``.
    """
    keyed_vectors = model.wv
    neighbours = keyed_vectors.most_similar(word, topn=n)
    return neighbours

# For every top-level genre, keep the 25 nearest neighbours reported by the
# embedding model; these neighbours are treated as that genre's "subgenres".
subgenres = {genre: get_similar(wem, genre, 25) for genre in genres}

# Pairwise embedding similarity between the top-level genres.
# genre_distances maps each genre to a list of (other_genre, similarity)
# tuples sorted from least to most similar.
#
# The former 'rows' entry is intentionally gone: it aliased `genres`, so the
# sort below reordered `genres` itself (by second character) and the
# distinctiveness loop crashed on its plain strings. Because every column is
# sorted independently, a row-label column could not describe the rows anyway;
# each cell already carries the genre name in its tuple.
genre_distances = {}
for genre in genres:
    # list.append takes a single argument, so each pair is stored as one tuple.
    pairs = [(other, similarity(wem, genre, other)) for other in genres]
    pairs.sort(key=lambda pair: pair[1])
    genre_distances[genre] = pairs

# One column per genre; row k holds that genre's k-th least similar genre.
distancesdf = pd.DataFrame.from_dict(genre_distances)
distancesdf.to_csv('genre_distances.csv')

# Distinctiveness = mean of (1 - similarity) over all genres, the genre itself
# included (its self-similarity adds the same constant to every genre).
distinctiveness = [
    (genre, np.mean([1 - score for _, score in pairs]))
    for genre, pairs in genre_distances.items()
]
distinctiveness.sort(key=lambda pair: pair[1])

distinctiveness

# Flatten the genre -> neighbours mapping into a two-column table with one row
# per (genre, subgenre) pair and persist it.
genrestemp = []
substemp = []
valstemp = []  # not used in the visible cells; kept in case another cell reads it
for genre, neighbours in subgenres.items():
    # Repeat the genre label once per neighbour. The previous string
    # repeat-and-split trick produced '' as the last label of every group and
    # broke for genre names containing a space or for empty groups.
    genrestemp.extend([genre] * len(neighbours))
    # Each neighbour is indexed at [0] to take the word out of its entry.
    substemp.extend(neighbour[0] for neighbour in neighbours)

subdf = pd.DataFrame({'genre': genrestemp, 'subgenre': substemp})
subdf.to_csv('subgenres.csv')

# Rebuild `subgenres` from the rows of `subdf`: genre -> list of subgenre
# values, in row order. This overwrites the earlier mapping of the same name.
subgenres = {}
for _, record in subdf.iterrows():
    subgenres.setdefault(record.genre, []).append(record.subgenre)

# subs_corr[genre][word] is the list of similarities (rounded to 5 places)
# between `word` and every entry of that genre's subgenre list, in list order.
subs_corr = {
    genre: {
        word: [round(similarity(wem, word, other), 5) for other in members]
        for word in members
    }
    for genre, members in subgenres.items()
}


# Write one similarity matrix per genre to '<genre>.csv'.
# 'global' is skipped, as in the original code (reason not visible here).
for genre in genres:
    if genre == 'global':
        continue
    labels = subgenres[genre]
    # Row labels are passed to the constructor: DataFrame.set_axis returns a
    # new frame on current pandas, so the earlier bare `tempdf.set_axis(...)`
    # call left the rows labelled 0..n-1.
    tempdf = pd.DataFrame(
        {label: subs_corr[genre][label] for label in labels},
        index=labels,
    )
    tempdf.to_csv(genre + '.csv')

In [16]:
# Whitespace-tokenise every review into a list of tokens.
# NOTE(review): str() turns a missing value (NaN) into the single token 'nan'.
reviews = [str(text).split() for text in reviewsdf.review]

In [179]:
# Two-topic LSI model, used to place every document on a 2-D plane.
lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lsa.save('models/twodlsa.lsi')
# Project the same TF-IDF vectors the model was trained on. Feeding the raw
# bag-of-words `corpus` here would put the documents in a different vector
# space from the one the topics were fitted in.
vectorized_corpus = lsa[corpus_tfidf]

In [180]:
# Genre label of the rows labelled 0..19554 in metadatadf.
# NOTE(review): 19555 is hard-coded; presumably the number of documents — confirm.
genreslist = [metadatadf.loc[doc_id].genre for doc_id in range(19555)]

In [181]:
# Collect (doc_id, album, genre, x, y) for every document whose projection
# has both components and whose metadata row exists.
coordinates = []
for doc_id in range(len(vectorized_corpus)):
    try:
        # Project the document once instead of once per coordinate.
        doc_vector = vectorized_corpus[doc_id]
        meta = metadatadf.loc[doc_id]
        coordinates.append(
            (doc_id, meta.album, meta.genre, doc_vector[0][1], doc_vector[1][1])
        )
    except (IndexError, KeyError):
        # IndexError: the projection has fewer than two entries.
        # KeyError: no metadata row with this label.
        # Any other error is unexpected and is no longer silently swallowed.
        print(doc_id)
coordinatesdf = pd.DataFrame.from_records(
    coordinates, columns=['doc_id', 'album', 'genre', 'x', 'y']
)
# set_index returns a new frame; use it for the export so the CSV is keyed by
# doc_id instead of carrying an extra unnamed 0..n-1 column. The in-memory
# frame keeps doc_id as a regular column, as before.
coordinatesdf.set_index('doc_id').to_csv('models/lsa_coords.csv')

13300
17166


In [190]:
def get_nice_topics(model):
    """Render all topics of ``model`` as one printable string.

    Args:
        model: object exposing ``get_topics()`` (its length is the topic
            count) and ``print_topic(topic_id)``.

    Returns:
        The topic count on the first line, followed by each topic's
        ``print_topic`` text, with a blank line between topics.
    """
    topic_count = len(model.get_topics())
    rendered = [model.print_topic(topic_id) for topic_id in range(topic_count)]
    header = str(topic_count)
    return header + '\n' + '\n\n'.join(rendered)

# Train an LSI/LDA pair for every topic count from 5 to 39.
models_list = [generate_models(topic_num) for topic_num in range(5, 40)]

# Dump the topics of every model to two text reports. The context manager
# closes both files even if rendering a topic fails; utf-8 is set explicitly
# so non-ASCII tokens do not depend on the platform's default encoding.
with open('models/lsa.txt', 'w', encoding='utf-8') as lsafile, \
        open('models/lda.txt', 'w', encoding='utf-8') as ldafile:
    for trained in models_list:
        lsafile.write(get_nice_topics(trained['lsi']))
        ldafile.write(get_nice_topics(trained['lda']))
        lsafile.write('\n\n')
        ldafile.write('\n\n')

# Persist every model. File names use the position in models_list (0..34),
# not the topic count (5..39).
os.makedirs('models/lsa', exist_ok=True)
os.makedirs('models/lda', exist_ok=True)
for position, trained in enumerate(models_list):
    trained['lsi'].save('models/lsa/' + str(position) + '.lsi')
    trained['lda'].save('models/lda/' + str(position) + '.lda')

# Topics of the LDA model loaded from disk at the top of the notebook.
print(get_nice_topics(lda))

In [185]:
# Load the stop-word list: every whitespace-separated token of stopwords.txt.
# NOTE(review): this rebinds any `stopwords` name brought in by the star imports.
with open("stopwords.txt") as f:
    stopwords = f.read().split()
def remove_stop(m):
    """Replacement callback for ``re.sub`` that blanks out stop words.

    Args:
        m: match object; its whole matched text is looked up in the
            module-level ``stopwords`` collection.

    Returns:
        ``''`` when the matched text is a stop word, otherwise the matched
        text unchanged.
    """
    token = m.group()
    if token in stopwords:
        return ''
    return token
def clean_text(text):
    """Normalise raw review text before tokenisation.

    Steps, in order:
      1. drop tokens with an apostrophe or curly quote between non-space
         characters (contractions, possessives);
      2. drop tokens containing a digit;
      3. drop capitalised words (a capital followed by at least one more
         non-space character) unless they start the text or follow one of
         ``( ) . ! ? “`` plus a whitespace character;
      4. replace every character other than ASCII letters, whitespace,
         hyphen, en dash and ``&`` with a space;
      5. rewrite a whitespace-delimited ``u s`` as ``U.S.``;
      6. lowercase the result.

    Args:
        text: raw text (``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    temp = text
    # Raw literals: '\S', '\w', '\d' ... are invalid escapes in normal strings.
    temp = re.sub(r'\S+[\'\’\‘]\S+', '', temp)
    temp = re.sub(r'\w*\d\S*', '', temp)
    temp = re.sub(r'(?<!^|$)(?<!([(\.)(\!)(\?)(\“)]\s))([A-Z]\S+)', '', temp)
    # 'A-Za-z' instead of 'A-z': the latter also spans [ \ ] ^ _ ` and let
    # those characters through.
    temp = re.sub(r'[^A-Za-z\s\-\–\&]', ' ', temp)
    temp = re.sub(r'\su\ss\s', ' U.S. ', temp)
    # NOTE(review): '#' and '|' were already replaced by spaces above, so this
    # pattern cannot match at this point; kept in place pending a decision on
    # whether it should run earlier.
    temp = re.sub(r'\[#.+]\|+', '', temp)
    temp = temp.lower()
    return temp

In [308]:
# Re-reads stopwords.txt and rebinds `stopwords`; this later binding is the
# one in effect for the cells that follow.
with open("stopwords.txt") as f:
    stopwords = f.read().split()
def remove_stop(m):
    """Return ``''`` for a matched stop word, else the matched text.

    Intended as the replacement callback of ``re.sub``.

    Args:
        m: match object whose full match is tested against the module-level
            ``stopwords`` collection.

    Returns:
        The empty string for stop words, the unchanged match otherwise.
    """
    matched = m.group()
    if matched in stopwords:
        return ''
    return matched
def clean_text(text):
    """Normalise raw review text before tokenisation.

    This later definition is the one in effect for the cells that follow.

    Steps, in order:
      1. drop tokens with an apostrophe or curly quote between non-space
         characters (contractions, possessives);
      2. drop tokens containing a digit;
      3. drop capitalised words (a capital followed by at least one more
         non-space character) unless they start the text or follow one of
         ``( ) . ! ? “`` plus a whitespace character;
      4. replace every character other than ASCII letters, whitespace,
         hyphen, en dash and ``&`` with a space;
      5. rewrite a whitespace-delimited ``u s`` as ``U.S.``;
      6. lowercase the result.

    Args:
        text: raw text (``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    temp = text
    # Raw literals: '\S', '\w', '\d' ... are invalid escapes in normal strings.
    temp = re.sub(r'\S+[\'\’\‘]\S+', '', temp)
    temp = re.sub(r'\w*\d\S*', '', temp)
    temp = re.sub(r'(?<!^|$)(?<!([(\.)(\!)(\?)(\“)]\s))([A-Z]\S+)', '', temp)
    # 'A-Za-z' instead of 'A-z': the latter also spans [ \ ] ^ _ ` and let
    # those characters through.
    temp = re.sub(r'[^A-Za-z\s\-\–\&]', ' ', temp)
    temp = re.sub(r'\su\ss\s', ' U.S. ', temp)
    # NOTE(review): '#' and '|' were already replaced by spaces above, so this
    # pattern cannot match at this point; kept in place pending a decision on
    # whether it should run earlier.
    temp = re.sub(r'\[#.+]\|+', '', temp)
    temp = temp.lower()
    return temp
# Clean, stop-word-filter and tokenise the review held in `yackreview`.
# NOTE(review): `yackreview` must already contain the raw review text, but no
# visible cell defines it, and this statement overwrites it with a token list,
# so the cell cannot be re-run without reloading the text first.
cleaned_review = clean_text(yackreview)
# Each word (optionally joined to a second word by - — . & ’) is passed to
# remove_stop, which blanks it when it is a stop word.
without_stopwords = re.sub(r'\w+(\-|\—|\.|\&|\’)?(\w+)?', remove_stop, cleaned_review)
# Raw literal avoids the invalid '\s' escape; the regex itself is unchanged.
yackreview = re.sub(r'[\s\n\r\t]+', ' ', without_stopwords).split()

In [338]:
# All review tokens concatenated into one flat token list (joined with single
# spaces, then split on whitespace again).
totaltext = ' '.join(' '.join(tokens) for tokens in reviews).split()

# Bag-of-words vector of the tokenised `yackreview`, and the LDA model's output for it.
# NOTE(review): yackvec_lda is not read by any visible cell.
yackvec_bow = dictionary.doc2bow(yackreview)
yackvec_lda = lda[yackvec_bow]
# Hard-coded sparse list of (index, weight) pairs; the scoring loop below
# densifies it to lda.num_topics entries and uses it as the reference vector.
# NOTE(review): presumably a saved LDA result for the whole corpus (`totaltext`) — confirm
# and regenerate if `lda` or `dictionary` changes.
totalvec_lda = [(0, 0.032843836), (1, 0.04208189), (2, 0.10474158), (5, 0.042373378), (6, 0.013250324), (9, 0.036808446), (11, 0.023320341), (12, 0.021665176), (14, 0.10429024), (15, 0.030837856), (16, 0.017426088), (18, 0.018971419), (20, 0.049614906), (21, 0.018747063), (22, 0.11087632), (26, 0.14842194), (28, 0.010651322), (33, 0.032480825), (35, 0.07782937)]

# Hellinger distance between every review's LDA vector and the reference
# vector `totalvec_lda`: sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2)).
# The reference vector does not change inside the loop, so it is densified
# (and square-rooted) once up front.
dense2 = gensim.matutils.sparse2full(totalvec_lda, lda.num_topics)
sqrt_dense2 = np.sqrt(dense2)

scores = []
for review_tokens in reviews:
    sampvec_bow = dictionary.doc2bow(review_tokens)
    sampvec_lda = lda[sampvec_bow]
    dense1 = gensim.matutils.sparse2full(sampvec_lda, lda.num_topics)
    # Despite the name, `sim` is a distance: 0 means identical distributions.
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - sqrt_dense2) ** 2).sum())
    scores.append(sim)