# Objectives: 

* compare topics across years
* compare topics across locations

    
## What joins/aggregations do we need to achieve objectives?

### Yearly

1. Join the song topic weights with the tracks-per-year table on `track_id`
2. Average each topic weight over each year

### Location

1. Join the song topic weights with the artists table on `track_id`
2. Average each topic weight over each artist
3. Join the per-artist averages with the artist location table on `artist_id`


In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [118]:
# Input tables, all stored as Feather files relative to this notebook.

# Topic weights per song; joined on `track_id` below, and the `love`,
# `death` and `religion` columns are averaged (presumably they live here).
song_topic_weights = pd.read_feather(
    '../../data/clean/song_topic_weights_ranked.feather'
)

# Loaded but not referenced in the cells shown in this notebook.
topic_words = pd.read_feather(
    '../../data/transform/topic_words.feather'
)

# Joined on `track_id`; presumably the source of the `year` column.
tracks_per_year = pd.read_feather(
    '../../data/transform/tracks_per_year.feather'
)

# Joined on `track_id`; presumably the source of `artist_id` / `artist_name`.
unique_artists = pd.read_feather(
    '../../data/transform/unique_artists.feather'
)

# Loaded but not referenced in the cells shown in this notebook.
unique_tracks = pd.read_feather(
    '../../data/transform/unique_tracks.feather'
)

# Joined on `artist_id` to attach location data to per-artist averages.
artist_loc = pd.read_feather(
    '../../data/transform/artist_location.feather'
)

In [119]:
# Yearly topic averages.

# Attach the year columns to every song's topic weights. The inner join keeps
# only tracks that appear in both tables.
weights_yearly = (
    song_topic_weights.set_index('track_id')
    .join(tracks_per_year.set_index('track_id'), how='inner')
    .reset_index()
)

# Mean topic weight per year, computed in a single grouped pass instead of
# re-filtering the whole frame once per distinct year.
#   * groupby sorts its keys, so rows come out ordered by ascending year with
#     a fresh 0..n-1 index (required later by `to_feather`).
#   * Rows whose year is missing are left out (groupby's default), so no
#     all-NaN placeholder row is produced.
#   * The trailing column selection keeps the column order
#     love, death, religion, year in the saved file.
topics_per_year = (
    weights_yearly
    .groupby('year', as_index=False)[['love', 'death', 'religion']]
    .mean()
    [['love', 'death', 'religion', 'year']]
)

In [120]:
# Per-artist topic averages (input to the location join).

# Attach the artist columns to every song's topic weights. The inner join
# keeps only tracks that appear in both tables.
weights_artist = (
    song_topic_weights.set_index('track_id')
    .join(unique_artists.set_index('track_id'), how='inner')
    .reset_index()
)

# Mean topic weight per artist in one grouped pass, rather than filtering the
# full frame once per artist. `sort=False` keeps artists in order of first
# appearance, so the later sort by name starts from a deterministic order.
artist_means = (
    weights_artist
    .groupby('artist_id', sort=False)[['love', 'death', 'religion']]
    .mean()
)

# Name taken from the first row seen for each artist_id. `drop_duplicates`
# keeps that first row as-is (a grouped 'first' would skip missing names).
artist_names = (
    weights_artist
    .drop_duplicates(subset='artist_id')
    .set_index('artist_id')['artist_name']
)

# Combine, restore the column order love, death, religion, artist_id,
# artist_name, then order rows by artist name with a fresh 0..n-1 index
# (required later by `to_feather`).
topics_per_artist = (
    artist_means
    .join(artist_names)
    .reset_index()
    [['love', 'death', 'religion', 'artist_id', 'artist_name']]
    .sort_values(by='artist_name')
    .reset_index(drop=True)
)

In [121]:
# Attach location columns to the per-artist topic averages.
# `artist_name` is removed from the left side before joining; the inner join
# on `artist_id` keeps only artists present in both tables.
location_weight = (
    topics_per_artist
    .drop(columns='artist_name')
    .set_index('artist_id')
    .join(artist_loc.set_index('artist_id'), how='inner')
    .reset_index()
)

In [122]:
# Persist the three aggregated tables as Feather files, in this order:
# by location, by artist, by year.
for frame, destination in (
    (location_weight, '../../data/clean/location_topic_weights.feather'),
    (topics_per_artist, '../../data/clean/artist_topic_weights.feather'),
    (topics_per_year, '../../data/clean/year_topic_weights.feather'),
):
    frame.to_feather(destination)
