In [None]:
import datetime
import gzip
import os
import time
import cPickle as pickle

import matplotlib.dates as mdates
import matplotlib.pylab as mpylab
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
from matplotlib.pylab import subplots

In [None]:
def UTC_to_UnixTime(dt):
    '''
    Convert a UTC datetime into whole seconds since the Unix epoch.

    The epoch used here is a naive datetime, so dt has to be naive as well
    (subtracting a naive datetime from an aware one raises TypeError).
    Fractions of a second are dropped by int(), i.e. truncated toward zero.

    :param dt: UTC date and time
    :type dt: datetime.datetime
    :returns: Unix time of dt, like 1200835411
    :rtype: int
    '''
    epoch = datetime.datetime(1970, 1, 1)
    elapsed = dt - epoch
    return int(elapsed.total_seconds())

def UnixTime_to_UTC(timestamp):
    '''
    Convert a Unix timestamp into the matching naive UTC datetime.

    The argument is passed through int() first, so a float loses its
    fractional part and a numeric string is accepted.

    :param timestamp: Unix timestamp, like 1200835411
    :type timestamp: int
    :returns: the corresponding UTC date and time
    :rtype: datetime.datetime
    '''
    # to print the result, use .strftime('%Y-%m-%d %H:%M:%S') on it
    whole_seconds = int(timestamp)
    return datetime.datetime.utcfromtimestamp(whole_seconds)

def find_period(UnixTime, periods):
    '''
    Locate the first period that contains a given Unix time.

    Every period is treated as the half-open interval [start, end): the
    start is inclusive and the end is exclusive.

    :param UnixTime: the moment to look up, like 1200835411
    :type UnixTime: int
    :param periods: intervals like [(start_UnixTime, end_UnixTime), ]
    :type periods: list
    :returns: index into periods of the first interval containing UnixTime,
        or -1 when no interval contains it
    :rtype: int
    '''
    for index, bounds in enumerate(periods):
        if not (bounds[0] <= UnixTime < bounds[1]):
            continue
        return index

    # no interval matched
    return -1


In [None]:
# `path` used to be assigned only in the analysis cell further down, so this
# cell raised NameError on a fresh kernel (Restart & Run All). It is defined
# here with the same value that cell uses.
# NOTE(review): assumes the dictionary and demographics files live in the same
# directory as the user files - confirm, and change to your local directory.
path = '/Users/yjt/Downloads/MLHD_000'

# Security: pickle.load can execute arbitrary code; only load a file you trust.
# The with-block closes the file handle, which the bare open() call leaked.
with open(os.path.join(path, 'MBID_dictionary.p'), 'rb') as dictionary_file:
    MBID_dictionary = pickle.load(dictionary_file)

df_demographics = pd.read_csv(
    os.path.join(path, 'MLHD_demographics_scrobble.csv'), delimiter='\t')
df_demographics.head()

In [None]:
def listeners_vs_time_for_genres(genres, periods, path):
    '''
    For several genres, count the number of listeners in different periods.

    A user is counted at most once per (genre, period) pair. Only users that
    have a row in df_demographics whose recorded time range covers the whole
    span from periods[0][0] to periods[-1][1] are used, so every counted user
    was contributing data during all of the periods.

    :param genres: genres selected; a genre matches a tag when it is a
        substring of the tag, so 'rock' matches 'hardrock' and 'pop/rock'
    :type genres: list
    :param periods: periods selected, with format
        [(start_UnixTime, end_UnixTime), ]; the first start and the last end
        are taken as the overall span
    :type periods: list
    :param path: path to the directory that contains the user files, each
        named uuid.txt.gz
    :type path: str
    :returns: statistics with format {genre: (periods, listener_counts)},
        where listener_counts[i] is the number of listeners in periods[i]
    :rtype: dict

    Global variables df_demographics and MBID_dictionary are used.
    '''
    # {genre: (periods, #listeners per period)}
    statistics = {}
    for genre in genres:
        statistics[genre] = (periods, [0] * len(periods))

    # With no periods there is nothing to count (and periods[0] would fail).
    if not periods:
        return statistics

    span_start = periods[0][0]
    span_end = periods[-1][1]
    suffix = '.txt.gz'

    for filename in os.listdir(path):  # iterate over all user files
        # A user file is named uuid.txt.gz; anything else in the directory
        # (e.g. the demographics CSV) is skipped.
        if not filename.endswith(suffix):
            continue

        # Decide whether this user is usable before paying for the
        # decompression and parsing of the whole log file.
        user_uuid = filename[:-len(suffix)]
        user_demographics = df_demographics[df_demographics.uuid == user_uuid]
        if user_demographics.shape[0] == 0:  # user has no demographics
            continue
        # NOTE(review): columns 4 and 5 are presumably the user's first and
        # last scrobble timestamps - confirm against the CSV header.
        covers_span = (user_demographics.iat[0, 4] <= span_start
                       and user_demographics.iat[0, 5] >= span_end)
        if not covers_span:
            continue

        # Fixed: the original opened `new_path + '/' + filename`, but
        # `new_path` is not defined anywhere; the directory is `path`.
        with gzip.open(os.path.join(path, filename)) as f:
            user = pd.read_csv(f, delimiter='\t', header=None)

        # cleaning the data: keep only the time and artist columns
        user = user.drop([2, 3], axis=1)
        user = user.dropna()
        user.columns = ['time', 'artist_MBID']

        # tag_added[genre][i] is True once this user has been counted as a
        # listener of genre in periods[i], so one user is never counted twice
        # for the same genre and period.
        tag_added = {genre: [False] * len(periods) for genre in genres}

        for i in range(user.shape[0]):  # iterate over all logs of this user
            period_index = find_period(user.iat[i, 0], periods)
            if period_index == -1:
                continue
            artist_MBID = user.iat[i, 1]
            if artist_MBID not in MBID_dictionary:
                continue
            for tag in MBID_dictionary[artist_MBID]:
                for genre in genres:
                    # tag can be something like 'hardrock' and 'pop/rock'
                    if genre in tag and not tag_added[genre][period_index]:
                        statistics[genre][1][period_index] += 1
                        tag_added[genre][period_index] = True
    return statistics

In [None]:
def plot_listeners_vs_time_for_genres(statistics, datemin, datemax):
    '''
    Plot, for several genres, the number of listeners against time, and save
    the figure as '1_(<datemin.year>,<datemax.year>).png'.

    Each genre becomes one labelled line; the x position of a point is the
    start of its period converted from Unix time to a UTC datetime.

    :param statistics: dictionary of format {genre: (periods, #listeners)}
    :type statistics: dict
    :param datemin: the lower bound of x that is plotted
    :type datemin: datetime.datetime
    :param datemax: the upper bound of x that is plotted
    :type datemax: datetime.datetime
    :returns: no return
    '''
    fig, ax = subplots()

    # one line per genre
    for genre in statistics:
        genre_periods = statistics[genre][0]
        period_starts = [UnixTime_to_UTC(span[0]) for span in genre_periods]
        listener_counts = statistics[genre][1]
        ax.plot(period_starts, listener_counts, label=genre)

    # legend sits outside the axes, to the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.set_xlabel('date')
    ax.set_ylabel('number of listeners')
    ax.set_title('number of listeners vs date for different genres', fontsize=14)

    # ticks: a labelled major tick every year, a minor tick every month,
    # and integer-only ticks on the y axis
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))

    ax.set_xlim(datemin, datemax)
    ax.grid(True)

    # rotate and right-align the x labels, and move the bottom of the axes
    # up to make room for them
    fig.autofmt_xdate()

    fig.show()
    output_name = '1_(%d,%d).png' % (datemin.year, datemax.year)
    mpylab.savefig(output_name, bbox_inches='tight')

In [None]:
# Plot how the number of listeners changes with time for selected genres.
# We use 1000 user files and one month as the time period. The whole span
# (2008-2012) was selected empirically.
#
# Before running, change `path` to the local directory that stores the
# user files.

# first select points in time (the first day of every month) and convert
# them to Unix times
selected_datetimes = [
    datetime.datetime(2008 + year_offset, month, 1)
    for year_offset in range(5)
    for month in range(1, 13)
]
UnixTimes = [UTC_to_UnixTime(dt) for dt in selected_datetimes]

# then form periods from consecutive points
# periods = [(start_UnixTime, end_UnixTime), ]
periods = list(zip(UnixTimes[:-1], UnixTimes[1:]))

# list of genres; matching is by substring, so 'blue' also matches 'blues'
genres = ['pop', 'rock', 'jazz', 'folk', 'blue', 'kpop', 'hiphop', 'disco']
path = '/Users/yjt/Downloads/MLHD_000'

start_time = time.time()
# Fixed: this used to call statistics_1, which is not defined in this
# notebook; the function is named listeners_vs_time_for_genres.
statistics1 = listeners_vs_time_for_genres(genres, periods, path)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
%matplotlib inline

In [None]:
# Restrict the x axis to the year 2010 and draw the per-genre listener counts.
datemin = datetime.datetime(2010, 1, 1)
datemax = datetime.datetime(2011, 1, 1)
# Fixed: this used to call plot_graph1, which is not defined in this
# notebook; the function is named plot_listeners_vs_time_for_genres.
plot_listeners_vs_time_for_genres(statistics1, datemin, datemax)