In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [4]:
# Dataset directory is supplied via the LASTFM_1K_DIRNAME environment variable;
# indexing os.environ raises KeyError when the variable is not set.
dirname = os.environ['LASTFM_1K_DIRNAME']

In [5]:
# Full path of the tab-separated input file inside the dataset directory.
filepath = os.path.join(dirname, 'userid-timestamp-artid-artname-traid-traname.tsv')

In [6]:
%%time

df = pd.read_csv(
    filepath, sep='\t', header=None,
    names=[
        'user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name'
    ],
    skiprows=[
        2120260-1, 2446318-1, 11141081-1,
        11152099-1, 11152402-1, 11882087-1,
        12902539-1, 12935044-1, 17589539-1
    ]
)
df["timestamp"] = pd.to_datetime(df.timestamp)
df.sort_values(['user_id', 'timestamp'], ascending=True, inplace=True)
print(f'Number of Records: {len(df):,}\nUnique Users: {df.user_id.nunique()}\nUnique Artist:{df.artist_id.nunique():,}')
df.head(5)

Number of Records: 19,098,853
Unique Users: 992
Unique Artist:107,295
CPU times: user 50.2 s, sys: 5.71 s, total: 55.9 s
Wall time: 56.1 s


Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
16684,user_000001,2006-08-13 13:59:20+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,c4633ab1-e715-477f-8685-afa5f2058e42,The Launching Of Big Face
16683,user_000001,2006-08-13 14:03:29+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,bc2765af-208c-44c5-b3b0-cf597a646660,Zn Zero
16682,user_000001,2006-08-13 14:10:43+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,aa9c5a80-5cbe-42aa-a966-eb3cfa37d832,The Return Of Super Barrio - End Credits
16681,user_000001,2006-08-13 14:17:40+00:00,67fb65b5-6589-47f0-9371-8a40eb268dfb,Tommy Guerrero,d9b1c1da-7e47-4f97-a135-77260f2f559d,Mission Flats
16680,user_000001,2006-08-13 14:19:06+00:00,1cfbc7d1-299c-46e6-ba4c-1facb84ba435,Artful Dodger,120bb01c-03e4-465f-94a0-dce5e9fac711,What You Gonna Do?


In [8]:
# Summarise index, column dtypes and memory usage of the CSV-loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19098853 entries, 16684 to 19080480
Data columns (total 6 columns):
 #   Column       Dtype              
---  ------       -----              
 0   user_id      object             
 1   timestamp    datetime64[ns, UTC]
 2   artist_id    object             
 3   artist_name  object             
 4   track_id     object             
 5   track_name   object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 1020.0+ MB


In [9]:
# Write the sorted frame next to the source TSV as a snappy-compressed parquet
# file. index=False leaves the (post-sort, non-contiguous) index out of the file.
save_filepath = os.path.join(
    dirname,
    'userid-timestamp-artid-artname-traid-traname.snappy.parquet',
)
df.to_parquet(
    save_filepath,
    index=False,
    compression='snappy',
)

In [10]:
# Unbind the CSV-loaded frame; the next cell rebinds `df` from the parquet file.
del df

In [11]:
%%time

df = pd.read_parquet(save_filepath)
print(f'Number of Records: {len(df):,}\nUnique Users: {df.user_id.nunique()}\nUnique Artist:{df.artist_id.nunique():,}')
df.head(5)

Number of Records: 19,098,853
Unique Users: 992
Unique Artist:107,295
CPU times: user 12.7 s, sys: 3.9 s, total: 16.6 s
Wall time: 17.6 s


Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2006-08-13 13:59:20+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,c4633ab1-e715-477f-8685-afa5f2058e42,The Launching Of Big Face
1,user_000001,2006-08-13 14:03:29+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,bc2765af-208c-44c5-b3b0-cf597a646660,Zn Zero
2,user_000001,2006-08-13 14:10:43+00:00,09a114d9-7723-4e14-b524-379697f6d2b5,Plaid & Bob Jaroc,aa9c5a80-5cbe-42aa-a966-eb3cfa37d832,The Return Of Super Barrio - End Credits
3,user_000001,2006-08-13 14:17:40+00:00,67fb65b5-6589-47f0-9371-8a40eb268dfb,Tommy Guerrero,d9b1c1da-7e47-4f97-a135-77260f2f559d,Mission Flats
4,user_000001,2006-08-13 14:19:06+00:00,1cfbc7d1-299c-46e6-ba4c-1facb84ba435,Artful Dodger,120bb01c-03e4-465f-94a0-dce5e9fac711,What You Gonna Do?


In [12]:
# Summarise index, column dtypes and memory usage of the parquet-loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19098853 entries, 0 to 19098852
Data columns (total 6 columns):
 #   Column       Dtype              
---  ------       -----              
 0   user_id      object             
 1   timestamp    datetime64[ns, UTC]
 2   artist_id    object             
 3   artist_name  object             
 4   track_id     object             
 5   track_name   object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 874.3+ MB
