# Data Exploration

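An exploratory look at the `train.parquet` and `test.parquet` files: dataset shapes, column types, a sample of rows, missing values, basic statistics, categorical distributions, unique counts, and the time range covered by each split.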

In [1]:
import pandas as pd

# Read both splits from parquet files in the current working directory.
# `train` and `test` are used by every later cell in this notebook.
train = pd.read_parquet(path="train.parquet")
test = pd.read_parquet(path="test.parquet")

## Dataset Shapes

In [2]:
# Report row and column counts for each split (rows with thousands separators).
print(f"Train: {len(train):,} rows, {len(train.columns)} columns")
print(f"Test: {len(test):,} rows, {len(test.columns)} columns")

Train: 17,499,636 rows, 19 columns
Test: 4,393,179 rows, 19 columns


## Column Overview

In [3]:
# Show the dtype of every column in train, as loaded from the parquet file.
train.dtypes

status                    int64
gender                   object
firstName                object
level                    object
lastName                 object
userId                   object
ts                        int64
auth                     object
page                     object
sessionId                 int64
location                 object
itemInSession             int64
userAgent                object
method                   object
length                  float64
song                     object
artist                   object
time             datetime64[us]
registration     datetime64[us]
dtype: object

## Sample Data

In [4]:
# Preview the first 10 rows of train.
# NOTE(review): the rendered rows include firstName, lastName, location and
# userAgent values — confirm the data is safe to share before committing this
# notebook with its outputs.
train.head(10)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
0,200,M,Shlok,paid,Johnson,1749042,1538352001000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21
992,200,M,Shlok,paid,Johnson,1749042,1538352525000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21
1360,200,M,Shlok,paid,Johnson,1749042,1538352703000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21
1825,200,M,Shlok,paid,Johnson,1749042,1538352935000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21
2366,200,M,Shlok,paid,Johnson,1749042,1538353200000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21
3271,200,M,Shlok,paid,Johnson,1749042,1538353671000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",283,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,266.86649,Who Can Compare,Foolish Things,2018-10-01 00:27:51,2018-08-08 13:22:21
3802,200,M,Shlok,paid,Johnson,1749042,1538353937000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",284,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,1400.65914,Angel Dust,Gil Scott Heron,2018-10-01 00:32:17,2018-08-08 13:22:21
6585,200,M,Shlok,paid,Johnson,1749042,1538355337000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",285,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,186.98404,Sweet And Dandy,Toots & The Maytals,2018-10-01 00:55:37,2018-08-08 13:22:21
6675,200,M,Shlok,paid,Johnson,1749042,1538355388000,Logged In,Downgrade,22683,"Dallas-Fort Worth-Arlington, TX",286,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-01 00:56:28,2018-08-08 13:22:21
6961,200,M,Shlok,paid,Johnson,1749042,1538355523000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",287,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,306.05016,On The Moon,Peter Cincotti,2018-10-01 00:58:43,2018-08-08 13:22:21


## Missing Values

In [5]:
# Per-column null counts for train; display only the columns with at least one null.
missing_train = train.isna().sum()
missing_train.loc[missing_train.gt(0)]

length    3208203
song      3208203
artist    3208203
dtype: int64

In [6]:
# Per-column null counts for test; display only the columns with at least one null.
missing_test = test.isna().sum()
missing_test.loc[missing_test.gt(0)]

gender        653681
firstName     653681
lastName      653681
location      653681
userAgent     653681
length       1331368
song         1331368
artist       1331368
dtype: int64

## Basic Statistics

In [7]:
# Summary statistics for the numeric and datetime columns of train.
# Code-like / identifier integer columns (status, ts, sessionId) are included
# because of their dtype, so their mean and std rows are not meaningful quantities.
train.describe()

Unnamed: 0,status,ts,sessionId,itemInSession,length,time,registration
count,17499640.0,17499640.0,17499640.0,17499640.0,14291430.0,17499636,17499636
mean,209.1387,1540428000000.0,84802.94,105.5937,248.7135,2018-10-25 00:47:01.161927,2018-08-25 04:40:21.543066
min,200.0,1538352000000.0,1.0,0.0,0.522,2018-10-01 00:00:01,2017-10-14 22:05:25
25%,200.0,1539340000000.0,25159.0,26.0,199.8885,2018-10-12 10:33:57.750000,2018-08-10 21:14:59
50%,200.0,1540397000000.0,79038.0,66.0,234.0828,2018-10-24 15:58:54,2018-09-05 18:35:50
75%,200.0,1541500000000.0,138368.0,144.0,276.8714,2018-11-06 10:25:35,2018-09-20 17:24:57
max,404.0,1542672000000.0,207003.0,1426.0,3024.666,2018-11-20 00:00:00,2018-11-19 23:34:34
std,30.2305,1233485000.0,61414.27,116.8854,97.22845,,


## Categorical Columns

In [8]:
# Frequency of each page type in train, most common first.
page_column = train["page"]
page_column.value_counts()

page
NextSong                     14291433
Thumbs Up                      789391
Home                           645259
Add to Playlist                409606
Roll Advert                    284837
Add Friend                     262147
Logout                         204700
Thumbs Down                    164964
Downgrade                      124248
Settings                       101191
Help                            89035
Upgrade                         37696
About                           33117
Save Settings                   20370
Error                           17294
Submit Upgrade                  11381
Submit Downgrade                 4425
Cancellation Confirmation        4271
Cancel                           4271
Name: count, dtype: int64

In [9]:
# Number of train rows per subscription level.
level_column = train["level"]
level_column.value_counts()

level
paid    13506659
free     3992977
Name: count, dtype: int64

In [10]:
# Number of train rows per gender value (rows, not distinct users).
gender_column = train["gender"]
gender_column.value_counts()

gender
M    9191364
F    8308272
Name: count, dtype: int64

## Unique Counts

In [11]:
# Compute the distinct-value counts first, then report them with thousands separators.
n_unique_users_train = train["userId"].nunique()
n_unique_users_test = test["userId"].nunique()
n_unique_songs_train = train["song"].nunique()
n_unique_artists_train = train["artist"].nunique()
n_unique_sessions_train = train["sessionId"].nunique()

print(f"Unique users (train): {n_unique_users_train:,}")
print(f"Unique users (test): {n_unique_users_test:,}")
print(f"Unique songs (train): {n_unique_songs_train:,}")
print(f"Unique artists (train): {n_unique_artists_train:,}")
print(f"Unique sessions (train): {n_unique_sessions_train:,}")

Unique users (train): 19,140
Unique users (test): 2,904
Unique songs (train): 239,299
Unique artists (train): 37,264
Unique sessions (train): 161,194


## Time Range

In [12]:
# Earliest and latest event timestamps in each split.
train_time_start, train_time_end = train["time"].min(), train["time"].max()
test_time_start, test_time_end = test["time"].min(), test["time"].max()

print(f"Train time range: {train_time_start} to {train_time_end}")
print(f"Test time range: {test_time_start} to {test_time_end}")

Train time range: 2018-10-01 00:00:01 to 2018-11-20 00:00:00
Test time range: 2018-10-01 00:00:06 to 2018-11-20 00:00:00
