# Data Understanding - Stats

## Setup

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
df = pd.read_csv("/content/Spotify_Youtube_Sample.csv")
df.head()

Unnamed: 0,Artist,Track,Album,Album_type,Views,Likes,Comments,Licensed,official_video,Stream
0,Gorillaz,Feel Good Inc.,Demon Days,album,693555221.0,6220896.0,169907.0,True,True,1040235000.0
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,72011645.0,1079128.0,31003.0,True,True,310083700.0
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,8435055.0,282142.0,7399.0,True,True,63063470.0
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,211754952.0,1788577.0,55229.0,True,True,434663600.0
4,Gorillaz,Clint Eastwood,Gorillaz,album,618480958.0,6197318.0,155930.0,True,True,617259700.0


## General Idea

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Artist          20718 non-null  object 
 1   Track           20718 non-null  object 
 2   Album           20718 non-null  object 
 3   Album_type      20718 non-null  object 
 4   Views           20248 non-null  float64
 5   Likes           20177 non-null  float64
 6   Comments        20149 non-null  float64
 7   Licensed        20248 non-null  object 
 8   official_video  20248 non-null  object 
 9   Stream          20142 non-null  float64
dtypes: float64(4), object(6)
memory usage: 1.6+ MB


In [4]:
df.describe()

Unnamed: 0,Views,Likes,Comments,Stream
count,20248.0,20177.0,20149.0,20142.0
mean,93937820.0,663341.1,27518.99,135942200.0
std,274644300.0,1789324.0,193234.7,244132100.0
min,0.0,0.0,0.0,6574.0
25%,1826002.0,21581.0,509.0,17674860.0
50%,14501100.0,124481.0,3277.0,49682980.0
75%,70399750.0,522148.0,14360.0,138358100.0
max,8079649000.0,50788650.0,16083140.0,3386520000.0


## No-Numerical Attributes

In [5]:
df['Artist'].value_counts()

Gorillaz                  10
Die drei !!!              10
Hollywood Undead          10
Empire of the Sun         10
White Noise for Babies    10
                          ..
NewJeans                   6
Alfonso Herrera            6
Jimin                      3
Stars Music Chile          1
Bootie Brown               1
Name: Artist, Length: 2079, dtype: int64

In [6]:
df['Artist'].unique()

array(['Gorillaz', 'Red Hot Chili Peppers', '50 Cent', ..., 'LE SSERAFIM',
       'ThxSoMch', 'SICK LEGEND'], dtype=object)

In [7]:
df['Artist'].nunique()

2079

In [8]:
nonnumericalcols = ['Artist', 'Track', 'Album', 'Album_type', 'Licensed', 'official_video']
df[nonnumericalcols].nunique()

Artist             2079
Track             17841
Album             11937
Album_type            3
Licensed              2
official_video        2
dtype: int64

### Categorical Attributes

In [9]:
album_type = pd.DataFrame({'Album_type' : df['Album_type'].value_counts()})
album_type

Unnamed: 0,Album_type
album,14926
single,5004
compilation,788


In [10]:
licensed = pd.DataFrame({'Licensed' : df['Licensed'].value_counts()})
licensed

Unnamed: 0,Licensed
True,14140
False,6108


In [11]:
official_video = pd.DataFrame({'official_video' : df['official_video'].value_counts()})
official_video

Unnamed: 0,official_video
True,15723
False,4525


## Numerical Attributes

### Central Tendency
> min, max, median, mode, midrange

In [12]:
col = 'Views'
min = df[col].min()
max = df[col].max()
median = df[col].median()
mode = df[col].mode()[0]
midrange = (max - min)/2
print('col:',col, 
      '\n\tmin:', min, 
      'max:',max, 
      'median:', median, 
      'mode:', mode, 
      'midrange:', midrange)

col: Views 
	min: 0.0 max: 8079649362.0 median: 14501095.0 mode: 6639.0 midrange: 4039824681.0


In [13]:
def getCentralTendency(col):
    min = df[col].min()
    max = df[col].max()
    median = df[col].median()
    mode = df[col].mode()[0]
    midrange = (max - min)/2
    print('col:',col, 
      '\n\tmin:', min, 
      'max:',max, 
      'median:', median, 
      'mode:', mode, 
      'midrange:', midrange)

numericalcols = ['Views', 'Likes', 'Comments', 'Stream']

for col in numericalcols:
    getCentralTendency(col)

col: Views 
	min: 0.0 max: 8079649362.0 median: 14501095.0 mode: 6639.0 midrange: 4039824681.0
col: Likes 
	min: 0.0 max: 50788652.0 median: 124481.0 mode: 0.0 midrange: 25394326.0
col: Comments 
	min: 0.0 max: 16083138.0 median: 3277.0 mode: 0.0 midrange: 8041569.0
col: Stream 
	min: 6574.0 max: 3386520288.0 median: 49682981.5 mode: 169769959.0 midrange: 1693256857.0


### Dispersion
> range, quantiles, var, std

In [14]:
col = 'Views'
range = df[col].max() - df[col].min()
quantiles = df[col].quantile([0.25, 0.5, 0.75])
IQR = quantiles[0.75] - quantiles[0.25]
var = df[col].var()
std = df[col].std()

print('col:',col, 
      '\n\trange:', range, 
      'Q1:',quantiles[0.25], 
      'Q2:', quantiles[0.5], 
      'Q3:', quantiles[0.75], 
      'IQR:', IQR,
      'var:', var,
      'std:', std)

col: Views 
	range: 8079649362.0 Q1: 1826001.5 Q2: 14501095.0 Q3: 70399749.0 IQR: 68573747.5 var: 7.542950360937822e+16 std: 274644322.0046215


In [15]:
def getDispersion(col):
    range = df[col].max() - df[col].min()
    quantiles = df[col].quantile([0.25, 0.5, 0.75])
    IQR = quantiles[0.75] - quantiles[0.25]
    var = df[col].var()
    std = df[col].std()
    print('col:',col, 
      '\n\trange:', range, 
      'Q1:',quantiles[0.25], 
      'Q2:', quantiles[0.5], 
      'Q3:', quantiles[0.75], 
      'IQR:', IQR,
      'var:', var,
      'std:', std)
numericalcols = ['Views', 'Likes', 'Comments', 'Stream']

for col in numericalcols:
    getDispersion(col)

col: Views 
	range: 8079649362.0 Q1: 1826001.5 Q2: 14501095.0 Q3: 70399749.0 IQR: 68573747.5 var: 7.542950360937822e+16 std: 274644322.0046215
col: Likes 
	range: 50788652.0 Q1: 21581.0 Q2: 124481.0 Q3: 522148.0 IQR: 500567.0 var: 3201681265274.244 std: 1789324.2482217257
col: Comments 
	range: 16083138.0 Q1: 509.0 Q2: 3277.0 Q3: 14360.0 IQR: 13851.0 var: 37339645168.43132 std: 193234.68935062183
col: Stream 
	range: 3386513714.0 Q1: 17674864.25 Q2: 49682981.5 Q3: 138358065.25 IQR: 120683201.0 var: 5.960047142258919e+16 std: 244132077.82384762


### Correlation

In [16]:
df[numericalcols].corr()

Unnamed: 0,Views,Likes,Comments,Stream
Views,1.0,0.891101,0.431185,0.601905
Likes,0.891101,1.0,0.63167,0.654247
Comments,0.431185,0.63167,1.0,0.267737
Stream,0.601905,0.654247,0.267737,1.0
