# Discretization

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Spotify/YouTube sample from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="Spotify_Youtube_Sample.csv")
df.head(n=5)

Unnamed: 0,Artist,Track,Album,Album_type,Views,Likes,Comments,Licensed,official_video,Stream
0,Gorillaz,Feel Good Inc.,Demon Days,album,693555221.0,6220896.0,169907.0,True,True,1040235000.0
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,72011645.0,1079128.0,31003.0,True,True,310083700.0
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,8435055.0,282142.0,7399.0,True,True,63063470.0
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,211754952.0,1788577.0,55229.0,True,True,434663600.0
4,Gorillaz,Clint Eastwood,Gorillaz,album,618480958.0,6197318.0,155930.0,True,True,617259700.0


In [3]:
# Summary statistics of the numeric columns; the per-column counts differ,
# which shows that some values are missing.
df.describe()

Unnamed: 0,Views,Likes,Comments,Stream
count,20248.0,20177.0,20149.0,20142.0
mean,93937820.0,663341.1,27518.99,135942200.0
std,274644300.0,1789324.0,193234.7,244132100.0
min,0.0,0.0,0.0,6574.0
25%,1826002.0,21581.0,509.0,17674860.0
50%,14501100.0,124481.0,3277.0,49682980.0
75%,70399750.0,522148.0,14360.0,138358100.0
max,8079649000.0,50788650.0,16083140.0,3386520000.0


## Discretizing views

In [5]:
# Distribution of "Views"; the 50% row is the median used as the cut point below.
df["Views"].describe()

count    2.024800e+04
mean     9.393782e+07
std      2.746443e+08
min      0.000000e+00
25%      1.826002e+06
50%      1.450110e+07
75%      7.039975e+07
max      8.079649e+09
Name: Views, dtype: float64

In [9]:
# Split "Views" at its median (the 0.5 quantile): strictly below -> "low",
# at or above -> "high". Rows with a missing value match neither mask.
views = df["Views"]
views_median = views.quantile(0.5)

condition = [
    views < views_median,
    views >= views_median,
]
result = [
    "low",
    "high",
]

In [10]:
# Rows whose "Views" is missing satisfy neither condition and receive the
# default label. The default is passed explicitly as the string "0" (the
# placeholder the implicit integer default used to produce) so that every
# label is a string: NumPy >= 2 raises TypeError when string choices are
# combined with the implicit integer default.
df["Views_state"] = np.select(condition, result, default="0")

In [12]:
# Inspect the new label column.
df["Views_state"]

0        high
1        high
2         low
3        high
4        high
         ... 
20713     low
20714     low
20715     low
20716     low
20717     low
Name: Views_state, Length: 20718, dtype: object

In [13]:
# Count rows per label to check the split and spot unexpected labels.
df["Views_state"].value_counts()

Views_state
high    10125
low     10123
0         470
Name: count, dtype: int64

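The `0` labels come from rows where `Views` is NaN: both comparisons are False for NaN, so `np.select` falls back to its default value `0`. There are 20718 − 20248 = 470 such rows. Dropping the rows with missing values removes them (note that `dropna()` drops rows with a missing value in *any* column, not only `Views`):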

In [14]:
# Drop, in place, every row that has a missing value in at least one column.
df.dropna(axis=0, how="any", inplace=True)

In [15]:
# Re-count the labels after dropping the rows with missing values.
df["Views_state"].value_counts()

Views_state
high    9852
low     9698
Name: count, dtype: int64

## Discretizing Comments and Likes

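Splitting `Likes` and `Comments` each at its median gives the four attributes below, which are then combined into four (likes, comments) categories: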
1. Highly commented 
2. Highly liked
3. Weakly commented
4. Weakly liked

In [16]:
# Median split of "Likes" and "Comments"; each median is computed once and
# the four masks enumerate every (likes, comments) combination.
likes = df["Likes"]
comments = df["Comments"]
likes_median = likes.quantile(0.5)
comments_median = comments.quantile(0.5)

likes_high = likes >= likes_median
likes_low = likes < likes_median
comments_high = comments >= comments_median
comments_low = comments < comments_median

condition = [
    likes_high & comments_high,
    likes_low & comments_low,
    likes_high & comments_low,
    likes_low & comments_high,
]

In [17]:
# Labels in the same order as the entries of `condition`. The first word
# refers to "Likes" and the second to "Comments" (note: this is the reverse
# of the order suggested by the "comments_likes" column name).
result = [
    "high and high",  # likes >= median, comments >= median
    "low and low",    # likes <  median, comments <  median
    "high and low",   # likes >= median, comments <  median
    "low and high",   # likes <  median, comments >= median
]

In [18]:
# The four conditions cover every row whose "Likes" and "Comments" are both
# present, so the default is only a fallback for rows with missing values.
# It is given explicitly as the string "0" (what the implicit integer default
# used to produce) because NumPy >= 2 raises TypeError when string choices
# are combined with the implicit integer default.
df["comments_likes"] = np.select(condition, result, default="0")

In [19]:
# Inspect the combined likes/comments label column.
df["comments_likes"]

0        high and high
1        high and high
2        high and high
3        high and high
4        high and high
             ...      
20713      low and low
20714      low and low
20715      low and low
20716      low and low
20717      low and low
Name: comments_likes, Length: 19550, dtype: object

In [20]:
# Count rows per combined label.
df["comments_likes"].value_counts()

comments_likes
high and high    8933
low and low      8933
high and low      842
low and high      842
Name: count, dtype: int64

## Discretizing Stream

In [23]:
# Distribution of "Stream"; the 25% and 75% rows are the cut points used below.
df["Stream"].describe()

count    1.955000e+04
mean     1.371088e+08
std      2.463527e+08
min      6.574000e+03
25%      1.781149e+07
50%      4.979139e+07
75%      1.390790e+08
max      3.386520e+09
Name: Stream, dtype: float64

In [24]:
def discretizing_stream(value, lower=None, upper=None):
    """Label a stream count as "low", "medium" or "high".

    Parameters
    ----------
    value : float
        Stream count to classify.
    lower : float, optional
        Values strictly below this are "low". When omitted, the 0.25
        quantile of the global ``df["Stream"]`` is used.
    upper : float, optional
        Values strictly above this are "high". When omitted, the 0.75
        quantile of the global ``df["Stream"]`` is used.

    Returns
    -------
    str
        "low", "high" or "medium". Values equal to either cut point are
        "medium". A NaN value compares False against both cut points and is
        therefore also labelled "medium".

    Notes
    -----
    With the defaults, the quantiles are recomputed on every call. When
    labelling many values, compute the cut points once and pass them in.
    """
    if lower is None:
        lower = df["Stream"].quantile(0.25)
    if value < lower:
        return "low"

    # The upper cut point is resolved only when it is actually needed.
    if upper is None:
        upper = df["Stream"].quantile(0.75)
    if value > upper:
        return "high"

    return "medium"

In [25]:
# Vectorised equivalent of applying discretizing_stream to every row: the
# quartiles are computed once instead of once per row. Values strictly below
# the 0.25 quantile are "low", values strictly above the 0.75 quantile are
# "high", and everything else is "medium".
stream = df["Stream"]
stream_q1, stream_q3 = stream.quantile([0.25, 0.75])
df["stream_state"] = np.select(
    [stream < stream_q1, stream > stream_q3],
    ["low", "high"],
    default="medium",
)

In [26]:
# Inspect the new stream label column.
df["stream_state"]

0          high
1          high
2        medium
3          high
4          high
          ...  
20713       low
20714       low
20715       low
20716       low
20717       low
Name: stream_state, Length: 19550, dtype: object

In [27]:
# Count rows per stream label.
df["stream_state"].value_counts()

stream_state
medium    9774
high      4888
low       4888
Name: count, dtype: int64

## Saving the data

In [28]:
# Display the frame with the three new label columns before saving.
df

Unnamed: 0,Artist,Track,Album,Album_type,Views,Likes,Comments,Licensed,official_video,Stream,Views_state,comments_likes,stream_state
0,Gorillaz,Feel Good Inc.,Demon Days,album,693555221.0,6220896.0,169907.0,True,True,1.040235e+09,high,high and high,high
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,72011645.0,1079128.0,31003.0,True,True,3.100837e+08,high,high and high,high
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,8435055.0,282142.0,7399.0,True,True,6.306347e+07,low,high and high,medium
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,211754952.0,1788577.0,55229.0,True,True,4.346636e+08,high,high and high,high
4,Gorillaz,Clint Eastwood,Gorillaz,album,618480958.0,6197318.0,155930.0,True,True,6.172597e+08,high,high and high,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20713,SICK LEGEND,JUST DANCE HARDSTYLE,JUST DANCE HARDSTYLE,single,71678.0,1113.0,0.0,True,True,9.227144e+06,low,low and low,low
20714,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE,SET FIRE TO THE RAIN HARDSTYLE,single,164741.0,2019.0,0.0,True,True,1.089818e+07,low,low and low,low
20715,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP,OUTSIDE HARDSTYLE SPED UP,single,35646.0,329.0,0.0,True,True,6.226110e+06,low,low and low,low
20716,SICK LEGEND,ONLY GIRL HARDSTYLE,ONLY GIRL HARDSTYLE,single,6533.0,88.0,0.0,True,True,6.873961e+06,low,low and low,low


In [32]:
# Write the labelled frame to disk without the row index.
df.to_csv(path_or_buf="Spotify_Youtube_new.csv", index=False)

In [33]:
# Read the saved file back to check the round trip.
df_new = pd.read_csv(filepath_or_buffer="Spotify_Youtube_new.csv")

In [34]:
# Display the reloaded frame; its row index is renumbered because the file
# was written without the index.
df_new

Unnamed: 0,Artist,Track,Album,Album_type,Views,Likes,Comments,Licensed,official_video,Stream,Views_state,comments_likes,stream_state
0,Gorillaz,Feel Good Inc.,Demon Days,album,693555221.0,6220896.0,169907.0,True,True,1.040235e+09,high,high and high,high
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,72011645.0,1079128.0,31003.0,True,True,3.100837e+08,high,high and high,high
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,8435055.0,282142.0,7399.0,True,True,6.306347e+07,low,high and high,medium
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,211754952.0,1788577.0,55229.0,True,True,4.346636e+08,high,high and high,high
4,Gorillaz,Clint Eastwood,Gorillaz,album,618480958.0,6197318.0,155930.0,True,True,6.172597e+08,high,high and high,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19545,SICK LEGEND,JUST DANCE HARDSTYLE,JUST DANCE HARDSTYLE,single,71678.0,1113.0,0.0,True,True,9.227144e+06,low,low and low,low
19546,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE,SET FIRE TO THE RAIN HARDSTYLE,single,164741.0,2019.0,0.0,True,True,1.089818e+07,low,low and low,low
19547,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP,OUTSIDE HARDSTYLE SPED UP,single,35646.0,329.0,0.0,True,True,6.226110e+06,low,low and low,low
19548,SICK LEGEND,ONLY GIRL HARDSTYLE,ONLY GIRL HARDSTYLE,single,6533.0,88.0,0.0,True,True,6.873961e+06,low,low and low,low
