# Anomaly Detection (Unsupervised)

## Isolation Forest

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14
plt.style.use("fivethirtyeight")
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
raw_data = pd.read_csv('../data/spotify_data.csv')
data = raw_data.drop(labels=['artists', 'id', 'name', 'release_date'], axis=1)

In [6]:
raw_data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
X = StandardScaler().fit_transform(data)

In [5]:
from sklearn.ensemble import IsolationForest

In [7]:
iso = IsolationForest(n_estimators=50, contamination=0.1)

In [8]:
iso.fit(X)

IsolationForest(contamination=0.1, n_estimators=50)

In [9]:
raw_data['iso_scores'] = iso.decision_function(X)
raw_data['iso_anomaly'] = iso.predict(X)

In [10]:
anomalies = raw_data[raw_data.iso_anomaly == -1]

In [11]:
anomalies.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,iso_scores,iso_anomaly
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,-0.058588,-1
11,0.578,1921,0.994,['Ignacio Corsini'],0.378,155413,0.115,0,0F30WM8qRpO8kdolepZqdM,0.906,10,0.11,-27.039,0,Por Que Me Dejaste - Remasterizado,0,1921-03-20,0.0414,70.37,-0.009415,-1
14,0.493,1921,0.0175,['Zay Gatsby'],0.527,205072,0.691,1,0MJZ4hh60zwsYleWWxT5yW,0.384,7,0.358,-7.298,1,Power Is Power,0,1921-03-27,0.0326,159.935,-0.023798,-1
18,0.0778,1921,0.148,['THE GUY'],0.604,204957,0.418,1,0QQmUf4aPFaN9U2yRko595,0.0382,4,0.102,-11.566,0,When We Die,0,1921-09-11,0.0417,80.073,-0.017133,-1
32,0.185,1921,0.505,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.233,686664,0.00817,0,0yxeAe7MlS0UWZVdNZVZWw,0.000203,8,0.109,-37.311,1,Raja Manggala,5,1921,0.0305,108.143,-0.029599,-1


In [12]:
anomalies.sort_values(by=['iso_scores'], ascending=True).head(20)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,iso_scores,iso_anomaly
37714,0.0,2015,0.00948,"['Sleep Baby Sleep', 'Ocean Sounds', 'Ocean Wa...",0.0,93452,0.000252,0,0tIkLggEJ9QriD7KFz1GGA,0.877,9,0.806,-17.746,0,Beach Waves and Soothing Brown Noise,62,2015-01-06,0.0,0.0,-0.146422,-1
141112,0.611,1940,0.441,['Эрнест Хемингуэй'],0.633,115800,0.0894,1,47egG37AkJJcjftdg7aO2k,0.0,11,0.808,-21.55,0,Часть 39.2 - По ком звонит колокол,1,1940,0.964,167.38,-0.13935,-1
57170,0.0,2019,0.0926,['Microdynamic Recordings'],0.0,143583,0.00216,0,1RXXLvEKXmz2c6TG1EDUz2,0.982,10,0.821,-22.55,1,Calm Pour,70,2019-10-07,0.0,0.0,-0.13794,-1
141207,0.691,1940,0.308,['Эрнест Хемингуэй'],0.606,101900,0.0925,1,4QgbPvap5nzdemVdW1haJW,0.0,11,0.567,-25.508,0,Часть 72.2 - По ком звонит колокол,0,1940,0.962,192.914,-0.134827,-1
149308,0.0475,1988,6e-06,['Metallica'],0.176,660760,0.982,1,3AxLlRyXcDcpQN6rp1bpPB,0.574,9,0.994,-6.312,1,...And Justice for All (Live - Seattle '89),25,1988-09-07,0.215,173.801,-0.134482,-1
126922,0.177,1950,0.994,['President Franklin Delano Roosevelt'],0.669,1601620,0.677,0,1OOOjuRfvzeTIC0t0KJ4ln,0.886,11,0.813,-14.981,1,Fireside Chat #6 - On Government and Capitalis...,0,1950,0.926,102.354,-0.133982,-1
59199,0.735,1940,0.31,['Эрнест Хемингуэй'],0.626,118700,0.0861,1,19sEYV7TdBAXUsRYJpkABn,0.0,11,0.693,-17.933,0,Часть 95.3 - По ком звонит колокол,0,1940,0.953,176.349,-0.133624,-1
55691,0.00812,2012,0.0596,['Nature Sounds'],0.105,570203,0.951,0,34n0Ecmkfdxlejb6R3VzLL,0.91,11,0.882,-24.641,0,Healthful Rolling Thunder with Distant Rumbles,57,2012-09-25,0.0623,91.487,-0.132038,-1
141116,0.413,1940,0.347,['Эрнест Хемингуэй'],0.606,95200,0.0859,1,47tKj0XXDxozc8oIeIOAQm,0.0,11,0.544,-23.009,0,Часть 173.3 - По ком звонит колокол,0,1940,0.958,171.546,-0.131283,-1
59235,0.726,1940,0.121,['Эрнест Хемингуэй'],0.626,118770,0.104,1,1GU3vbEffUvB3QN97eBwC9,0.0,11,0.659,-20.065,0,Часть 69.4 & Часть 70.1 - По ком звонит колокол,0,1940,0.96,169.529,-0.130487,-1


In [13]:
from sklearn.neighbors import LocalOutlierFactor

In [14]:
lof = LocalOutlierFactor(n_neighbors=50, contamination=0.1)
lof.fit(X)

LocalOutlierFactor(contamination=0.1, n_neighbors=50)

In [15]:
scores = lof.negative_outlier_factor_

In [16]:
raw_data['LOF_scores'] = scores

In [17]:
top_lof = raw_data.sort_values(by=['LOF_scores'], ascending=True).head(100)

In [18]:
top_lof.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,iso_scores,iso_anomaly,LOF_scores
107341,0.0,2013,0.111,['Sound Dreamer'],0.0,5403500,9.9e-05,0,7foc25ig7dibxvULPU2kBG,0.392,2,0.137,-21.669,1,Brown Noise - 90 Minutes,50,2013-06-05,0.0,0.0,-0.102767,-1,-3.06574
4549,0.214,1944,0.879,['NNN'],0.421,37571,0.0034,1,0GUAMAp9HUmE7cl6MTJYYN,0.94,11,0.36,-32.341,1,FUCKMEFUCKMEFUCKME (.intro),0,1944-06-09,0.122,122.328,-0.050742,-1,-2.580929
110894,0.0788,1950,0.838,['Sundef'],0.662,216346,0.0245,1,0wU59kR7B1wCouDUHZu0ZK,0.833,1,0.108,-18.178,1,Intro,0,1950,0.0732,98.99,-0.020943,-1,-2.499313
88113,0.95,1999,0.275,['Eminem'],0.648,33227,0.199,1,3GhcNCBIEMyrcpCJ8ccvXr,0.0,1,0.136,-25.565,1,Public Service Announcement,45,1999-02-23,0.883,149.548,-0.090793,-1,-2.47884
121871,0.95,2005,0.253,['Kanye West'],0.731,31360,0.412,1,466XcSgGAiQxcDrcV0yoMP,0.0,11,0.164,-11.59,0,Skit #2,44,2005-08-30,0.946,66.512,-0.099761,-1,-2.454861


In [19]:
top_iso = anomalies.sort_values(by=['iso_scores'], ascending=True).head(100)

In [20]:
lofs = top_lof.name.to_list()
isos = top_iso.name.to_list()

In [21]:
list(set(lofs).intersection(isos))

['Have 2 Charge Now, Vol. 4']