### 1. Data Exploration and Preprocessing
- Combine all five datasets into a new, labeled one

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read all data files
songs_2019 = pd.read_csv('./Data/TikTok_songs_2019.csv')
songs_2020 = pd.read_csv('./Data/TikTok_songs_2020.csv')
songs_2021 = pd.read_csv('./Data/TikTok_songs_2021.csv')
songs_2022 = pd.read_csv('./Data/TikTok_songs_2022.csv')
unpopular_songs = pd.read_csv('./Data/unpopular_songs.csv')


In [5]:
unpopular_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,explicit,popularity,track_name,track_artist,track_id
0,0.53,0.77,4,-6.633,0,0.0389,0.284,0.501,0.744,0.623,120.144,225696,False,2,No Regrets,James Reeder,6f2c4a9lNx8aowZJngv7cJ
1,0.565,0.73,1,-6.063,1,0.073,0.365,0.0,0.237,0.511,130.026,158093,False,2,Wild Life,James Reeder,3fTs52jsDzSuVLsifxNKO8
2,0.427,0.546,4,-8.727,1,0.0849,0.539,0.0152,0.368,0.435,78.345,167262,False,2,Fangs,James Reeder,6NPafqavrv0icaIHMQnXDy
3,0.421,0.531,7,-5.516,1,0.0262,0.706,0.000208,0.11,0.383,85.08,236832,False,2,Afterburner,James Reeder,3vGmhxveURgmlZStvo0uc1
4,0.537,0.804,8,-7.378,0,0.157,0.379,0.000489,0.323,0.543,139.95,239400,False,2,Hellfire Rising,James Reeder,4O2qRbfCHzMMgfbw9DBdGf


In [6]:
unpopular_songs.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'explicit', 'popularity', 'track_name', 'track_artist',
       'track_id'],
      dtype='object')

In [7]:
# Confirm that all of the tiktok datasets have the same features
list(songs_2019.columns) == list(songs_2020.columns) == list(songs_2021.columns) == list(songs_2022.columns)

True

In [8]:
# Rename a column from the unpopular songs dataset so that everything matches with the TikTok song datasets
unpopular_songs.rename(columns = {'track_artist':'artist_name'}, inplace = True)

In [9]:
unpopular_songs.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'explicit', 'popularity', 'track_name', 'artist_name',
       'track_id'],
      dtype='object')

In [10]:
# Label both the unpopular songs and the popular TikTok songs
unpopular_songs['is_popular'] = False
songs_2019["is_popular"] = True
songs_2020["is_popular"] = True
songs_2021["is_popular"] = True
songs_2022["is_popular"] = True

In [11]:
# Ranually columnhe right columns
cols_of_interest = ['track_name', 'artist_name', 'danceability', 'energy', 
       'loudness', 'mode', 'key', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo','duration_ms', 'is_popular']

# Concatenate/combine all 5 datasets into one
combined_df = pd.concat([songs_2019[cols_of_interest], songs_2020[cols_of_interest], songs_2021[cols_of_interest], 
       songs_2022[cols_of_interest], unpopular_songs[cols_of_interest]])

# Concatenate / combine all 5 datasets into one
combined_df

Unnamed: 0,track_name,artist_name,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,is_popular
0,Shake It,Metro Station,0.618,0.9550,-3.836,1,4,0.0798,0.00221,0.000003,0.486,0.7900,150.034,179947,True
1,Chinese New Year,SALES,0.744,0.8450,-7.422,0,4,0.2530,0.75900,0.232000,0.100,0.7490,75.221,160000,True
2,Baby I'm Yours,Breakbot,0.829,0.7920,-3.755,0,2,0.0668,0.72600,0.000006,0.122,0.7580,118.050,215507,True
3,The Git Up,Blanco Brown,0.847,0.6780,-8.635,1,9,0.1090,0.06690,0.000000,0.274,0.8110,97.984,200594,True
4,Say Hey (I Love You),Michael Franti & Spearhead,0.738,0.9830,-4.374,0,5,0.0855,0.03800,0.000006,0.183,0.9570,92.998,235760,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10872,Bugs n Toes,"Good Girl, Bad Boy",0.686,0.1320,-20.363,1,0,0.0339,0.26800,0.001050,0.181,0.4910,95.987,285286,False
10873,Scattered Thoughts,"Good Girl, Bad Boy",0.532,0.0573,-20.476,0,11,0.0479,0.90000,0.087700,0.111,0.0731,114.123,360729,False
10874,Travelin Bug,"Good Girl, Bad Boy",0.589,0.0308,-27.237,1,3,0.0405,0.94300,0.014200,0.106,0.5040,83.952,286585,False
10875,Goodnight,"Good Girl, Bad Boy",0.669,0.0381,-18.631,1,7,0.0382,0.78300,0.000000,0.182,0.1730,99.841,349454,False


In [12]:
# Convert and save to csv
combined_df.to_csv('./Data/combined_songs_dataset.csv', index=False)

In [13]:
# Check for duplicate data
combined_df[combined_df.duplicated()]

Unnamed: 0,track_name,artist_name,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,is_popular
187,It's You,Ali Gatie,0.7320,0.46300,-6.972,0,11,0.0287,0.37400,0.000000,0.1940,0.39700,95.971,212607,True
0,Say So,Doja Cat,0.7870,0.67300,-4.583,0,11,0.1590,0.26400,0.000003,0.0904,0.77900,110.962,237893,True
17,Buttercup,Jack Stauber,0.7050,0.37300,-9.066,1,7,0.0384,0.72300,0.810000,0.2890,0.55100,120.046,208026,True
18,Dissolve,Absofacto,0.6880,0.58200,-10.668,0,6,0.0542,0.23000,0.000157,0.0663,0.87200,85.486,224661,True
20,Airplane Mode,Limbo,0.6650,0.37300,-10.631,1,7,0.1580,0.85200,0.118000,0.1080,0.07000,119.905,164113,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9437,Whispering Waves,Sleep Waves,0.2050,0.00801,-23.200,1,0,0.0511,0.62800,0.869000,0.1220,0.03110,128.145,179878,False
9439,Steady River Flow,Water Soundscapes,0.1060,0.00214,-21.715,1,1,0.0526,0.12200,0.287000,0.1120,0.02690,93.350,199647,False
9440,Meandering River,Ocean Waves For Sleep,0.1560,1.00000,-20.650,1,7,0.0585,0.84800,0.380000,0.9450,0.00001,108.253,244413,False
9441,Tanquil Lake Shores,Ocean Waves For Sleep,0.0786,0.07260,-21.645,1,10,0.0508,0.06480,0.681000,0.0888,0.02070,176.989,153030,False


In [14]:
# Some duplicates are caused by the same song in different albums (be dropped)
songs_2019[songs_2019['track_name'] == "It's You"]

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,is_popular
73,It's You,Ali Gatie,70,YOU,79,0.732,0.463,-6.972,0,11,0.0287,0.374,0.0,0.194,0.397,95.971,4,212607,True
187,It's You,Ali Gatie,70,It's You,5,0.732,0.463,-6.972,0,11,0.0287,0.374,0.0,0.194,0.397,95.971,4,212607,True


In [15]:
# Duplicates are caused by different IDs in the unpopular songs dataset (can also be dropped)
combined_df[combined_df['track_name'] == 'Whispering Waves']

Unnamed: 0,track_name,artist_name,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,is_popular
2883,Whispering Waves,Sleep Waves,0.205,0.00801,-23.2,1,0,0.0511,0.628,0.869,0.122,0.0311,128.145,179878,False
9437,Whispering Waves,Sleep Waves,0.205,0.00801,-23.2,1,0,0.0511,0.628,0.869,0.122,0.0311,128.145,179878,False


In [16]:
# Remove the duplicates - they can be dropped
combined_df.drop_duplicates()


Unnamed: 0,track_name,artist_name,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,is_popular
0,Shake It,Metro Station,0.618,0.9550,-3.836,1,4,0.0798,0.00221,0.000003,0.486,0.7900,150.034,179947,True
1,Chinese New Year,SALES,0.744,0.8450,-7.422,0,4,0.2530,0.75900,0.232000,0.100,0.7490,75.221,160000,True
2,Baby I'm Yours,Breakbot,0.829,0.7920,-3.755,0,2,0.0668,0.72600,0.000006,0.122,0.7580,118.050,215507,True
3,The Git Up,Blanco Brown,0.847,0.6780,-8.635,1,9,0.1090,0.06690,0.000000,0.274,0.8110,97.984,200594,True
4,Say Hey (I Love You),Michael Franti & Spearhead,0.738,0.9830,-4.374,0,5,0.0855,0.03800,0.000006,0.183,0.9570,92.998,235760,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10872,Bugs n Toes,"Good Girl, Bad Boy",0.686,0.1320,-20.363,1,0,0.0339,0.26800,0.001050,0.181,0.4910,95.987,285286,False
10873,Scattered Thoughts,"Good Girl, Bad Boy",0.532,0.0573,-20.476,0,11,0.0479,0.90000,0.087700,0.111,0.0731,114.123,360729,False
10874,Travelin Bug,"Good Girl, Bad Boy",0.589,0.0308,-27.237,1,3,0.0405,0.94300,0.014200,0.106,0.5040,83.952,286585,False
10875,Goodnight,"Good Girl, Bad Boy",0.669,0.0381,-18.631,1,7,0.0382,0.78300,0.000000,0.182,0.1730,99.841,349454,False


### 2. Split data into training, validation, and testing

In [20]:
# Get feature columns (everything except for the label)
feature_cols = combined_df.columns[combined_df.columns != 'is_popular']
feature_cols

Index(['track_name', 'artist_name', 'danceability', 'energy', 'loudness',
       'mode', 'key', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms'],
      dtype='object')

In [22]:
X = combined_df[feature_cols]
y = combined_df['is_popular']

print(X.shape)
print(y.shape)

(11845, 14)
(11845,)


In [25]:
# split into training and testing datasets
# we will use a 60/20/20 split for training, validation, and testing
from sklearn.model_selection import train_test_split

X_training, X_final_test, y_training, y_final_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [26]:
# this is what we will train with
X_train, X_test, y_train, y_test = train_test_split(X_training, y_training, test_size=0.25, random_state=6)

### 2. KNN models
- Try running KNN models on our new dataset (we try to classify the data into "popular" and "unpopular")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
k = 3
knn = KNeighborsClassifier(n_neighbors=k) 

### Miscellaneous

In [None]:
#dont look
#df = pd.concat([songs_2019, songs_2020, songs_2021, songs_2022, unpopular_songs])
#df.head()
#songs_2019.columns
#hi soeur

In [None]:
songs_2019_csv.head()

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms
0,Running Up That Hill (A Deal With God),Kate Bush,81,Hounds Of Love,95,0.629,0.547,-13.123,0,10,0.055,0.72,0.00314,0.0604,0.197,108.375,4,298933
1,As It Was,Harry Styles,91,As It Was,96,0.52,0.731,-5.338,0,6,0.0557,0.342,0.00101,0.311,0.662,173.93,4,167303
2,Sunroof,Nicky Youre,73,Sunroof,44,0.768,0.716,-5.11,1,10,0.0404,0.35,0.0,0.15,0.841,131.43,4,163026
3,Heat Waves,Glass Animals,80,Dreamland (+ Bonus Levels),89,0.761,0.525,-6.9,1,11,0.0944,0.44,7e-06,0.0921,0.531,80.87,4,238805
4,About Damn Time,Lizzo,81,About Damn Time,92,0.836,0.743,-6.305,0,10,0.0656,0.0995,0.0,0.335,0.722,108.966,4,191822


In [None]:
newdf = unpopular_songs
newdf.drop(columns=)

SyntaxError: invalid syntax (2294370384.py, line 2)

In [None]:
combined_df.tail()
y = df['is_popular']

Unnamed: 0,track_name,artist_name,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,is_popular
10872,Bugs n Toes,"Good Girl, Bad Boy",0.686,0.132,-20.363,1,0,0.0339,0.268,0.00105,0.181,0.491,95.987,285286,False
10873,Scattered Thoughts,"Good Girl, Bad Boy",0.532,0.0573,-20.476,0,11,0.0479,0.9,0.0877,0.111,0.0731,114.123,360729,False
10874,Travelin Bug,"Good Girl, Bad Boy",0.589,0.0308,-27.237,1,3,0.0405,0.943,0.0142,0.106,0.504,83.952,286585,False
10875,Goodnight,"Good Girl, Bad Boy",0.669,0.0381,-18.631,1,7,0.0382,0.783,0.0,0.182,0.173,99.841,349454,False
10876,Paranoid,Talen Talea,0.818,0.146,-19.684,0,9,0.123,0.828,0.959,0.108,0.235,110.026,74180,False
