In [1]:
import pandas as pd

# Load the data using Pandas
data = pd.read_csv('spotifytoptracks.csv')

In [2]:
# Perform data cleaning

# Handling missing values
data.dropna(inplace=True)

# Removing duplicate samples and features
data.drop_duplicates(inplace=True)

# Treating the outliers (if necessary)

In [3]:
# How many observations are there in this dataset?
num_observations = len(data)
print("Number of observations:", num_observations)

Number of observations: 50


In [4]:
# How many features this dataset has?
num_features = len(data.columns)
print("Number of features:", num_features)

Number of features: 17


In [5]:
# Identify categorical and numeric features
categorical_features = ['artist', 'album', 'track_name', 'track_id', 'genre']
numeric_features = ['energy', 'danceability', 'key', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

In [32]:
# Are there any artists that have more than 1 popular track? If yes, which and how many?
popular_artists = data['artist'].value_counts()
popular_artists = popular_artists[popular_artists > 1]
num_popular_artists = len(popular_artists)
print("Number of artists with more than 1 popular track:", num_popular_artists)
print("Popular artists with their track count:")
print(popular_artists)

Number of artists with more than 1 popular track: 7
Popular artists with their track count:
artist
Billie Eilish    3
Dua Lipa         3
Travis Scott     3
Justin Bieber    2
Harry Styles     2
Lewis Capaldi    2
Post Malone      2
Name: count, dtype: int64


In [7]:
# Who was the most popular artist?
most_popular_count = popular_artists.max()
most_popular_artists = popular_artists[popular_artists == most_popular_count].index.tolist()

if len(most_popular_artists) == 1:
    most_popular_artist = most_popular_artists[0]
    print("Most popular artist:", most_popular_artist)
else:
    print("Most popular artist:")
    for artist in most_popular_artists:
        print(artist)

Most popular artist:
Billie Eilish
Dua Lipa
Travis Scott


In [8]:
# How many artists in total have their songs in the top 50?
num_artists_in_top_50 = len(data['artist'].unique())
print("Number of artists in total in the top 50:", num_artists_in_top_50)

Number of artists in total in the top 50: 40


In [9]:
# Are there any albums that have more than 1 popular track? If yes, which and how many?
popular_albums = data['album'].value_counts()
popular_albums = popular_albums[popular_albums > 1]
num_popular_albums = len(popular_albums)
print("Number of albums with more than 1 popular track:", num_popular_albums)
print("Popular albums with their track count:")
print(popular_albums)

Number of albums with more than 1 popular track: 4
Popular albums with their track count:
album
Future Nostalgia        3
Hollywood's Bleeding    2
Fine Line               2
Changes                 2
Name: count, dtype: int64


In [10]:
# How many albums in total have their songs in the top 50?
num_albums_in_top_50 = len(data['album'].unique())
print("Number of albums in total in the top 50:", num_albums_in_top_50)

Number of albums in total in the top 50: 45


In [11]:
# Which tracks have a danceability score above 0.7?
tracks_above_07_danceability = data[data['danceability'] > 0.7]['track_name']
print("Tracks with danceability score above 0.7:")
print(tracks_above_07_danceability)

Tracks with danceability score above 0.7:
1                                      Dance Monkey
2                                           The Box
3                             Roses - Imanbek Remix
4                                   Don't Start Now
5                      ROCKSTAR (feat. Roddy Ricch)
7                  death bed (coffee for your head)
8                                           Falling
10                                             Tusa
13                                  Blueberry Faygo
14                         Intentions (feat. Quavo)
15                                     Toosie Slide
17                                           Say So
18                                         Memories
19                       Life Is Good (feat. Drake)
20                 Savage Love (Laxed - Siren Beat)
22                                      Breaking Me
24                              everything i wanted
25                                         Señorita
26                    

In [12]:
# Which tracks have a danceability score below 0.4?
tracks_below_04_danceability = data[data['danceability'] < 0.4]['track_name']
print("Tracks with danceability score below 0.4:")
print(tracks_below_04_danceability)

Tracks with danceability score below 0.4:
44    lovely (with Khalid)
Name: track_name, dtype: object


In [13]:
# Which tracks have their loudness above -5?
tracks_above_neg5_loudness = data[data['loudness'] > -5]['track_name']
print("Tracks with loudness above -5:")
print(tracks_above_neg5_loudness)

Tracks with loudness above -5:
4                                   Don't Start Now
6                                  Watermelon Sugar
10                                             Tusa
12                                          Circles
16                                    Before You Go
17                                           Say So
21                                        Adore You
23                           Mood (feat. iann dior)
31                                   Break My Heart
32                                         Dynamite
33                 Supalonely (feat. Gus Dapperton)
35                  Rain On Me (with Ariana Grande)
37    Sunflower - Spider-Man: Into the Spider-Verse
38                                            Hawái
39                                          Ride It
40                                       goosebumps
43                                          Safaera
48                                         Physical
49                               

In [14]:
# Which tracks have their loudness below -8?
tracks_below_neg8_loudness = data[data['loudness'] < -8]['track_name']
print("Tracks with loudness below -8:")
print(tracks_below_neg8_loudness)

Tracks with loudness below -8:
7                   death bed (coffee for your head)
8                                            Falling
15                                      Toosie Slide
20                  Savage Love (Laxed - Siren Beat)
24                               everything i wanted
26                                           bad guy
36                               HIGHEST IN THE ROOM
44                              lovely (with Khalid)
47    If the World Was Ending - feat. Julia Michaels
Name: track_name, dtype: object


In [15]:
# Which track is the longest?
longest_track = data[data['duration_ms'] == data['duration_ms'].max()]['track_name'].iloc[0]
print("Longest track:", longest_track)

Longest track: SICKO MODE


In [16]:
# Which track is the shortest?
shortest_track = data[data['duration_ms'] == data['duration_ms'].min()]['track_name'].iloc[0]
print("Shortest track:", shortest_track)

Shortest track: Mood (feat. iann dior)


In [17]:
# Which genre is the most popular?
most_popular_genre = data['genre'].value_counts().idxmax()
print("Most popular genre:", most_popular_genre)

Most popular genre: Pop


In [18]:
# Which genres have just one song on the top 50?
genres_with_single_song = data['genre'].value_counts()[data['genre'].value_counts() == 1]
print("Genres with just one song on the top 50:")
print(genres_with_single_song)

Genres with just one song on the top 50:
genre
Nu-disco                              1
R&B/Hip-Hop alternative               1
Pop/Soft Rock                         1
Pop rap                               1
Hip-Hop/Trap                          1
Dance-pop/Disco                       1
Disco-pop                             1
Dreampop/Hip-Hop/R&B                  1
Alternative/reggaeton/experimental    1
Chamber pop                           1
Name: count, dtype: int64


In [19]:
# How many genres in total are represented in the top 50?
num_genres_in_top_50 = len(data['genre'].unique())
print("Number of genres in total in the top 50:", num_genres_in_top_50)

Number of genres in total in the top 50: 16


In [20]:
# Calculate correlation matrix
correlation_matrix = data[numeric_features].corr()

In [21]:
# Identify strongly positively correlated features
strong_pos_corr_features = correlation_matrix[correlation_matrix > 0.7].stack().dropna().index
print("Features strongly positively correlated:")
print(strong_pos_corr_features)

Features strongly positively correlated:
MultiIndex([(          'energy',           'energy'),
            (          'energy',         'loudness'),
            (    'danceability',     'danceability'),
            (             'key',              'key'),
            (        'loudness',           'energy'),
            (        'loudness',         'loudness'),
            (    'acousticness',     'acousticness'),
            (     'speechiness',      'speechiness'),
            ('instrumentalness', 'instrumentalness'),
            (        'liveness',         'liveness'),
            (         'valence',          'valence'),
            (           'tempo',            'tempo'),
            (     'duration_ms',      'duration_ms')],
           )


In [22]:
# Identify strongly negatively correlated features
strong_neg_corr_features = correlation_matrix[correlation_matrix < -0.7].stack().dropna().index
print("Features strongly negatively correlated:")
print(strong_neg_corr_features)

Features strongly negatively correlated:
MultiIndex([], )


In [23]:
# Identify features that are not correlated
uncorrelated_features = correlation_matrix[(correlation_matrix >= -0.2) & (correlation_matrix <= 0.2)].stack().dropna().index
print("Features that are not correlated:")
print(uncorrelated_features)

Features that are not correlated:
MultiIndex([(          'energy',     'danceability'),
            (          'energy',              'key'),
            (          'energy',      'speechiness'),
            (          'energy',         'liveness'),
            (          'energy',            'tempo'),
            (          'energy',      'duration_ms'),
            (    'danceability',           'energy'),
            (    'danceability',         'loudness'),
            (    'danceability', 'instrumentalness'),
            (    'danceability',         'liveness'),
            (    'danceability',            'tempo'),
            (    'danceability',      'duration_ms'),
            (             'key',           'energy'),
            (             'key',         'loudness'),
            (             'key',     'acousticness'),
            (             'key',      'speechiness'),
            (             'key', 'instrumentalness'),
            (             'key',          'valen

In [24]:
# Compare danceability score between genres
danceability_by_genre = data[data['genre'].isin(['pop', 'hip-hop/rap', 'dance/electronic', 'alternative/indie'])].groupby('genre')['danceability'].mean()
print("Average danceability score by genre:")
print(danceability_by_genre)

Average danceability score by genre:
Series([], Name: danceability, dtype: float64)


In [25]:
# Compare loudness score between genres
loudness_by_genre = data[data['genre'].isin(['pop', 'hip-hop/rap', 'dance/electronic', 'alternative/indie'])].groupby('genre')['loudness'].mean()
print("Average loudness score by genre:")
print(loudness_by_genre)

Average loudness score by genre:
Series([], Name: loudness, dtype: float64)


In [26]:
# Compare acousticness score between genres
acousticness_by_genre = data[data['genre'].isin(['pop', 'hip-hop/rap', 'dance/electronic', 'alternative/indie'])].groupby('genre')['acousticness'].mean()
print("Average acousticness score by genre:")
print(acousticness_by_genre)

Average acousticness score by genre:
Series([], Name: acousticness, dtype: float64)


In [27]:
# Provide suggestions for improving the analysis

# You can further explore the relationships between different features using visualizations such as scatter plots, box plots, or histograms.
# Consider analyzing the distribution of each feature to identify any outliers or anomalies.
# You can perform hypothesis testing or statistical analysis to identify significant differences between genres or other categorical variables.
# Consider incorporating other external datasets or features that may provide additional insights or context to the analysis.
# It is important to ensure the data quality and accuracy by verifying the sources and conducting data validation or cross-checks.
# Document your analysis steps, assumptions, and any limitations to facilitate reproducibility and transparency.