In [1]:
# importing required libraries

import numpy as np
import plotly.express as px
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# reading the data
df = pd.read_csv('music_project_en (1).csv')

In [3]:
df.head()
# to print top 5 rows of data

Unnamed: 0,userID,Track,artist,genre,City,time,Day
0,FFB692EC,Kamigata To Boots,The Mass Missile,rock,Shelbyville,20:28:33,Wednesday
1,55204538,Delayed Because of Accident,Andreas Rönnberg,rock,Springfield,14:07:09,Friday
2,20EC38,Funiculì funiculà,Mario Lanza,pop,Shelbyville,20:58:07,Wednesday
3,A3DD03C9,Dragons in the Sunset,Fire + Ice,folk,Shelbyville,08:37:09,Monday
4,E2DC1FAE,Soul People,Space Echo,dance,Springfield,08:34:34,Monday


In [4]:
df.info()
# show detailed info about columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65079 entries, 0 to 65078
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0     userID  65079 non-null  object
 1   Track     63736 non-null  object
 2   artist    57512 non-null  object
 3   genre     63881 non-null  object
 4     City    65079 non-null  object
 5   time      65079 non-null  object
 6   Day       65079 non-null  object
dtypes: object(7)
memory usage: 3.5+ MB


In [5]:
df.describe()
# get columnwise metrics

Unnamed: 0,userID,Track,artist,genre,City,time,Day
count,65079,63736,57512,63881,65079,65079,65079
unique,41748,39666,37806,268,2,20392,3
top,A8AE9169,Brand,Kartvelli,pop,Springfield,08:14:07,Friday
freq,76,136,136,8850,45360,14,23149


In [6]:
df.columns
#print all columns names

Index(['  userID', 'Track', 'artist', 'genre', '  City  ', 'time', 'Day'], dtype='object')

In [7]:
# Here Column names 'userId' and 'City' have extra spaces, so removing them

df.rename(columns={'  City  ': 'City', '  userID': 'UserID'}, inplace=True)

df.columns

Index(['UserID', 'Track', 'artist', 'genre', 'City', 'time', 'Day'], dtype='object')

In [8]:
df.isna().sum()

# get some of all null values column-wise

UserID       0
Track     1343
artist    7567
genre     1198
City         0
time         0
Day          0
dtype: int64

#### Out of 65,079 entries, 7567 values of artist, 1343 values of Track and 1198 values of genre are None.

#### To prevent any inaccuracy in the data, best choice seems to replace them with a space( ) value.

In [9]:
# Replace NaN values with spaces in the specified columns

columns_to_replace = ['Track', 'artist', 'genre']
df[columns_to_replace] = df[columns_to_replace].fillna(' ')

In [10]:
# verifying
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65079 entries, 0 to 65078
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   UserID  65079 non-null  object
 1   Track   65079 non-null  object
 2   artist  65079 non-null  object
 3   genre   65079 non-null  object
 4   City    65079 non-null  object
 5   time    65079 non-null  object
 6   Day     65079 non-null  object
dtypes: object(7)
memory usage: 3.5+ MB


In [11]:
# Convert the 'time' column to datetime
df['time'] = pd.to_datetime(df['time'])

# Extract the hour component and create a new column 'hour'
df['hour'] = df['time'].dt.hour

df.head()

  df['time'] = pd.to_datetime(df['time'])


Unnamed: 0,UserID,Track,artist,genre,City,time,Day,hour
0,FFB692EC,Kamigata To Boots,The Mass Missile,rock,Shelbyville,2023-08-15 20:28:33,Wednesday,20
1,55204538,Delayed Because of Accident,Andreas Rönnberg,rock,Springfield,2023-08-15 14:07:09,Friday,14
2,20EC38,Funiculì funiculà,Mario Lanza,pop,Shelbyville,2023-08-15 20:58:07,Wednesday,20
3,A3DD03C9,Dragons in the Sunset,Fire + Ice,folk,Shelbyville,2023-08-15 08:37:09,Monday,8
4,E2DC1FAE,Soul People,Space Echo,dance,Springfield,2023-08-15 08:34:34,Monday,8


## Distribution of all Columns

In [None]:
# Loop through each column and create a distribution plot
for column in df.columns:
    fig = px.histogram(df, x=column, title=f'Distribution of {column}')
    fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
    fig.update_layout(font_family='Noto Sans')
    fig.show()

## Relationship between Genre and Time:

In [13]:
# Create a scatter plot for the relationship between genre and time
fig = px.scatter(df, x='genre', y='hour', title='Relationship between Genre and Time')
fig.update_layout(xaxis_title='Genre', yaxis_title='Hour')
fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
fig.update_layout(font_family='Noto Sans')

# Show the plot
fig.show()

## Relationship between Genre and City for Top 10 Genres:

In [14]:
top_ten_genre = df['genre'].value_counts().head(10).index.tolist()
top_ten_genre
# get top 10 genre by counting their occurences

['pop',
 'dance',
 'rock',
 'electronic',
 'hip',
 'classical',
 'alternative',
 'world',
 'ruspop',
 'rusrap']

In [15]:


# Filter data to include only top 10 genres
data_filtered = df[df['genre'].isin(top_ten_genre)]

# Create a scatter plot for the relationship between genre and city (top 10 genres)
fig = px.scatter(data_filtered, x='genre', y='City', title='Relationship between Genre and City (Top 10 Genres)')
fig.update_layout(xaxis_title='Genre', yaxis_title='City')
fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
fig.update_layout(font_family='Noto Sans')

# Adjust marker parameters for better visibility
fig.update_traces(marker=dict(size=10, opacity=0.7))

# Show the plot
fig.show()

## Distribution of Top Ten Artists:

In [16]:
top_ten_artist = df['artist'].value_counts().head(10).index.tolist()
top_ten_artist


[' ',
 'Kartvelli',
 'MALFA',
 'Real Bodrit',
 'The Seasons',
 'Irina Shok',
 'KoperniK',
 'Dr. Living Dead!',
 'RELFY',
 'Argishty (Duduk)']

In [17]:
# Filter data to include only top five artists
data_filtered = df[df['artist'].isin(top_ten_artist)]
# Filter out rows with empty artist values
data_filtered = data_filtered[data_filtered['artist'] != ' ']

artist_count = df.artist.value_counts()

# Create a histogram using Plotly Express
fig = px.histogram(data_filtered, x='artist', title='Total Count of Artists')
fig.update_layout(xaxis_title='Artist Names', yaxis_title='Count')
fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
fig.update_layout(font_family='Noto Sans')

# Show the plot
fig.show()

In [18]:
df.artist.value_counts().sort_values(ascending=False)

#verify counts of artists
# Note space or null values are not included in graph

artist
                                  7567
Kartvelli                          136
MALFA                              118
Real Bodrit                         95
The Seasons                         87
                                  ... 
Heatmiser                            1
OBF Improvisator Dub High Tone       1
Uzul                                 1
SeaNator                             1
Monica Lopez                         1
Name: count, Length: 37807, dtype: int64

## Hourly Distribution of Top Ten Genres:

In [19]:
# Filter data to include only top five genres
data_filtered = df[df['genre'].isin(top_ten_genre)]

# Create a histogram for the hourly distribution of top five genres
fig = px.histogram(data_filtered, x='hour', color='genre', title='Hourly Distribution of Top Five Genres')
fig.update_layout(xaxis_title='Hour', yaxis_title='Count')
fig.update_xaxes(tickangle=0)
fig.update_layout(font_family='Noto Sans')

# Show the plot
fig.show()

## Distribution of Time in Top Ten Artists:



In [20]:
# Filter data to include only top five artists
data_filtered = df[df['artist'].isin(top_ten_artist)]

# Create a box plot for the distribution of time in top five artists
fig = px.box(data_filtered, x='artist', y='hour', title='Distribution of Time in Top Five Artists')
fig.update_layout(xaxis_title='Artist', yaxis_title='Hour')
fig.update_xaxes(tickangle=-45, tickfont=dict(size=10))
fig.update_layout(font_family='Noto Sans')

# Show the plot
fig.show()