In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from textblob import TextBlob
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import plot_importance
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.manifold import TSNE
import time



  import pandas.util.testing as tm


# Load the data

In [2]:
# Path of the source dataset for this run (despite its name, `year` holds a
# CSV path that is later passed to load()).
year='../Data/datasetsFinalaizedForEDA/datasets_top100-2017.csv'
# Destination of the labelled intermediate dataset. popularity_binary() writes
# this file and it is read back into `df_new` afterwards, so it is deliberately
# not read here: on a fresh run the file may not exist yet.
new_dataframe_name="../Data/intermediate datasets-ToBeUsedForRanking/df_2017_new.csv"


In [3]:
def load(year):
    """Read the CSV located at ``year`` and return it as a DataFrame.

    ``year`` is forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(year)

#df_2019=pd.read_csv('../data/datasets_top50-2019.csv',encoding='latin-1')


In [4]:
# Preview only the first rows instead of rendering the whole dataset.
load(year).head()

In [5]:
# Working DataFrame used (and mutated) by the rest of the notebook.
df=load(year)

# data properties

In [6]:
def data_properties(year):
    """Print the column index, shape and ``info()`` summary of a DataFrame.

    Parameters
    ----------
    year : pandas.DataFrame
        The dataset to describe (the parameter name is historical; the
        function reads ``.columns``, ``.shape`` and ``.info()`` from it).

    Returns
    -------
    None
    """
    print("properties for year are as follows :\n")
    print("column details:\n")
    print(year.columns)
    print("shape:\n")
    print(year.shape)
    print("info about dataset:\n")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    year.info()
    print("\n")


In [7]:
# Print columns, shape and dtype/null summary of the loaded dataset.
data_properties(df)

### Initial calculations on data

#### checking if any value is null in the data 

In [8]:
def checkfornull(year):
    """Report missing values in the frame ``year``.

    Prints the labels of the columns holding at least one null value and
    returns the per-column null counts.
    """
    print("cheking for null values......")
    null_mask = year.isnull()
    print(year.columns[null_mask.any()])
    return null_mask.sum()

In [9]:
# Show the number of missing values per column of the dataset.
checkfornull(df)

#### convert the non float data to float data type

In [10]:
# Not every column of the data is stored as float, so for uniformity the
# non-float columns are converted to a float dtype.

def float_conversion(column,value):
    """Return ``value`` converted by ``pd.to_numeric(..., downcast="float")``.

    ``column`` is accepted for interface compatibility but is never read, and
    nothing is written back to it: callers must use the returned object.
    """
    return pd.to_numeric(value, downcast="float")


In [11]:
def execute_float_conversion():
    """Convert selected columns of the global ``df`` to a float dtype.

    The ``mode``, ``duration_ms`` and ``time_signature`` columns are each
    passed through ``float_conversion`` and the result is stored back into
    ``df``. Previously the converted values were discarded, so the frame was
    left unchanged.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after the conversion.
    """
    # Bracket access is required: ``df.mode`` resolves to the DataFrame.mode
    # method, not to the "mode" column.
    for name in ("mode", "duration_ms", "time_signature"):
        df[name] = float_conversion(df[name], df[name])
    return df.head()


In [12]:
# Run the float conversion and preview the first rows of df.
execute_float_conversion()

## Sequential column analysis

In [13]:
#Sequential column analysis: for in depth understanding 

In [14]:
def understand_feature(feature):
    """Return the ``describe()`` summary of ``feature``."""
    summary = feature.describe()
    return summary


#### danceability

In [15]:
# A higher danceability value means the song is easier to dance to.
def print_overall_dancebility_features():
    """Print a heading, then return the summary of ``df["danceability"]``."""
    print("overall danceability feature\n")
    danceability = df["danceability"]
    return understand_feature(danceability)


In [16]:
# Summary statistics of danceability over all songs.
print_overall_dancebility_features()

In [17]:
def top_10_dancebility():
    """Print a heading, then summarise danceability of the first 10 rows of ``df``."""
    print("top 10  danceability feature\n")
    first_ten = df["danceability"].head(10)
    return understand_feature(first_ten)



In [18]:
# Summary statistics of danceability for the first 10 rows of df.
top_10_dancebility()

In [None]:
# Flag the more danceable songs.
def sorting_dancebility():
    """Label each song in the global ``df`` as danceable ('1') or not ('0').

    Writes string labels to ``df['dancebility_new']``: '1' where
    ``danceability`` >= 0.6, '0' where it is below 0.6, and a missing value
    where ``danceability`` itself is missing.

    Returns
    -------
    pandas.Series
        ``value_counts()`` of the new column (songs per label).
    """
    threshold = 0.6000
    danceability = df['danceability']
    # Build the label column as strings from the start. Copying the float
    # column and then writing '1'/'0' into it mixes dtypes, which newer pandas
    # releases warn about.
    labels = pd.Series(np.where(danceability >= threshold, '1', '0'),
                       index=df.index, dtype=object)
    # A comparison with NaN is False, so blank those rows explicitly rather
    # than labelling them '0'.
    df['dancebility_new'] = labels.where(danceability.notna())

    return (df['dancebility_new'].value_counts())

In [None]:
# Add the danceable / not-danceable flag to df and show the label counts.
sorting_dancebility()

In [None]:
# Summary statistics of energy for the first 10 rows of df.
print("top 10  energy  feature\n")
understand_feature(df["energy"].head(10))

#### In Depth BPM

Understanding beats per minute (BPM), i.e. the tempo of the song.
Here we classify the tempo on the basis of beats per minute.
Ref: https://www.musical-u.com/learn/rhythm-tips-for-identifying-music-genres-by-ear/

We also create a new feature: tempo_rate.
tempo_rate: this classifies the song on the basis of BPM as:
    - very slow
    - slow
    - moderate
    - fast
    - very fast


In [None]:
def tempo_rate():
    """Seed ``df['tempo_rate']`` with the values of ``df['tempo']`` and return the new column."""
    tempo_values = df['tempo']
    df['tempo_rate'] = tempo_values
    return df['tempo_rate']

In [None]:
# Create the tempo_rate column (initially a copy of tempo).
tempo_rate()

In [None]:
def cateogarise_tempo_rate():
    """Label every song in the global ``df`` with a tempo class.

    ``df['tempo_rate']`` is (re)written from ``df['tempo']`` (BPM):

    - tempo < 66          -> 'very slow'
    - 66 <= tempo < 77    -> 'slow'
    - 77 <= tempo < 109   -> 'moderate'
    - 109 <= tempo <= 168 -> 'fast'
    - tempo > 168         -> 'very fast'

    The bands are contiguous. The earlier integer-style bounds (<65, 66-76,
    77-108, 109-168) left fractional tempos such as 76.5 or 108.3 unlabelled,
    so the raw number stayed in the column. Rows with a missing tempo get a
    missing label.
    """
    tempo = df['tempo']
    bands = [
        (tempo < 66, 'very slow'),
        ((tempo >= 66) & (tempo < 77), 'slow'),
        ((tempo >= 77) & (tempo < 109), 'moderate'),
        ((tempo >= 109) & (tempo <= 168), 'fast'),
        (tempo > 168, 'very fast'),
    ]
    # Start from an all-missing object column so only labels (never raw
    # tempos) can end up in tempo_rate.
    labels = pd.Series(np.nan, index=df.index, dtype=object)
    for in_band, label in bands:
        labels[in_band] = label
    df['tempo_rate'] = labels


In [None]:
# Replace the numeric tempo_rate values with tempo class labels.
cateogarise_tempo_rate()

In [None]:
def describe_songs_per_tempo_rate():
    """Print the number of songs per tempo class and return a preview of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``. Previously this expression was evaluated and discarded
        inside the function, so nothing was displayed.
    """
    print("Tempo Rate\tCount\n")
    print(df['tempo_rate'].value_counts())
    print("\n")
    return df.head()

In [None]:
# Count the songs in each tempo class.
describe_songs_per_tempo_rate()

In [None]:
def barplot_temporate():
    """Draw a horizontal bar plot of the number of songs per tempo class.

    Reads ``df['tempo_rate']`` from the global frame. The counts are computed
    once and passed as plain values: the ``value_counts()`` result is not
    named ``tempo_rate`` on every pandas version, so attribute access on the
    wrapping frame is not portable.
    """
    counts = df['tempo_rate'].value_counts()
    sns.barplot(x=counts.values, y=counts.index.tolist())
    plt.xlabel('tempo_rate')
    plt.title('music tempo', fontsize=17);

In [None]:
# Bar plot of the number of songs per tempo class.
barplot_temporate()

Sort the songs on the basis of important parameters such as danceability, energy and valence:

In [None]:
def sort_imp_parameters():
    """Return the 10 songs ranking highest by danceability, then energy, then valence.

    Only the name, artist, tempo and the ranked audio features of the global
    ``df`` are kept in the result.
    """
    shown_columns = ['name','artists','danceability','valence','tempo','tempo_rate','energy']
    ranked = df[shown_columns].sort_values(
        by=["danceability","energy","valence"],
        ascending=False,
    )
    return ranked.head(10)

As we can see, these features collectively could not produce the expected ordering of artists. We need to change our approach and include more features.


#### Analysing top artists
Analysing Top artists will give us a better idea about the feature importance 

In [None]:
# Count how many songs each artist has in this year's dataset.
def songs_per_artist():
    """Print the 20 artists with the most songs in the global ``df``."""
    print("Artist\t\tSongs\n")
    per_artist = df["artists"].value_counts()
    print(per_artist.head(20))

In [None]:
# List the artists with the most songs in the dataset.
songs_per_artist()

#### Comparing  the  top artists based on our features 
lets start with top 5:

- Ed Sheeran          4
- The Chainsmokers    4
- Drake               3
- Martin Garrix       3
- Kendrick Lamar      2

In [None]:
def first():
    """Print a heading and return the audio features of Ed Sheeran's songs in ``df``."""
    shown_columns = ['name','danceability','energy','loudness','valence','tempo','tempo_rate']
    by_artist = df['artists']=='Ed Sheeran'
    print("Ed sheeran:")
    return df.loc[by_artist, shown_columns]

In [None]:
# Audio features of Ed Sheeran's songs.
first()

In [None]:
def second():
    """Print a heading and return the audio features of The Chainsmokers' songs in ``df``."""
    shown_columns = ['name','danceability','energy','loudness','valence','tempo','tempo_rate']
    by_artist = df['artists']=='The Chainsmokers'
    print("The Chainsmokers:")
    return df.loc[by_artist, shown_columns]

In [None]:
# Audio features of The Chainsmokers' songs.
second()

In [None]:
def third():
    """Print a heading and return the audio features of Drake's songs in ``df``."""
    shown_columns = ['name','danceability','energy','loudness','valence','tempo','tempo_rate']
    by_artist = df['artists']=='Drake'
    print("Drake:")
    return df.loc[by_artist, shown_columns]

In [None]:
# Audio features of Drake's songs.
third()

In [None]:
def forth():
    """Print a heading and return the audio features of Martin Garrix's songs in ``df``."""
    shown_columns = ['name','danceability','energy','loudness','valence','tempo','tempo_rate']
    by_artist = df['artists']=='Martin Garrix'
    print("Martin Garrix  :")
    return df.loc[by_artist, shown_columns]

In [None]:
# Audio features of Martin Garrix's songs.
forth()

In [None]:
def fifth():
    """Print a heading and return the audio features of Kendrick Lamar's songs in ``df``."""
    shown_columns = ['name','danceability','energy','loudness','valence','tempo','tempo_rate']
    by_artist = df['artists']=='Kendrick Lamar'
    print("Kendrick Lamar   :")
    return df.loc[by_artist, shown_columns]

In [None]:
# Audio features of Kendrick Lamar's songs.
fifth()

Observations for the top 5 artists above:
- danceability: range 0.49 to 0.90+
- energy: range 0.48 to 0.90+; in general energy is not very significant, but the top songs have an energy above 0.48
- loudness: the more negative the loudness, the more popular the song is
- valence: not a very important criterion
- tempo: no conclusion can be drawn yet; more EDA is needed
- tempo rate: fast and moderate songs are at the top



# EDA

### PCA plot-large features

In [None]:
def plot_PCA_large():
    """Show a 3-D PCA scatter of nine audio features of the global ``df``.

    The features are min-max scaled, projected onto three principal
    components and drawn with plotly. Each marker is labelled
    "artist - name" and coloured by the song's danceability.
    """
    feature_names = ["energy", "liveness", "tempo", "valence", "loudness",
                     "speechiness", "acousticness", "danceability", "instrumentalness"]
    hover_labels = (df["artists"] + " - " + df["name"]).values
    raw_values = df[feature_names].values
    marker_colors = df["danceability"].values

    scaled = MinMaxScaler().fit_transform(raw_values)
    reducer = PCA(n_components=3)
    reducer.fit(scaled)
    projected = reducer.transform(scaled)

    trace = go.Scatter3d(
        x=projected[:,0],
        y=projected[:,1],
        z=projected[:,2],
        text=hover_labels,
        mode="markers",
        marker={"size": 8, "color": marker_colors},
    )

    py.iplot(go.Figure(data=[trace]), filename="3d graph")


In [None]:
# 3-D PCA projection of the full audio-feature set.
plot_PCA_large()

### Plot correlation matrix

In [None]:
def plot_correlation():
    """Draw an annotated heatmap of the pairwise correlations of eight audio features of ``df``."""
    feature_names = ['acousticness','danceability','energy','instrumentalness',
                     'liveness','tempo','valence','loudness']
    selected = df[feature_names]
    plt.figure(figsize=(12,8))
    sns.heatmap(selected.corr(), cmap="coolwarm", annot=True);

In [None]:
# Correlation heatmap of the audio features.
plot_correlation()

We can observe that there is a correlation between the loudness and the perceived energy of the song. The joint plots below indicate the correlation between these two. Let's understand how much one feature is impacted by another feature.

### Join plots

In [None]:
def create_joinplot(val1,val2,year,colr):
    """Draw a seaborn joint plot with a regression fit for two columns.

    Parameters
    ----------
    val1, val2 : str
        Names of the columns plotted on the x and y axes.
    year : pandas.DataFrame
        Frame holding both columns (the parameter name is historical).
    colr : str
        Matplotlib colour used for the plot.
    """
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while the keyword form works on older ones too.
    sns.jointplot(x=val1, y=val2, data=year, kind='reg', color=colr)

In [None]:

# Joint plots (scatter + regression fit) for the feature pairs of interest.
create_joinplot('loudness','energy',df,'r')
create_joinplot('tempo','danceability',df,'b')
create_joinplot('acousticness','energy',df,'y')
create_joinplot('valence','energy',df,'b')

From the correlation matrix we observe that:
- there is a relation between tempo and danceability
- there is a relation between acousticness and energy
- there is a relation between tempo and acousticness
- there is a relation between energy and loudness
- there is a relation between tempo and valence
- there is a relation between valence and danceability
- there is a relation between valence and energy

### dist plot for feature details 


In [None]:
def feature_details(feature):
    """Print the mean of ``feature`` and show its distribution plot in red.

    NOTE(review): ``sns.distplot`` is deprecated in newer seaborn releases;
    confirm the seaborn version in use before upgrading.
    """
    mean_value = feature.mean()
    print("Mean value ", mean_value)
    sns.distplot(feature, color="r")
    plt.show()


In [None]:
# Mean and distribution of danceability.
feature_details(df['danceability'])

*Conclusion*: top songs have high danceability; people listen to songs they can dance to.

In [None]:
# Mean and distribution of energy.
feature_details(df['energy'])

*Conclusion*: people like energetic songs.

In [None]:
# Mean and distribution of loudness.
feature_details(df['loudness'])

*Conclusion*: more noise (loudness) means a higher ranking.

In [None]:
# Mean and distribution of acousticness.
feature_details(df['acousticness'])

*Conclusion*: people listen less to acoustic songs.

In [None]:
# Mean and distribution of valence.
feature_details(df['valence'])

*Conclusion*: songs are evenly distributed over valence; it depends entirely on the mood.

In [None]:
# Mean and distribution of tempo.
feature_details(df['tempo'])

*Conclusion*: people listen to fast songs more.

In [None]:
# Store the mean of each feature examined above. These module-level names are
# read later by popularity_binary() as the per-feature thresholds.
a=df['danceability'].mean()   # mean danceability
b=df['energy'].mean()         # mean energy
c=df['loudness'].mean()       # mean loudness
d=df['acousticness'].mean()   # mean acousticness
e=df['valence'].mean()        # mean valence
f=df['tempo'].mean()          # mean tempo


### pca -important features 

In [None]:
def plot_pca_important_features():
    """Show a 3-D PCA scatter of five selected audio features of the global ``df``.

    The features are min-max scaled, projected onto three principal
    components and drawn with plotly. Each marker is labelled
    "artist - name" and coloured by the song's danceability.
    """
    feature_names = ["energy", "liveness", "tempo", "valence","danceability"]
    hover_labels = (df["artists"] + " - " + df["name"]).values
    raw_values = df[feature_names].values
    marker_colors = df["danceability"].values

    scaled = MinMaxScaler().fit_transform(raw_values)
    reducer = PCA(n_components=3)
    reducer.fit(scaled)
    projected = reducer.transform(scaled)

    trace = go.Scatter3d(
        x=projected[:,0],
        y=projected[:,1],
        z=projected[:,2],
        text=hover_labels,
        mode="markers",
        marker={"size": 8, "color": marker_colors},
    )

    py.iplot(go.Figure(data=[trace]), filename="3d graph-2nd")


In [None]:
# 3-D PCA projection restricted to the selected features.
plot_pca_important_features()

 # Create a popularity column

In [None]:

#dance ,valence , energy , liveliness

def popularity_binary():
    """Add a string ``popularity`` label to the global ``df`` and save it to CSV.

    Does nothing when ``df`` already has a ``popularity`` column. Otherwise:

    1. every row starts with the empty string;
    2. rows where any of danceability, energy, loudness, acousticness,
       valence or tempo is at or above its mean (``a`` .. ``f``) become '1';
    3. rows where danceability, energy and loudness are all below their
       means become '0'. This step runs last, so it overrides step 2.

    The labelled frame is then written to ``new_dataframe_name`` without the
    index.
    """
    if 'popularity' in df.columns:
        return

    df["popularity"]=""

    any_at_or_above_mean = (
        (df['danceability']>=a) | (df['energy']>=b) | (df['loudness']>=c)
        | (df['acousticness']>=d) | (df['valence']>=e) | (df['tempo']>=f)
    )
    df.loc[any_at_or_above_mean,'popularity']='1'

    core_below_mean = (df['danceability']<a) & (df['energy']<b) & (df['loudness']<c)
    df.loc[core_below_mean,'popularity']='0'

    df.to_csv(new_dataframe_name,index=False)
        

In [None]:
# Label the songs and write the labelled dataset to disk.
popularity_binary()

In [None]:
# Preview df, now including the popularity column.
df.head()

In [None]:
# Reload the labelled dataset from disk; the cells below work on df_new.
df_new=pd.read_csv('../Data/intermediate datasets-ToBeUsedForRanking/df_2017_new.csv')


## correlation between features and popularity

In [None]:
def draw(year,compare_col,color):
    """Plot the distribution of one column, split by popularity label.

    Parameters
    ----------
    year : pandas.DataFrame
        Frame to plot; it must contain a ``popularity`` column and
        ``compare_col``. (Previously this argument was ignored and the global
        ``df_new`` was always plotted.)
    compare_col : str
        Name of the column whose distribution is drawn in each facet.
    color : str
        Matplotlib colour of the plot.

    Returns
    -------
    The result of ``plt.show()``.
    """
    graph = sns.FacetGrid(year, col = "popularity")
    graph.map(sns.distplot, compare_col, bins = 25,color=color)
    return plt.show()

In [None]:
# Per-popularity-class distribution of each feature of interest.
draw(df_new,"danceability","b")
draw(df_new,"acousticness","r")
draw(df_new,"loudness","m")
draw(df_new,"valence","r")
draw(df_new,"tempo","y")


In [None]:
def concat_name_popularity():
    """Return the first rows of the ``name`` and ``popularity`` columns of ``df_new``."""
    name_and_label = df_new[["name", "popularity"]]
    return name_and_label.head()

In [None]:
# Preview song names next to their popularity label.
concat_name_popularity()

# calculating scores

## data preparation 

In [None]:
# Work on a copy so the modelling preparation below leaves df_new untouched.
df_final=df_new.copy()

In [None]:

def drop():
    """Remove the identifier and derived-label columns from the global ``df_final``.

    The drop happens in place. ``errors="ignore"`` keeps the cell re-runnable:
    on a second run the columns are already gone and would otherwise raise
    ``KeyError``.

    Returns
    -------
    pandas.Index
        The columns remaining in ``df_final``.
    """
    df_final.drop(["id","name","artists","tempo_rate","dancebility_new"],
                  axis=1,inplace=True,errors="ignore")
    return df_final.columns


In [None]:
# Drop the non-feature columns and list what remains.
drop()

In [None]:
def cateogarize_keys():
    """One-hot encode the ``key`` column of the global ``df_final``.

    The column is first cast to ``category``; ``df_final`` is then rebound to
    the frame returned by ``pd.get_dummies``. Returns the first rows of the
    encoded frame.
    """
    global df_final
    key_as_category = df_final["key"].astype("category")
    df_final["key"] = key_as_category
    encoded = pd.get_dummies(df_final, columns=["key"])
    df_final = encoded
    return encoded.head()


In [None]:
# One-hot encode the musical key.
cateogarize_keys()

In [None]:
def cateogarize_time():
    """One-hot encode the ``time_signature`` column of the global ``df_final``.

    The column is first cast to ``category``; ``df_final`` is then rebound to
    the frame returned by ``pd.get_dummies``. Returns the first rows of the
    encoded frame.
    """
    global df_final
    signature_as_category = df_final["time_signature"].astype("category")
    df_final["time_signature"] = signature_as_category
    encoded = pd.get_dummies(df_final, columns=["time_signature"])
    df_final = encoded
    return encoded.head()

In [None]:
# One-hot encode the time signature.
cateogarize_time()

In [None]:
def change_datatype(var):
    """Cast column ``var`` of the global ``df_final`` to ``int`` in place."""
    df_final[var] = df_final[var].astype(int)

# Dummy columns created by pd.get_dummies for "key" and "time_signature".
# They are collected from the frame rather than hard-coded, so a key or time
# signature that does not occur in this year's data (or a different numeric
# formatting of the suffix) no longer raises KeyError.
# NOTE(review): assumes no other column name starts with these prefixes.
columns = [name for name in df_final.columns
           if str(name).startswith(("key_", "time_signature_"))]
for column in columns:
    change_datatype(column)
    

In [None]:
def load_data():
    """Split the global ``df_final`` into predictors and target.

    Returns
    -------
    tuple
        ``(X, y)``: the frame without the ``popularity`` column, and the
        ``popularity`` values as an array.
    """
    target = df_final["popularity"].values
    predictors = df_final.drop(columns=["popularity"])
    return predictors, target
X,y =load_data()

# Names of the predictor columns, in frame order.
feature_cols = list(X.columns)

*NOTE*: There are no strong linear correlations in our data, so decision-tree algorithms should give better results than linear regression models. We therefore use a Random Forest.
 

In [None]:

X,y = load_data()
# 60% of the songs are held out for testing (test_size=0.6). A fixed
# random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.6,random_state = 3)
# Cast the labels to int so both classifiers receive numeric targets.
y_train = y_train.astype(int)
y_test = y_test.astype(int)
print("X_train: ",X_train.shape)
print("X_test: ",X_test.shape)
print("y_train: ",y_train.shape)
print("y_test: ",y_test.shape)


## xgb  top features -for validating  our  hypothesis

In [None]:
def plot_impfeature():
    """Fit an XGBoost classifier on the training split and plot feature importance.

    Importance is measured by ``weight``. Returns the result of
    ``plt.show()``.
    """
    booster = XGBClassifier()
    booster.fit(X_train, y_train)
    plot_importance(
        booster,
        importance_type='weight',
        color="r",
        height=0.15,
        title='  Features Importance ',
        xlabel='Feature score',
        ylabel='Features',
    )

    return plt.show()




In [None]:
# Feature importance according to XGBoost.
plot_impfeature()

##### The above graph assures that:
-  Our proposed ranked features turned out to be the most important for this data
-  Example: we identified features like danceability, energy, loudness, etc. as the most important, and this turned out to be true.

In [None]:
def calculate_scores():
    """Fit a random forest on the training split and report its accuracy.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the train and test accuracy.

    Returns
    -------
    float
        The test accuracy. (Previously it was stored in an unused local and
        the function returned nothing.)
    """
    rf=RandomForestClassifier(random_state = 3)
    rf.fit(X_train,y_train)

    train_accuracy = rf.score(X_train,y_train)
    # Scored once and reused for both the printout and the return value.
    test_accuracy = rf.score(X_test,y_test)
    print("Train accuracy ",train_accuracy)
    print("Test accuracy ",test_accuracy)

    return test_accuracy

In [None]:
# Train the random forest and report train/test accuracy.
calculate_scores()

# validating model 

## plot loudness t-SNE

In [None]:
def plot_pca_scaler(y,dataframe):
    """Embed the songs with t-SNE and show a 2-D scatter of the result.

    Despite its name, this function runs t-SNE, not PCA. Six audio features
    are min-max scaled, embedded in three dimensions, and the first two
    embedding axes are plotted with plotly.

    Parameters
    ----------
    y : str
        Name of the column of ``dataframe`` used to colour the markers.
    dataframe : pandas.DataFrame
        Songs to embed; must contain ``artists``, ``name``, the six feature
        columns and the ``y`` column.

    Returns
    -------
    The result of ``py.iplot``.
    """
    features = ["energy", "liveness", "tempo", "valence", "loudness","danceability"]

    hover_text = (dataframe["artists"] + " - " + dataframe["name"]).values

    X = dataframe[features].values
    y = dataframe[y].values

    X = MinMaxScaler().fit_transform(X)

    # scikit-learn requires perplexity < n_samples in newer releases. The
    # popularity subsets can hold 40 rows or fewer, so the default of 40 is
    # capped; larger inputs keep perplexity 40 as before.
    perplexity = min(40, max(1, X.shape[0] - 1))

    print('t-SNE starting.................')
    # NOTE(review): newer scikit-learn renames `n_iter` to `max_iter`; confirm
    # the installed version before changing the keyword.
    tsne = TSNE(n_components=3, verbose=1, perplexity=perplexity, n_iter=300)
    tsne_results = tsne.fit_transform(X)

    print('......................t-SNE done')

    fig = {
        "data": [
            {
                "x": tsne_results[:, 0],
                "y": tsne_results[:, 1],
                "text": hover_text,
                "mode": "markers",
                "marker": {"size": 8, "color": y}
            }
        ],
        "layout": {
            "xaxis": {"title": "x-tsne"},
            "yaxis": {"title": "y-tsne"}
        }
    }

    return py.iplot(fig, filename="amit")

In [None]:
# t-SNE embedding of all songs, coloured by loudness.
plot_pca_scaler("loudness",df)

## project the 'top' and 'bottom' songs (or 'not-top' songs) 

project into the t-SNE space and see if they cluster differently.

1. Separate the songs on the basis of popularity --> 0 and 1; here 0 are the not-top songs and 1 are the top songs

- EXPECTED 1: Songs having popularity 1 should be tightly coupled  and 

- EXPECTED 2: Songs having popularity 0 should be loosely coupled  

In [None]:
def top_songs(dataframe):
    """Plot the t-SNE embedding, coloured by loudness, of the rows with ``popularity == 1``."""
    is_top = dataframe['popularity'] == 1
    return plot_pca_scaler("loudness", dataframe[is_top])

In [None]:
# t-SNE embedding of the songs labelled popular (popularity == 1).
top_songs(df_new)

HENCE: assumption 1 proved

In [None]:
def not_top_songs(dataframe):
    """Plot the t-SNE embedding, coloured by loudness, of the rows with ``popularity == 0``."""
    is_not_top = dataframe['popularity'] == 0
    return plot_pca_scaler("loudness", dataframe[is_not_top])

In [None]:
# t-SNE embedding of the songs labelled not popular (popularity == 0).
not_top_songs(df_new)

HENCE:assumption 2 also prooved 