In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing data and basic analysis

In [28]:
# Load the rap lyrics dataset from the mounted Google Drive into `data`.
data = pd.read_csv('/content/drive/MyDrive/datas/dataset_rap.csv', sep=',')

In [29]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,artist,title,year,lyrics_x
0,0,100 Blaze,ASKIP,2022.0,"\n\nOh\n\nIls vont t'coller par interêt, crois..."
1,1,100 Blaze,Balec,2019.0,J'me suis marié au rap sans dire merci\nLà j's...
2,2,100 Blaze,Black Gogeta,2018.0,\nBlack Gogeta\nBlack Gogeta\nBlack Gogeta\nBl...
3,3,100 Blaze,Black Or White,2022.0,"\n\nNoir ou blanc, on évite d'aller trop finir..."
4,4,100 Blaze,Blazedog,2022.0,\n\nJ'ai grandi dans les blocs tout pourris de...
5,5,100 Blaze,Blow,2022.0,"\n\nHello, people, j'suis dans l'arène, ça fai..."
6,6,100 Blaze,Boosk’Castellane,2018.0,"\nIls m'disaient : ""Baisse la tête et marche t..."
7,7,100 Blaze,Brolik,2018.0,"\nYah, yah, yah, yah\nYah, yah, yah, yah\nYah,..."
8,8,100 Blaze,Ce qu’il se passe dehors,2019.0,\nJ'fais ça tous les jours\nEt puis normalemen...
9,9,100 Blaze,Coco,2022.0,\n\nNo-no-no-no-no-no-no\nNo-no-no-no-no\nNo-n...


In [30]:
# Report, column by column, how many values are missing in `data`.
missing_per_column = data.isnull().sum()
for col, n_missing in missing_per_column.items():
    print(col, ': ', n_missing)

Unnamed: 0 :  0
artist :  0
title :  0
year :  4723
lyrics_x :  0


In [31]:
# Missing release years are replaced by the numeric sentinel 9999. A number is
# used (not the string "9999") so the column stays numeric instead of turning
# into a mixed float/str object column. Keep in mind that this sentinel
# inflates summary statistics computed on `year`.
data.loc[:,"year"]=data.loc[:,"year"].fillna(9999)

In [32]:
# Drop the index column saved in the CSV. errors="ignore" keeps this cell
# re-runnable: on a second run the column is already gone and a plain drop
# would raise KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")
# Store the year as an integer.
data['year'] = data['year'].astype(int)

In [33]:
data.head(10)

Unnamed: 0,artist,title,year,lyrics_x
0,100 Blaze,ASKIP,2022,"\n\nOh\n\nIls vont t'coller par interêt, crois..."
1,100 Blaze,Balec,2019,J'me suis marié au rap sans dire merci\nLà j's...
2,100 Blaze,Black Gogeta,2018,\nBlack Gogeta\nBlack Gogeta\nBlack Gogeta\nBl...
3,100 Blaze,Black Or White,2022,"\n\nNoir ou blanc, on évite d'aller trop finir..."
4,100 Blaze,Blazedog,2022,\n\nJ'ai grandi dans les blocs tout pourris de...
5,100 Blaze,Blow,2022,"\n\nHello, people, j'suis dans l'arène, ça fai..."
6,100 Blaze,Boosk’Castellane,2018,"\nIls m'disaient : ""Baisse la tête et marche t..."
7,100 Blaze,Brolik,2018,"\nYah, yah, yah, yah\nYah, yah, yah, yah\nYah,..."
8,100 Blaze,Ce qu’il se passe dehors,2019,\nJ'fais ça tous les jours\nEt puis normalemen...
9,100 Blaze,Coco,2022,\n\nNo-no-no-no-no-no-no\nNo-no-no-no-no\nNo-n...


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50855 entries, 0 to 50854
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   artist    50855 non-null  object
 1   title     50855 non-null  object
 2   year      50855 non-null  int64 
 3   lyrics_x  50855 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [35]:
data.shape

(50855, 4)

In [36]:
data.describe()

Unnamed: 0,year
count,50855.0
mean,2756.57658
std,2317.381634
min,1969.0
25%,2013.0
50%,2018.0
75%,2021.0
max,9999.0


## "tokenizer" function to isolate words

In [37]:
def tokenizer(texte):
  """Split a text into lowercase word tokens.

  The text is lower-cased, split on any run of whitespace (spaces, tabs and
  line breaks) and a fixed set of punctuation characters is removed from each
  piece. Pieces left empty after that removal are discarded.

  Args:
    texte: the raw text (str) to tokenize.

  Returns:
    The list of non-empty lowercase tokens, in order of appearance.
  """
  # Lyrics separate their lines with "\n". Splitting on every whitespace
  # character (instead of " " only) prevents the last word of a line from
  # being glued to the first word of the next one.
  ponctuation = str.maketrans('', '', ",-:!?.’()")
  res = []
  for mot in texte.lower().split():
    mot = mot.translate(ponctuation)
    if mot != '':
      res.append(mot)
  return res

## Graphs of the average number of words per song for each artist

In [None]:
# Word count of every song, then the average word count per artist.
data["nb_mots"] = data["lyrics_x"].map(lambda paroles: len(tokenizer(paroles)))
rap_lyrics_gb = data.groupby("artist")[["nb_mots"]].mean()
rap_lyrics_gb

In [None]:
# Rank the artists by average number of words per song (ascending order).
plot = rap_lyrics_gb.reset_index().sort_values(by="nb_mots")
# Head of the ranking = 20 lowest averages, tail = 20 highest averages.
plot_bottom_20 = plot.head(20)
plot_top_20 = plot.tail(20)

In [None]:
# The 20 artists with the lowest average number of words per song.
ax = sns.barplot(data=plot_bottom_20, x="artist", y="nb_mots", color = "#fdfd96")
# A title is needed to tell this chart apart from the "top 20" one.
ax.set_title("20 artists with the lowest average number of words per song")

# The trailing ";" hides the textual repr of the tick objects in the notebook.
plt.xticks(rotation=85);

In [None]:
# The 20 artists with the highest average number of words per song.
ax = sns.barplot(data=plot_top_20, x="artist", y="nb_mots", color = "#fdfd96")
# A title is needed to tell this chart apart from the "bottom 20" one.
ax.set_title("20 artists with the highest average number of words per song")

# The trailing ";" hides the textual repr of the tick objects in the notebook.
plt.xticks(rotation=85);

## Graphs of the number of different words and the different/total word ratio for each artist

In [None]:
# Concatenate all the lyrics of each artist into one text per artist.
rap = data[["artist", "lyrics_x"]]
rap_concat = rap.groupby(['artist'])['lyrics_x'].apply(' '.join).reset_index()

# Tokenize each artist's corpus only once and derive both counts from the
# same token lists (tokenizing is the expensive step here).
tokens_par_artiste = rap_concat["lyrics_x"].apply(tokenizer)
rap_concat["nb_mots_tot"] = tokens_par_artiste.apply(len)
rap_concat["nb_mots_diff"] = tokens_par_artiste.apply(lambda mots: len(set(mots)))

rap_concat = rap_concat.sort_values("nb_mots_diff")

In [None]:
# rap_concat is sorted by nb_mots_diff (ascending), so its last 20 rows are
# the artists using the largest number of different words.
ax = sns.barplot(data=rap_concat[-20:], x="artist", y="nb_mots_diff", color = "#fdfd96")
ax.set_title("20 artists with the largest number of different words")

# The trailing ";" hides the textual repr of the tick objects in the notebook.
plt.xticks(rotation=85);

In [None]:
# Share of different words among all the words used by each artist.
rap_concat["ratio"] = rap_concat["nb_mots_diff"]/rap_concat["nb_mots_tot"]
# An artist without any token gets 0/0 = NaN. sort_values places NaN last by
# default, which would put such artists in the "top 20" slice below;
# na_position="first" keeps them out of it.
rap_concat = rap_concat.sort_values("ratio", na_position="first")

ax = sns.barplot(data=rap_concat[-20:], x="artist", y="ratio", color = "#fdfd96")
ax.set_title("20 artists with the highest different/total word ratio")
# The trailing ";" hides the textual repr of the tick objects in the notebook.
plt.xticks(rotation=85);

## Most frequent words for a particular artist

In [None]:
from collections import Counter

# Keep only Booba's songs *before* concatenating: joining the lyrics of every
# artist just to keep a single row is wasted work.
booba = data.loc[data["artist"] == "Booba", ["artist", "lyrics_x"]]
booba = booba.groupby(['artist'])['lyrics_x'].apply(' '.join).reset_index()

# `booba` holds one row; tokenize its concatenated lyrics.
# (Raises IndexError if the artist is absent from `data`.)
lyrics_booba = tokenizer(booba["lyrics_x"].iloc[0])

Counter(lyrics_booba).most_common(20)

In [None]:
# Keep only the words (not their counts) of the 20 most frequent tokens.
premiers_mots = [mot for mot, _ in Counter(lyrics_booba).most_common(20)]

premiers_mots

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
# Explicit import instead of a wildcard, so the origin of `stopwords` is clear.
from nltk.corpus import stopwords

# French stopwords from NLTK plus a custom comma-separated list.
sw = stopwords.words('french')
# `with` closes the file handle; entries are stripped because a stopword
# carrying surrounding whitespace can never match a token.
with open("/content/drive/MyDrive/datas/stopword.txt",encoding='utf-8') as f:
  sw2 = [w.strip() for w in f.read().split(",") if w.strip()]
sw = sw+sw2

# Words to discard: stopwords and the 20 most frequent tokens. A set gives
# constant-time membership tests (the list version scans every stopword for
# every token).
mots_exclus = set(sw).union(premiers_mots)

# Keep tokens of at least 4 characters that are not excluded. `lyrics_booba`
# is left untouched, so re-running earlier cells still gives the same result.
l_booba = [mot for mot in lyrics_booba if len(mot) >= 4 and mot not in mots_exclus]

In [None]:
Counter(l_booba).most_common(20)

## WordCloud for 3 artists

In [None]:
from wordcloud import WordCloud

def cloud(str_artist):
  """Draw a word cloud of the words of one artist.

  All the lyrics of `str_artist` found in the global `data` frame are joined
  and tokenized with `tokenizer`. A token is discarded when it is shorter than
  4 characters, when it belongs to the global stopword collection `sw`, or
  when it is one of the artist's 20 most frequent tokens. The remaining tokens
  are rendered with `WordCloud` on the current matplotlib figure, axes hidden.

  Args:
    str_artist: artist name, spelled exactly as in data["artist"].

  Raises:
    IndexError: if `data` contains no song for `str_artist`.
  """
  # Select the artist's songs first: there is no need to concatenate the
  # lyrics of every artist to keep a single one.
  paroles = data.loc[data["artist"] == str_artist, "lyrics_x"]
  if paroles.empty:
    raise IndexError("no lyrics found for artist " + repr(str_artist))
  lyrics_artist = tokenizer(' '.join(paroles))

  # The 20 most frequent tokens are computed before any filtering.
  premiers_mots = [mot for mot, _ in Counter(lyrics_artist).most_common(20)]
  # Set membership is constant-time; `sw` itself is not modified.
  mots_exclus = set(sw).union(premiers_mots)
  l_artist = [mot for mot in lyrics_artist if len(mot) >= 4 and mot not in mots_exclus]

  wordcloud = WordCloud(background_color="white").generate(' '.join(l_artist))
  plt.imshow(wordcloud)
  plt.axis("off")

In [None]:
cloud("Kekra")

In [None]:
cloud("PNL")

In [None]:
cloud("Booba")

## Text classification: building a model that predicts whether a song is rap or pop

In [None]:
# The file is read in chunks of 100000 rows and only the French songs of each
# chunk are kept. The filtered chunks are collected in a list and concatenated
# once: calling pd.concat inside the loop re-copies the accumulated frame at
# every iteration.
chunks_fr = []
with open('/content/drive/MyDrive/datas/song_lyrics.csv') as fl:
    for chunk in pd.read_csv(fl, sep=",", chunksize = 100000):
        chunks_fr.append(chunk.loc[chunk['language'] == "fr"])
df_fr = pd.concat(chunks_fr)

In [None]:
print(df_fr["tag"].unique())

In [None]:
# Keep the songs tagged "pop". .copy() makes the result independent from
# df_fr, so that later column assignments on df_pop_fr do not act on a slice
# of df_fr (SettingWithCopyWarning).
df_pop_fr = df_fr.loc[df_fr['tag'] == "pop"].copy()

df_pop_fr

In [None]:
# Report, column by column, how many values are missing in `df_pop_fr`.
missing_per_column = df_pop_fr.isnull().sum()
for col, n_missing in missing_per_column.items():
    print(col, ': ', n_missing)

In [None]:
# Replace missing titles by a placeholder. assign() builds a new frame instead
# of writing through .loc into df_pop_fr, which was obtained by filtering
# df_fr (this avoids SettingWithCopyWarning).
df_pop_fr = df_pop_fr.assign(title=df_pop_fr["title"].fillna("titre_inconnu"))

In [None]:
df_pop_fr = df_pop_fr[["title", "artist", "lyrics", "tag"]]
df_pop_fr

In [None]:
# Give the rap frame the same columns as the pop frame (title, artist,
# lyrics) and label every row with the "rap" tag.
data = (
    data.rename(columns={'lyrics_x': 'lyrics'})
        .loc[:, ["title", "artist", "lyrics"]]
        .assign(tag="rap")
)

data

In [None]:
# Stack rap and pop songs. ignore_index=True rebuilds a 0..n-1 index; without
# it both frames keep their own row labels and the result has duplicate
# labels, which makes label-based lookups ambiguous.
chansons = pd.concat([data, df_pop_fr], ignore_index=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


from nltk.corpus import stopwords
# The stopword list gets its own name so that `stopwords` keeps referring to
# the NLTK corpus reader. `with` closes the file handle.
# NOTE(review): this path differs from the stopword file used earlier in the
# notebook ("/content/drive/MyDrive/datas/stopword.txt") - confirm it is the
# intended file.
with open("/content/drive/MyDrive/M2_Python/TP4/data/stopword.txt",encoding='utf-8') as f:
    stopwords_fr = stopwords.words('french')+[w.strip() for w in f.read().split(",") if w.strip()]

# Encode the tags ("pop"/"rap") as integers.
target = LabelEncoder().fit_transform(chansons.tag)

from sklearn.model_selection import train_test_split
# Split the raw lyrics BEFORE vectorizing: fitting TF-IDF on all the songs
# would let the vocabulary and IDF weights depend on the test songs (data
# leakage) and make the test scores optimistic.
lyrics_train, lyrics_test, y_train, y_test = train_test_split(chansons.lyrics,target, test_size = 0.3, random_state=555)

vec = TfidfVectorizer(stop_words=stopwords_fr, ngram_range=(1,1))
# Vocabulary and IDF are learned on the training songs only...
X_train = vec.fit_transform(lyrics_train)
# ...and merely applied to the test songs.
X_test = vec.transform(lyrics_test)

In [None]:
print(X_train.shape)

In [None]:
vec.get_feature_names_out()

### Hyperparameters

Not a lot of options in the grid, only 3-fold cross-validation (cv = 3), and only 2 models, because the execution time is really long.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

random_seed = 555
scorer = 'f1'

# Each candidate pairs an estimator with the hyperparameter grid to explore.
# Only ExtraTrees and XGBoost are searched here.
XGB = dict(
    model=XGBClassifier(),
    parameters={
        'n_estimators': [20, 60, 100],
        'learning_rate': [1e-1, 1e-2, 1e-3],
        'seed': [random_seed],
    },
)

XT = dict(
    model=ExtraTreesClassifier(),
    parameters={
        'n_estimators': [20, 100],
        'max_depth': [30, 70],
        'random_state': [random_seed],
    },
)

Models_list = [XT, XGB]

# Run one grid search per candidate and store its cross-validation results
# under the estimator's class name.
res = {}
for model_class in Models_list:
    algo_name = str(model_class['model']).split('(')[0]
    print(algo_name)
    model_opt = GridSearchCV(
        estimator=model_class['model'],
        param_grid=model_class['parameters'],
        cv=3,
        scoring=scorer,
        return_train_score=True,
    ).fit(X_train, y_train)
    res[algo_name] = model_opt.cv_results_

In [None]:
import pandas as pd

# Columns of GridSearchCV.cv_results_ kept for the comparison table.
colonnes_cv = ['mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score', 'params']

# One frame per algorithm, concatenated once at the end. Growing df_res with
# pd.concat inside the loop re-copies it at every iteration, and starting from
# an empty typed frame is deprecated in recent pandas versions.
frames = []
for algo, cv_results in res.items():
    frames.append(pd.DataFrame({'algo': algo, **{c: cv_results[c] for c in colonnes_cv}}))

if frames:
    df_res = pd.concat(frames, ignore_index=True)
else:
    # No grid search was run: keep an empty table with the expected columns.
    df_res = pd.DataFrame(columns=['algo'] + colonnes_cv)

df_res.sort_values(['mean_test_score','std_test_score','mean_train_score','std_train_score'], ascending=[False,True,False,True])

In [None]:
# Pick the best candidate: highest mean test score, ties broken by the lowest
# standard deviation of the test score (then by the train scores).
index_best=df_res.sort_values(['mean_test_score','std_test_score','mean_train_score','std_train_score'], ascending=[False,True,False,True]).index[0]
fig = plt.figure(figsize=(10,7))
color_list = ["#8CB4CA", "#FFC000",  "#FF8B94"]
# zip() stops at the shorter sequence: with more than 3 algorithms the extra
# ones would not be drawn.
for cand_algo,col in zip(df_res['algo'].unique().tolist(),color_list):
    # One scatter per algorithm, each drawn with its own colour (col).
    candidats = df_res[df_res["algo"]==cand_algo]
    plt.scatter(candidats['mean_test_score'],candidats['std_test_score'], color=col, s=7, label=cand_algo)

# Highlight the best candidate, located through index_best.
plt.scatter(df_res.loc[index_best,'mean_test_score'],df_res.loc[index_best,'std_test_score'], color='red',marker='p', s=200, label="Best ("+str(df_res.loc[index_best,"algo"])+')')

# Axis labels, title and legend.
plt.xlabel('Mean_test_score')
plt.ylabel('Std_test_score')
plt.title('All the candidates of the grid \n(with '+scorer+'-score | '+str(df_res.shape[0])+' tested candidates)', size = 'x-large')
plt.legend(loc=2)

print(df_res.loc[index_best,'algo'],df_res.loc[index_best,'params'])

In [None]:
# best model
# NOTE(review): the hyperparameters below are hard-coded, not read from the
# grid-search results (df_res / index_best) - confirm they match the best
# candidate reported by the search.

model_XGB = XGBClassifier(learning_rate=0.1,n_estimators=100, seed=555)
model_XGB.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the fitted model on the held-out test set
# (scikit-learn convention: rows are true labels, columns are predictions).
# The integer labels come from the LabelEncoder applied to chansons.tag.
confusion_matrix(y_test, model_XGB.predict(X_test))

In [None]:
from sklearn.metrics import roc_auc_score, brier_score_loss, average_precision_score, f1_score, accuracy_score, matthews_corrcoef, precision_score, recall_score, fbeta_score, cohen_kappa_score
def calcul_metrique_fct(list_models,list_metrics,y_test=y_test,y_train=y_train,X_test=X_test,X_train=X_train):
    """Print the train and test value of several metrics for each model.

    For every model, its name is printed, followed by one line per metric
    with the score on the train set and on the test set.

    Args:
        list_models: fitted classifiers exposing `predict` (and
            `predict_proba` when a probability-based metric is requested).
        list_metrics: metric functions called as metric(y_true, y_pred).
            'roc_auc_score' and 'brier_score_loss' receive the probability of
            class 1 instead of the predicted labels.
        y_test, y_train, X_test, X_train: evaluation data. The defaults are
            the global objects that exist when this function is defined;
            re-run this cell after re-splitting the data.
    """
    for model in list_models:
        print(str(model).split('(')[0])
        # Predictions do not depend on the metric: compute them at most once
        # per model (lazily) instead of once per metric.
        pred_train = pred_test = None
        proba_train = proba_test = None
        for metric in list_metrics:
            # repr of a function looks like "<function f1_score at 0x...>".
            name_metric=str(metric).split(' ')[1]
            if name_metric in ['roc_auc_score','brier_score_loss']:
                if proba_train is None:
                    proba_train=[pr[1] for pr in model.predict_proba(X_train)]
                    proba_test=[pr[1] for pr in model.predict_proba(X_test)]
                perf_train=metric(y_train,proba_train)
                perf_test=metric(y_test,proba_test)
            else:
                if pred_train is None:
                    pred_train=model.predict(X_train)
                    pred_test=model.predict(X_test)
                perf_train=metric(y_true=y_train,y_pred=pred_train)
                perf_test=metric(y_true=y_test,y_pred=pred_test)
            print("\t{0:s} Train : {1:f} \t Test {2:f}".format(name_metric,perf_train,perf_test))


calcul_metrique_fct([model_XGB], list_metrics=[f1_score,accuracy_score,brier_score_loss,roc_auc_score])