In [1]:
import pandas as pd
import  matplotlib.pyplot as plt
import numpy as np
import nltk

import itertools
from sklearn import metrics, pipeline, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
from nltk.stem import WordNetLemmatizer, PorterStemmer


In [2]:
df = pd.read_csv('english_cleaned_lyrics.csv')

In [3]:
df.drop(columns='Unnamed: 0', inplace = True)
df.drop(columns='index', inplace = True)

In [4]:
display(df)

Unnamed: 0,song,year,artist,genre,lyrics
0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...
1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...
2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...
...,...,...,...,...,...
218205,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...
218206,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...
218207,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...
218208,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...


In [5]:
display(df.isnull().sum())
df[df.isnull().any(axis=1)]

song      1
year      0
artist    0
genre     0
lyrics    0
dtype: int64

Unnamed: 0,song,year,artist,genre,lyrics
116724,,2009,booker-t-and-the-mg-s,Jazz,All right people the rest of the hard working ...


Un seul NaN dans tout le tableau, on peut le drop sans aucun problème car nous avons beaucoup de données.

In [6]:
df = df.drop(index= 116724)


In [7]:
display(df.value_counts(df['genre']))

genre
Rock          100053
Pop            34137
Hip-Hop        22654
Metal          21210
Country        14158
Jazz            7309
Electronic      6942
Other           3786
R&B             3336
Indie           2935
Folk            1689
dtype: int64

Le nombre de chansons des différents genres varie beaucoup, quelques uns en ont beaucoup plus que d'autres, particulièrement Rock.
Si on entraîne un modèle sur ces données, on risquerait d'avoir un gros biais pour ces genres. On devrait donc rendre le set un peu plus 'équilibré'. Pour que ce soit réellement équilibré on doit perdre beaucoup d'entrées, mais j'espère que 1500 par genre reste assez.

Pour rendre le tout plus rapide, on utilise temporairement(?) 15 de chaque genre. Sinon on aurait une sparse matrix énorme, plusieur milliers de colonnes. On pourra upscale plus tard et/ou travailler avec des one-hot encodings.

In [8]:
df_sample = df.groupby('genre').sample(1500, random_state = 42)
#tout en minuscules
df_sample['lyrics'] = df_sample['lyrics'].str.lower()

In [9]:
#Mots inutiles/vides de sens et très communs

stopwords = nltk.corpus.stopwords.words('english')

df_sample['tokenized'] = df_sample.apply(lambda row: nltk.word_tokenize(row['lyrics']), axis=1)
df_sample['tokenized'] = df_sample['tokenized'].apply(lambda x : [word for word in x if (word not in stopwords and len(word) > 1  and word[0] != "'")])


In [10]:
#On lemmatise puis prend le radical pour éviter les différentes formes conjuguées

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def stem_text(text):
    return [stemmer.stem(w) for w in text]
def lemmatize_text(text):    
    return [lemmatizer.lemmatize(w) for w in text]

df_sample['lemmatized'] = df_sample['tokenized'].apply(stem_text)
df_sample['lemmatized'] = df_sample['lemmatized'].apply(lemmatize_text)
df_sample.drop(columns=['tokenized'], inplace=True)

In [11]:
vectorizer = TfidfVectorizer(lowercase=False)


In [12]:
"""classifier=pipeline.Pipeline([
        ('tfidf_vectorizer', TfidfVectorizer(lowercase=False)),
        ('rf_classifier', ensemble.RandomForestClassifier(verbose=1,n_jobs=-1))]) #default 100 trees, peut tester 500
        
        """

"classifier=pipeline.Pipeline([\n        ('tfidf_vectorizer', TfidfVectorizer(lowercase=False)),\n        ('rf_classifier', ensemble.RandomForestClassifier(verbose=1,n_jobs=-1))]) #default 100 trees, peut tester 500\n        \n        "

In [13]:
X_train, X_test, y_train, y_test = tts(df_sample['lemmatized'], df_sample['genre'], test_size=0.2)

In [14]:
#classifier.fit(X_train, y_train)

In [15]:
words = df_sample['lemmatized'].values
vocab = set(list(itertools.chain.from_iterable(words)))
print(len(vocab))

36746


In [16]:
def list_to_str(text):
    return ' '.join(e for e in text)
df_sample['lemmas'] = df_sample['lemmatized'].apply(list_to_str)

In [17]:

tfidf_encodings = vectorizer.fit_transform(df_sample['lemmas'])
df_sample['tfidf'] = list(tfidf_encodings.toarray())

In [18]:
vectors_for_training = np.array(df_sample['tfidf'].tolist()) #get the vectors back out of the dataframe for use in something else
X_train, X_test,y_train, y_test = tts(vectors_for_training, df_sample['genre'].tolist(), test_size =0.2)

In [19]:
clf = ensemble.RandomForestClassifier(n_estimators=100,verbose=1)

In [20]:
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.9min finished


RandomForestClassifier(verbose=1)

In [21]:
y_pred=clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


In [22]:


metrics.accuracy_score(y_test,y_pred)



0.3390909090909091