In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the NLTK resources this notebook relies on: tokenizer models,
# the stop-word lists and the WordNet corpus.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases; 'punkt' is still requested so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [3]:
# Location of the lyrics file, relative to the working directory.
file_path = 'RapLyrics-Scraper-master/lyrics_US/Eminem_lyrics.txt'

# Read the whole file as UTF-8 text (read mode is spelled out explicitly).
with open(file_path, mode='r', encoding='utf-8') as file:
    lyrics = file.read()

# One entry per newline-delimited line of the file.
lines = lyrics.split(sep='\n')

In [4]:
# English stop words, held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}
# Module-level NLTK lemmatizer and stemmer instances.
lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()

# Text preprocessing function
def preprocess_text(text):
    """Tokenize ``text`` and return the Porter stems of its content words.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; each remaining token is stemmed with the
    module-level ``stemmer``.

    Args:
        text: The string to process.

    Returns:
        A list of stemmed tokens, in the order they appear in ``text``.
    """
    tokens = nltk.word_tokenize(text.lower())
    # Only stems are returned, so no lemmatization pass is run here: its
    # result would be discarded and it would cost one lookup per token.
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Apply the preprocessing to every line; yields one token list per line.
cleaned_lines = list(map(preprocess_text, lines))

In [None]:
# Tally every token across all preprocessed lines in a single pass.
word_counts = Counter(token for tokens in cleaned_lines for token in tokens)

# Show the 50 most common tokens with their counts.
print("Mots les plus fréquents dans les paroles :", word_counts.most_common(50), sep="\n")

In [None]:
# Turn the per-line token lists back into one space-separated string:
# tokens are joined within each line, then the lines are joined together.
joined_lines = (' '.join(tokens) for tokens in cleaned_lines)
text = ' '.join(joined_lines)

# Helper that displays a word cloud
def plot_word_cloud(text, title):
    """Draw a word cloud built from ``text`` under the heading ``title``.

    The cloud is a 600x600 image on a white background, limited to 200 words,
    with collocations disabled and a fixed ``random_state`` of 42. The
    module-level ``stop_words`` set is passed to ``WordCloud`` as its
    stop-word list. The figure is shown with ``plt.show()``; nothing is
    returned.

    Args:
        text: The string the word cloud is generated from.
        title: Title displayed above the image.
    """
    cloud_options = dict(
        width=600,
        height=600,
        background_color='white',
        max_words=200,
        stopwords=stop_words,
        max_font_size=90,
        collocations=False,
        random_state=42,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate(text)

    # Same drawing steps as the pyplot shortcuts, addressed to the axes directly.
    _, axes = plt.subplots(figsize=(10, 7))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')
    axes.set_title(title, fontsize=10, pad=10)
    plt.show()

# Render the word cloud for the joined, preprocessed lyrics.
plot_word_cloud(text=text, title='Nuage de mots des paroles d\'Eminem')

In [None]:
# Table of the 20 most common tokens and their counts.
word_freq_df = pd.DataFrame(
    data=word_counts.most_common(20),
    columns=['Word', 'Frequency'],
)

# Horizontal bar chart: one bar per word, bar length = frequency.
plt.figure(figsize=(12, 8))
sns.barplot(data=word_freq_df, y='Word', x='Frequency')
plt.gca().set_title('20 mots les plus fréquents')
plt.show()

In [None]:
# Length of each line, measured in whitespace-separated words.
# NOTE(review): an empty line splits to no words and is counted with length 0;
# confirm that including such lines in the distribution is intended.
line_lengths = [len(words) for words in map(str.split, lines)]

# Histogram of the line lengths with a KDE curve overlaid.
plt.figure(figsize=(12, 8))
sns.histplot(line_lengths, kde=True)
plt.gca().set(
    title='Distribution de la longueur des phrases',
    xlabel='Longueur des phrases',
    ylabel='Fréquence',
)
plt.show()

In [None]:
# Rebinds file_path to an absolute location and passes it to process_file.
# NOTE(review): process_file is not defined or imported in any cell visible
# here; unless it is defined elsewhere, this call raises NameError on a fresh
# kernel — confirm where it is supposed to come from.
# NOTE(review): this overwrites the relative file_path set in the loading
# cell, so re-running that cell's `open(file_path, ...)` afterwards would
# read from this absolute path instead — confirm that is intended.
file_path = '/mnt/data/Eminem_lyrics.txt'
process_file(file_path)