In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
df = pd.read_csv('data/Emotion_final.csv') ## import du dataset

### Stopwords

In [26]:
from collections import Counter
all_text = ' '.join(df.Text)
words = all_text.split()
word_counts = Counter(words)
'" "'.join([word for word, count in word_counts.most_common(30)]) 
## vvv Listes des 30 mots les plus courants du corpus

'i" "feel" "and" "to" "the" "a" "of" "that" "feeling" "my" "in" "it" "like" "was" "so" "for" "im" "me" "but" "have" "is" "with" "this" "am" "not" "about" "be" "as" "on" "you'

    ^ Liste des 30 mots les plus courants du corpus

Gestion des stopwords

In [4]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/apprenant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
manual_filter = ['feel', 'feeling', 'like', ',', 'really', 'know', 'get', 'would', 'time', 'little', 'ive', 'still', 'even', 'want', 'life', 'way', 'could', 'back', 'make', 'time', 'going', 'know', 'im', 'one', 'bit', 'much', 'dont', 'day', 'think', 'one', 'always', 'people', 'things', 'something', 'today', 'go', 'see', 'work', 'cant', 'say', 'never', 'didnt', 'made', 'someone', 'many', 'pretty', 'right', 'felt', 'feelings', 'though', 'also', 'need', 'every', 'lot', 'around', "'s", 'look', 'every', 'new', 'year', 'able', 'got', 'also', 'less', 'feels', 'home', 'last', 'days', 'come', 'actually', 'makes']
## Liste des mots qui ne sont pas dans les stopwords de NLTK mais que l'on souhaite tout de même retirer pour l'analyse

In [50]:
words_by_emotion = dict()
for emotion in df.Emotion.unique():  # Pour chaque émotion on va vérifier quels mots sont les plus fréquents en dehors des stopwords
    
    df_temp = df[df.Emotion == emotion]     # Filtre le dataset sur l'emotion itérée
    words = ' '.join(df_temp.Text).split()      # Mets tous les mots de la colonne text dans une liste
    
    filtered_words = [word for word in words if word.lower() not in stop_words]     
    filtered__words = [word for word in filtered_words if word.lower() not in manual_filter]    
        # Retire tous les stopwords et le filtre écrit manuellement
        
    word_counts = Counter(filtered__words)  # Compte les mots par occurences
    
    words_by_emotion[emotion] = (", ".join([word for word, count in word_counts.most_common(30)])).split(', ') 
    # mets les 30 premiers résultats dans le dictionnaire pour chaque emotion iterée

In [52]:
print(f'\033[94m  Sadness :', words_by_emotion['sadness'], '\n',  '\033[91m Anger :', words_by_emotion['anger'], '\n',  '\033[95m Love :', words_by_emotion['love'], '\n',  '\033[90m Surprise :', words_by_emotion['surprise'], '\n',  '\033[92m Fear :', words_by_emotion['fear'], '\n',  '\033[93m Happy :', words_by_emotion['happy'], '\n',  '\033[0m')
## vvv Mots les plus fréquents pour chaque émotion hors stop words

[94m  Sadness : ['love', 'sad', 'alone', 'bad', 'depressed', 'good', 'miserable', 'kind', 'lost', 'left', 'help', 'sorry', 'stupid', 'guilty', 'without', 'stressed', 'lonely', 'exhausted', 'hurt', 'said', 'friends', 'ashamed', 'devastated', 'away', 'sometimes', 'low', 'pain', 'week', 'punished', 'http'] 
 [91m Anger : ['angry', 'offended', 'resentful', 'cold', 'irritable', 'bothered', 'greedy', 'mad', 'insulted', 'irritated', 'pissed', 'violent', 'annoyed', 'hated', 'dissatisfied', 'fucked', 'rude', 'bitchy', 'cranky', 'frustrated', 'rushed', 'dangerous', 'stressed', 'selfish', 'bitter', 'disgusted', 'distracted', 'agitated', 'jealous', 'love'] 
 [95m Love : ['love', 'sweet', 'loving', 'caring', 'passionate', 'sympathetic', 'liked', 'hot', 'tender', 'lovely', 'longing', 'loved', 'accepted', 'nostalgic', 'gentle', 'horny', 'naughty', 'romantic', 'blessed', 'supporting', 'loyal', 'supportive', 'beloved', 'fond', 'generous', 'delicate', 'faithful', 'towards', 'good', 'friends'] 
 [90m

In [48]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

emotions_counts = df.Emotion.value_counts()

##############################################
# Pour chaque sentiment, identifiez les 30 mots les plus courants en dehors des stopwords
words_by_emotion = {}
for emotion in df.Emotion.unique():
    texts = df[df.Emotion == emotion]['Text']
    vectorizer = CountVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform(texts)
    word_counts = list(zip(vectorizer.get_feature_names(), X.sum(axis=0).tolist()[0]))
    word_counts = sorted(word_counts, key=lambda x: x[1], reverse=True)
    top_words = [word for word, count in word_counts if word not in stop_words][:30]
    words_by_emotion[emotion] = top_words

# Définissez une matrice de similarité entre les sentiments
sentiments = df['Emotion'].unique()
similarity_matrix = cosine_similarity(X)

# Affichez la matrice de similarité sous forme de heatmap
sns.heatmap(similarity_matrix, xticklabels=sentiments, yticklabels=sentiments, cmap='coolwarm', annot=True)
plt.xlabel('Sentiments')
plt.ylabel('Sentiments')
plt.title('Similarité entre les sentiments')
plt.show()


Emotion
happy       7029
sadness     6265
anger       2993
fear        2652
love        1641
surprise     879
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/apprenant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {"needn't", 'wasn', 'd', 'other', 'very', 'and', 'hers', "mustn't", 'shouldn', 'during', 'yourselves', 'an', 'same', 'had', 'yourself', 'down', 'where', 'they', 'them', "shouldn't", 'were', 'above', 'than', "mightn't", 'after', 'being', 'so', 'was', 'couldn', 'what', 'i', 'ours', 'weren', 'further', 'any', 'to', 'you', 'themselves', 'in', 'o', 'mightn', 'do', 'their', 'won', 's', 'why', "aren't", 'by', 'does', 'will', 'against', 'should', 'be', 'been', 'we', 'which', 'few', 'those', 'that', 'through', "wouldn't", "should've", 'more', 'with', "isn't", 'hasn', 'its', 'now', "you've", 'once', 'll', "wasn't", 'nor', 'how', 'y', "you'd", 'off', "weren't", 'doesn', 'there', 'ma', 'again', 'theirs', 'our', 'before', 'until', 're', 'her', 'of', 'both', "hadn't", 'out', 'most', 've', 'don', 'all', 'hadn', "doesn't", 'such', 'too', 'whom', 'no', 'can', 'mustn', 'own', 'herself', 'ourselves', 'each', 'wouldn', 'isn', "don't", 'has', 'over', 'himself', 'under', 'she', 'while', 'him', 'about', "shan't", 'for', 'doing', 'your', 'myself', "hasn't", 'is', 'if', 'ain', 'haven', 'not', 'when', 'have', 'here', 'or', 'from', 'because', 'needn', 'but', "you're", "it's", 'it', 'just', 'as', 'below', 'having', 'the', 'between', 'me', 'shan', 'then', 'at', 'a', 'his', 'these', "haven't", 'yours', "won't", 'am', 'into', 't', "couldn't", 'aren', 'itself', 'my', "you'll", "she's", 'didn', 'up', 'are', 'did', "that'll", 'some', 'only', 'he', 'm', 'on', 'this', "didn't", 'who'} instead.