## WEM project: JVC

#### Project members: Campos Carvalho Cédric, Feuillade Florian, Ramosaj Nicolas

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer
from nltk.corpus import sentiwordnet as swn
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline



#### Load the dataset

In [2]:
# Read the review comments into a DataFrame and preview the first rows.
comments_path = r'data/comments_elden.json'
dataset = pd.read_json(comments_path)
dataset.head()

Unnamed: 0,player,date,grade,comment
0,Torzeka,2022-02-28 20:54:00,20,"Fan inconditionnel de FS, j’attendais ER comme..."
1,Ranni_la_Witch,2022-03-24 01:02:00,20,Tout d'abord précision importante: le jeu a ét...
2,Sworm_cod,2022-03-20 16:40:00,11,"franchement, un jeu ou si tu ne parles pas a t..."
3,Vergilesco,2022-02-25 11:03:00,9,"Portage honteux sur PC, du stuttering sur une ..."


#### Function to convert a sentiment score in the range [-1, 1] to a grade in the range [0, 20]

In [3]:
def convertToGrade(percentage):
    """Map a sentiment score onto the 0-20 grading scale.

    The mapping is piecewise linear and anchored at 12 for a score of 0:
    negative scores are spread over the 12 points below the anchor and
    non-negative scores over the 8 points above it, so that -1 -> 0,
    0 -> 12 and 1 -> 20. The result is rounded up with ``np.ceil``.

    Arguments:
    percentage      - sentiment score, expected in [-1, 1] (not clamped)

    Returns:
    grade           - the rounded-up grade, as returned by ``np.ceil``
    """
    # Only the slope differs between the two halves; the intercept is shared.
    slope = 12 if percentage < 0 else 8
    return np.ceil(slope * percentage + 12)

#### Sentiment analysis with TextBlob

In [4]:
# Sentiment analysis using TextBlob; the polarity range is [-1, 1]
# Dedicated for French: https://github.com/sloria/textblob-fr
# The tagger and analyzer are built once and shared by every comment instead of
# being re-instantiated for each one.
french_tagger = PatternTagger()
french_analyzer = PatternAnalyzer()
y_pred_textblob = [
    TextBlob(sent, pos_tagger=french_tagger, analyzer=french_analyzer).sentiment
    for sent in dataset['comment']
]

# Each prediction is indexed as (polarity, subjectivity).
# Plain loops are used for printing so the cell does not end with a throwaway
# list of None values; zip pairs players and predictions by position.
for player, y_pred in zip(dataset['player'], y_pred_textblob):
    print('comment from', player, 'is', convertToGrade(y_pred[0]))
for player, y_pred in zip(dataset['player'], y_pred_textblob):
    print('comment from', player, 'is', y_pred[1])

comment from Torzeka is 14.0
comment from Ranni_la_Witch is 14.0
comment from Sworm_cod is 12.0
comment from Vergilesco is 11.0
comment from Torzeka is 0.4046739130434782
comment from Ranni_la_Witch is 0.5296884128529699
comment from Sworm_cod is 0.35833333333333334
comment from Vergilesco is 0.4708333333333334


[None, None, None, None]

In [5]:
# https://textblob.readthedocs.io/en/dev/
# Based on the English version (TextBlob's default analyzer)
# NOTE: this rebinds y_pred_textblob, replacing the French predictions computed
# in the previous cell.
y_pred_textblob = [TextBlob(sent).sentiment for sent in dataset['comment']]

# Plain loops are used for printing so the cell does not end with a throwaway
# list of None values; zip pairs players and predictions by position.
for player, y_pred in zip(dataset['player'], y_pred_textblob):
    print('comment from', player, 'is', convertToGrade(y_pred.polarity))
for player, y_pred in zip(dataset['player'], y_pred_textblob):
    print('comment from', player, 'is', y_pred.subjectivity)

comment from Torzeka is 12.0
comment from Ranni_la_Witch is 15.0
comment from Sworm_cod is 13.0
comment from Vergilesco is 0.0
comment from Torzeka is 0.6799999999999999
comment from Ranni_la_Witch is 0.7520833333333332
comment from Sworm_cod is 0.7857142857142857
comment from Vergilesco is 1.0


[None, None, None, None]

#### Sentiment analysis with Vader

In [6]:
# Sentiment analysis using Vader; the compound score range is [-1, 1]
# https://github.com/cjhutto/vaderSentiment
# (the analyzer used here is the one imported from the vaderSentiment_fr package)
vader_analyser = SentimentIntensityAnalyzer()
y_pred_vader = [vader_analyser.polarity_scores(sent) for sent in dataset['comment']]

# A plain loop is used for printing so the cell does not end with a throwaway
# list of None values; zip pairs players and predictions by position.
for player, y_pred in zip(dataset['player'], y_pred_vader):
    print('comment from', player, 'is', convertToGrade(y_pred['compound']))

comment from Torzeka is 20.0
comment from Ranni_la_Witch is 20.0
comment from Sworm_cod is 19.0
comment from Vergilesco is 3.0


[None, None, None, None]

#### Sentiment analysis with NLTK

In [7]:
def lemma_newtag(taggedterm):
    """Lemmatize a POS-tagged token and convert its tag to the SentiWordNet form.

        Arguments:
        taggedterm      - input tuple of token and NLTK POS tag

        Returns:
        lemma           - the lemma of the token
        newtag          - the pos tag of the token in the sentiwordnet form
                          ('n', 'a', 'v' or 'r'), or '' when the NLTK tag has
                          no equivalent
        """
    token, tag = taggedterm[0], taggedterm[1]

    # Map the NLTK tag prefix to the single-letter SentiWordNet tag.
    if tag.startswith('NN'):
        newtag = 'n'
    elif tag.startswith('JJ'):
        newtag = 'a'
    elif tag.startswith('V'):
        newtag = 'v'
    elif tag.startswith('R'):
        newtag = 'r'
    else:
        newtag = ''

    wnl = nltk.WordNetLemmatizer()
    # lemmatize() treats the word as a noun unless a pos is given, which leaves
    # inflected verbs and adjectives untouched. Pass the converted tag when one
    # is available; otherwise keep the default behaviour.
    if newtag:
        lemma = wnl.lemmatize(token, pos=newtag)
    else:
        lemma = wnl.lemmatize(token)
    return lemma, newtag

In [8]:
def swn_score(taggedsent):
    """Score the words of a POS-tagged sentence with SentiWordNet.

    Arguments:
    taggedsent      - iterable of (token, NLTK POS tag) tuples

    Returns:
    score_list      - dict mapping each lemma to the mean of
                      (pos_score - neg_score) over all of its SentiWordNet
                      synsets. Words whose tag has no SentiWordNet form, or
                      that have no synsets, are left out; a lemma seen several
                      times keeps the value from its last occurrence.
    """
    lemma_scores = {}
    for tagged_token in taggedsent:
        lemma, swn_tag = lemma_newtag(tagged_token)
        if swn_tag == '':
            continue

        senses = list(swn.senti_synsets(lemma, swn_tag))
        if len(senses) == 0:
            continue

        # Average the polarity over every sense of the lemma.
        total = 0
        for sense in senses:
            total += sense.pos_score() - sense.neg_score()
        lemma_scores[lemma] = total / len(senses)
    return lemma_scores

In [9]:
def sentiwordnet_sentiment_analysis(corpus, use_synsets_mean=True):
    """Label every document of a corpus 'pos' or 'neg' using SentiWordNet.

    Each document is split into sentences, tokenized and POS-tagged with NLTK,
    then scored word by word with swn_score. The per-sentence score dicts are
    merged into one dict per document, so a lemma appearing in several
    sentences is counted once.

    Arguments:
    corpus              - iterable of document strings
    use_synsets_mean    - accepted but not used by this function

    Returns:
    labels              - list with 'pos' when the summed word scores of a
                          document are strictly positive, 'neg' otherwise
    scores              - list with one {lemma: score} dict per document

    NOTE(review): sent_tokenize, word_tokenize and pos_tag are called with
    their default settings - confirm these suit the language of the corpus.
    """
    labels = []
    scores = []

    for document in corpus:
        tagged_sentences = [
            nltk.pos_tag(tokens)
            for tokens in [nltk.word_tokenize(sentence)
                           for sentence in nltk.sent_tokenize(document)]
        ]

        document_scores = {}
        for tagged_sentence in tagged_sentences:
            document_scores.update(swn_score(tagged_sentence))

        is_positive = sum(document_scores.values()) > 0
        labels.append('pos' if is_positive else 'neg')
        scores.append(document_scores)
    return labels, scores

predicted_labels, scores = sentiwordnet_sentiment_analysis(dataset['comment'])

# A failure in the analysis surfaces at the call above with its full traceback,
# so the labels are printed directly rather than behind a bare `except:` that
# could only ever guard the print itself.
print(predicted_labels)

['neg', 'neg', 'neg', 'neg']


#### Naive approach: extract the words of each comment and score it from its counts of positive and negative words

In [10]:
def _load_lexicon(path, first_word):
    """Read a word-list file and return its words as a list.

    Everything that precedes the first occurrence of `first_word` in the file
    is discarded (presumably a header - confirm against the files); the rest is
    read as one word per line.

    Arguments:
    path            - path of the word-list file
    first_word      - the first word of the list, used to locate where it starts

    Returns:
    words           - list of non-empty words, starting with `first_word`

    Raises:
    ValueError      - if `first_word` does not appear in the file
    """
    # Undecodable bytes are dropped (errors='ignore'), as no encoding is given.
    with open(path, errors='ignore') as lexicon_file:
        contents = lexicon_file.read()

    # partition() cuts at the FIRST occurrence only, so the list is not
    # truncated if the marker text shows up again further down the file.
    _, marker, tail = contents.partition(first_word)
    if not marker:
        raise ValueError(f"{first_word!r} not found in {path!r}")

    # Surrounding whitespace is stripped so stray spaces cannot prevent a match.
    candidates = [first_word] + [line.strip() for line in tail.split('\n')]
    return [word for word in candidates if len(word) > 0]


positive_words = _load_lexicon('Words/positive-words.txt', 'a+')
negative_words = _load_lexicon('Words/negative-words.txt', '2-faced')

In [11]:
def naiveScore(corpus, positive_lexicon=None, negative_lexicon=None):
    """Score each document from its counts of positive and negative words.

    A document's score is (P - N) / (P + N), where P and N are the numbers of
    distinct tokens of the document found in the positive and negative
    lexicons. Matching is exact (case-sensitive) on the tokens produced by
    word_tokenize.

    Arguments:
    corpus              - iterable of document strings
    positive_lexicon    - iterable of positive words; defaults to the
                          module-level positive_words
    negative_lexicon    - iterable of negative words; defaults to the
                          module-level negative_words

    Returns:
    scores              - list with one score in [-1, 1] per document, 0 when
                          the document contains no lexicon word
    """
    # Build the lookup sets once instead of rescanning the lexicon lists for
    # every document.
    positive_set = frozenset(positive_words if positive_lexicon is None else positive_lexicon)
    negative_set = frozenset(negative_words if negative_lexicon is None else negative_lexicon)

    scores = []
    for document in corpus:
        document_words = set(word_tokenize(document))
        n_positive = len(document_words & positive_set)
        n_negative = len(document_words & negative_set)
        n_matched = n_positive + n_negative
        # Guard against division by zero when no lexicon word is present.
        score = (n_positive - n_negative) / n_matched if n_matched != 0 else 0
        scores.append(score)
    return scores
    
y_pred_naive = naiveScore(dataset['comment'])

# A plain loop is used for printing so the cell does not end with a throwaway
# list of None values; zip pairs players and predictions by position.
for player, y_pred in zip(dataset['player'], y_pred_naive):
    print('comment from', player, 'is', convertToGrade(y_pred))

comment from Torzeka is 8.0
comment from Ranni_la_Witch is 12.0
comment from Sworm_cod is 12.0
comment from Vergilesco is 0.0


[None, None, None, None]

#### Sentiment analysis with transformer

In [12]:
# Build a sentiment-analysis pipeline from the tblard/tf-allocine checkpoint.
# https://huggingface.co/tblard/tf-allocine and https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
# Limited to 512 tokens
checkpoint = "tblard/tf-allocine"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

All the layers of TFCamembertForSequenceClassification were initialized from the model checkpoint at tblard/tf-allocine.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertForSequenceClassification for predictions without further training.


In [13]:
# Disabled: classify every comment of the dataset.
# NOTE(review): presumably switched off because of the 512-token limit noted
# where the model is loaded - TODO confirm.
#[print('comment from', dataset['player'][i], 'is', nlp(comment)) for i, comment in enumerate(dataset['comment'])]

# Classify a single comment of the dataset (the one labelled 3).
sample_player = dataset['player'][3]
sample_label = nlp(dataset['comment'][3])[0]['label']
print('comment from', sample_player, 'is', sample_label)

# Two short hand-written checks; the expected labels are POSITIVE then NEGATIVE.
for sanity_text in ("Juste whoaaahouuu !", "NUL...A...CHIER ! FIN DE TRANSMISSION."):
    print(nlp(sanity_text))

comment from Vergilesco is NEGATIVE
[{'label': 'POSITIVE', 'score': 0.9862489104270935}]
[{'label': 'NEGATIVE', 'score': 0.9974060654640198}]
