## WEM project: JVC

#### Project's members:  Campos Carvalho Cédric, Feuillade Florian, Ramosaj Nicolas

In [1]:
import pandas as pd
import numpy as np
import nltk
from tools.read import get_data
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob_fr import PatternTagger, PatternAnalyzer
from nltk.corpus import sentiwordnet as swn
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

%load_ext autoreload
%autoreload 2



#### Load the dataset

In [2]:
dataset = get_data('data/dataset500.csv')
data = [(dataset['comments'][i][0]['comment'][0], dataset['comments'][i][0]['username'][0], dataset['comments'][i][0]['grade'][0]) for i in range(len(dataset))]

#### Function to convert range[-1, 1] to [0, 20]

In [3]:
def convertToGrade(percentage):
    if percentage < 0:
        grade = np.ceil(12 * percentage + 12)
    else:
        grade = np.ceil(8 * percentage + 12)
    return grade

#### Sentiment analysis with BlobText

In [4]:
# Sentiment analysis using TextBlob range is [-1, 1]
# Dedicated for French: https://github.com/sloria/textblob-fr
y_pred_textblob = [TextBlob(comment[0], pos_tagger=PatternTagger(), analyzer=PatternAnalyzer()).sentiment for comment in data]
for i, y_pred in enumerate(y_pred_textblob): print('comment from', data[i][1], 'is', convertToGrade(y_pred[0]))
for i, y_pred in enumerate(y_pred_textblob): print('subjectivity from', data[i][1], 'is', y_pred[1]) 

comment from Platon_Socrate is 11.0
comment from Ranni_la_Witch is 14.0
comment from Chichariito is 14.0
comment from Chichariito is 14.0
comment from Dark--Ash is 15.0
comment from GranTurismoIV is 13.0
comment from magohax is 14.0
comment from Nynovac is 14.0
comment from ArudBard is 14.0
comment from jay_ricanay is 13.0
comment from PsykoBlind71 is 14.0
comment from Pxwer is 12.0
comment from NewPlayer67130 is 18.0
comment from Ct7274 is 14.0
comment from Lapa2015 is 13.0
comment from Man-of-location is 13.0
comment from Poonex is 14.0
comment from corentin199402 is 13.0
comment from Amaranthine is 13.0
comment from hisoka1977 is 13.0
comment from tristangdf is 12.0
comment from ohker is 13.0
comment from leflic03 is 15.0
comment from Pharos_Zap is 12.0
comment from zhywen30 is 10.0
comment from Adiego-danrrun is 13.0
comment from flodino is 13.0
comment from ModoWoke7 is 12.0
comment from kiki261 is 13.0
comment from DraiiZ is 16.0
comment from Raptor-jazz77 is 14.0
comment from yT

In [5]:
rb_high = [(data[i][0], data[i][2]) for i, pred in enumerate(y_pred_textblob) if pred[1] < 0.55 and pred[1] > 0.45 and data[i][2] > 19]
rb_high

[('Tout d\'abord précision importante: le jeu a été patché sur PC, les problèmes d\'opti ne sont plus aussi présents qu\'avant, il y a clairment une énorme différence avec les MàJ récentes. La sortie a été honteuse, il est plus que normal de pester contre ça car c\'est inadmissible. Je suis le premier à "rager" et m\'offusquer de ça. Hélas FS sont ni les premiers ni les derniers dans la catégorie "foutage de gueule".. C\'est légion sur 80% des jeux qui sortent sur PC malheureusement.. et ce depuis toujours. Bref ! Désormais ça appartient au passé et il faut rendre hommage à ce fantastique jeu qui ne mérite de façon objective que des éloges.Je viens de le finir à l\'instant. 160H, je pense avoir tout fait.Quel jeu.. Mais quel jeu.. Des surprises et des émerveillement de bout en bout. J\'ai jamais pris autant de plaisir dans une aventure comme celle là. Clairement l\'après Elden Ring va être dur pour ce qui est des autres productions du genre. Le jeu est infini, regorge de secrets, de po

In [6]:
rb_low = [(data[i][0], data[i][2]) for i, pred in enumerate(y_pred_textblob) if pred[1] < 0.55 and pred[1] > 0.45 and data[i][2] < 2]
rb_low

[('Jeu nul mouvement flou des personnages peu styles le mec qui a mis 20 est louche les graphisme sont horible dur à prendre en main  les touche sont incompréhensible',
  0),
 ("Quel dommage ! Ce célèbre univers de Robert E. Howard réduit à un jeu sandbox....Passons l'aspect graphique qui est immonde ! 80go pour ce jeu... On a vu mieux pour cette taille ! C'est façonner comme un Skyrim dans les interfaces mais en encore plus bordelique... Le nombre hallucinant de commande, j'imagine pas l'ennui que ça doit être sur PC. Passer son temps à se baisser pour ramasser des pierres et des bouts de bois, sérieusement.... Est-ce que c'est Conan tout ça ? Pas d'histoire, pas de scénario. Des objectifs terre à terre,  crafter crafter, manger, boire, dormir... Quel ennui ! Il y a tout une peltée de jeu dans ce principe, c'était pas la peine de remettre le couvert chez Conan.\nC'est très mauvais ! Effectivement on l'a l'exil... C'est très très exilé de l'univers.\nAllez, je vais me relire les comics

In [7]:
# https://textblob.readthedocs.io/en/dev/
# Based on English version
y_pred_textblob = [TextBlob(comment[0]).sentiment for comment in data]
for i, y_pred in enumerate(y_pred_textblob): print('comment from', data[i][1], 'is', convertToGrade(y_pred.polarity))
for i, y_pred in enumerate(y_pred_textblob): print('comment from', data[i][1], 'is', y_pred.subjectivity)

comment from Platon_Socrate is 12.0
comment from Ranni_la_Witch is 14.0
comment from Chichariito is 14.0
comment from Chichariito is 14.0
comment from Dark--Ash is 20.0
comment from GranTurismoIV is 10.0
comment from magohax is 12.0
comment from Nynovac is 15.0
comment from ArudBard is 14.0
comment from jay_ricanay is 12.0
comment from PsykoBlind71 is 14.0
comment from Pxwer is 12.0
comment from NewPlayer67130 is 15.0
comment from Ct7274 is 17.0
comment from Lapa2015 is 15.0
comment from Man-of-location is 12.0
comment from Poonex is 13.0
comment from corentin199402 is 16.0
comment from Amaranthine is 12.0
comment from hisoka1977 is 13.0
comment from tristangdf is 12.0
comment from ohker is 13.0
comment from leflic03 is 16.0
comment from Pharos_Zap is 5.0
comment from zhywen30 is 4.0
comment from Adiego-danrrun is 13.0
comment from flodino is 8.0
comment from ModoWoke7 is 10.0
comment from kiki261 is 12.0
comment from DraiiZ is 12.0
comment from Raptor-jazz77 is 16.0
comment from yTxmv

#### Sentiment analysis with Vader

In [8]:
# Sentiment analysis using Vader range is [-1, 1]
# https://github.com/cjhutto/vaderSentiment
vader_analyser = SentimentIntensityAnalyzer()
y_pred_vader = [vader_analyser.polarity_scores(comment[0]) for comment in data]

for i, y_pred in enumerate(y_pred_vader): print('comment from', data[i][1], 'is', convertToGrade(y_pred['compound']))

comment from Platon_Socrate is 18.0
comment from Ranni_la_Witch is 20.0
comment from Chichariito is 20.0
comment from Chichariito is 20.0
comment from Dark--Ash is 20.0
comment from GranTurismoIV is 20.0
comment from magohax is 19.0
comment from Nynovac is 19.0
comment from ArudBard is 20.0
comment from jay_ricanay is 10.0
comment from PsykoBlind71 is 20.0
comment from Pxwer is 2.0
comment from NewPlayer67130 is 19.0
comment from Ct7274 is 20.0
comment from Lapa2015 is 20.0
comment from Man-of-location is 20.0
comment from Poonex is 20.0
comment from corentin199402 is 3.0
comment from Amaranthine is 20.0
comment from hisoka1977 is 2.0
comment from tristangdf is 2.0
comment from ohker is 20.0
comment from leflic03 is 13.0
comment from Pharos_Zap is 9.0
comment from zhywen30 is 16.0
comment from Adiego-danrrun is 20.0
comment from flodino is 20.0
comment from ModoWoke7 is 1.0
comment from kiki261 is 20.0
comment from DraiiZ is 19.0
comment from Raptor-jazz77 is 20.0
comment from yTxmv is

In [9]:
rb_high = [(data[i][0], data[i][2]) for i, pred in enumerate(y_pred_vader) if pred['compound'] < 0.1 and pred['compound'] > -0.1 and data[i][2] > 19]
rb_high 

[("Je n'ai pas pour habitude de taper un 20/20 pour contrer les 0 pointés d'autres trolls (rambour3 pour ne pas le citer), mais donner une note aussi basse juste parce qu'on a une sensibilité trop importante suite à une exposition à la VR, ce qui n'est pas le cas de tout le monde, et sans donner un avis subjectif sur ledit jeu, je trouve ça aberrant.",
  20)]

In [10]:
rb_low = [(data[i][0], data[i][2]) for i, pred in enumerate(y_pred_vader) if pred['compound'] < 0.1 and pred['compound'] > -0.1 and data[i][2] < 2]
rb_low 

[('000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
  0)]

#### Sentiment analysis with NLTK

In [11]:
def lemma_newtag(taggedterm):
    """ we need to convert the output of the POS tagger in terms of tag names to correspond to the input of sentiwordnet
        Arguments:
        term            - input tuple of token and NLTK POS 
  
        
        Returns:
        lemma           - the lemma of the token
        newtag          - the pos tag of the token in the sentiwordnet form
        """ 
    wnl = nltk.WordNetLemmatizer()
    lemma=wnl.lemmatize(taggedterm[0])
    if taggedterm[1].startswith('NN'):
            newtag='n'
    elif taggedterm[1].startswith('JJ'):
            newtag='a'
    elif taggedterm[1].startswith('V'):
            newtag='v'
    elif taggedterm[1].startswith('R'):
            newtag='r'
    else:
            newtag=''
    return lemma,newtag

In [12]:
def swn_score(taggedsent):
    score_list={}
    for term in taggedsent:
        lemma,newtag=lemma_newtag(term) 
        if(newtag!=''): 
            #BEGIN REMOVE   
            synsets = list(swn.senti_synsets(lemma, newtag))
            #Getting average of all possible sentiments, as you requested 
            if(len(synsets)>0):
                score=0
                for syn in synsets:
                    score+=syn.pos_score()-syn.neg_score()
                score_list[lemma]=score/len(synsets)
            #END REMOVE
    return score_list

In [13]:
def sentiwordnet_sentiment_analysis(corpus, use_synsets_mean=True):
    
    labels=[]
    scores=[]
    
    for document in corpus:
        assert(1==1)
        sentences = nltk.sent_tokenize(document[0])

        alltokens = [nltk.word_tokenize(sent) for sent in sentences]
        taggedlist=[nltk.pos_tag(stoken) for stoken in alltokens]       

        score = {}
        for sent in taggedlist:
            score.update(swn_score(sent))
            
        if sum(score.values())>0:
            labels.append('pos')
        else:
            labels.append('neg')
        scores.append(sum(score.values()))            
    return labels, scores

predicted_labels, scores = sentiwordnet_sentiment_analysis(data)

try:
    for l, s in zip(predicted_labels, scores): print(l, s)
except:
    print ('The function sentiwordnet_sentiment_analysis needs your attention.')

pos 0.3116224053724054
neg -1.2774305555555556
pos 1.0879616910866912
pos 1.0879616910866912
pos 0.625
neg -0.4137248168498164
pos 0.20833333333333334
pos 0.0854166666666667
neg -0.19270833333333326
neg -0.5
pos 0.14583333333333331
neg -0.4910037878787878
neg -0.046875
pos 0.828125
neg -1.2442307692307693
neg -0.9472222222222225
neg -1.1804292929292932
pos 0.020541958041958075
pos 0.05131118881118882
neg -0.5833333333333333
neg -0.9166666666666666
neg -0.5013694638694639
pos 0.45
neg -0.07852564102564116
neg -0.5833333333333333
neg -1.7907585470085468
neg -0.08531746031746029
neg -2.1166666666666663
pos 0.33854166666666674
neg -0.0625
pos 0.6118550893550894
neg -1.7691468253968257
neg -0.4058320764203117
neg -0.24880952380952376
neg -0.6875
pos 0.02083333333333337
neg -0.5431835321541204
neg 0.0
neg -0.16079545454545452
neg -0.03125
neg -1.7516532809295966
neg -0.4208333333333333
pos 0.04326923076923077
pos 0.3159722222222222
neg -1.3140168654874536
pos 1.3043788609469946
pos 0.1463363

In [14]:
high_subjectivity = [(data[i][0], data[i][2]) for i, pred in enumerate(scores) if pred > -0.1 and pred < 0.1 and data[i][2] > 19]
high_subjectivity

[("Excellent jeu parce que Je ne m'en lasse pas d'y jouer plusieurs fois, il est bien meilleur que celui du précèdent. Les commentaires du narrateur peuvent être à la fois marrants et piquants.On se croirait presque ( je dis presque ) dans Warcraft à cause des personnages.Quand au gameplay , rien à redire, mais certaines unités peuvent être inutles quand on a compris le principe ( enfin cela dépend aussi de votre gouts parce que j'ai toujours préferé les orcs et nagas )Bref ce jeu vaut un 20 ( et je crois que c'est le 1ere à mérité de ma part ).",
  20),
 ("Je donne une excellente note à ce jeu.\nIl y a des chansons que je connaissais déjà mais j'ai pu en connaître d'autres par cette occasion.\nChanter avec des amis peut être une expérience sympa.\nNous pouvons chanter avec ou sans les paroles, il y a également des surprises comme chanter acapella ou changer de tête.\nChanter accompagné du clip est également agréable.\nCependant, il faut aimer l'anglais.",
  20)]

#### Naive approach extract words and do a mean on the positive or negative words

In [15]:
with open('Words/positive-words.txt', errors='ignore') as opened:
    contents=opened.read()
contents_lines=['a+'] + contents.split('a+')[1].split('\n')

positive_words = [x for x in contents_lines if len(x)>0]


with open('Words/negative-words.txt', errors='ignore') as opened:
    contents=opened.read()
contents_lines=['2-faced'] + contents.split('2-faced')[1].split('\n')

negative_words = [x for x in contents_lines if len(x)>0]

In [16]:
def naiveScore(corpus):
    scores=[]
    for document in corpus:

        document_words = set(word for word in word_tokenize(document[0]))
        positive = list(document_words.intersection(positive_words))
        negative = list(document_words.intersection(negative_words))
        score = (len(positive) - len(negative))/(len(positive) + len(negative)) if (len(positive) + len(negative)) != 0 else 0
        scores.append(score)
    return scores
    
y_pred_naive = naiveScore(data)
[print('comment from', data[i][1], 'is', convertToGrade(y_pred)) for i, y_pred in enumerate(y_pred_naive)]

comment from Platon_Socrate is 15.0
comment from Ranni_la_Witch is 12.0
comment from Chichariito is 13.0
comment from Chichariito is 13.0
comment from Dark--Ash is 20.0
comment from GranTurismoIV is 14.0
comment from magohax is 20.0
comment from Nynovac is 12.0
comment from ArudBard is 12.0
comment from jay_ricanay is 12.0
comment from PsykoBlind71 is 20.0
comment from Pxwer is 0.0
comment from NewPlayer67130 is 20.0
comment from Ct7274 is 20.0
comment from Lapa2015 is 0.0
comment from Man-of-location is 12.0
comment from Poonex is 7.0
comment from corentin199402 is 15.0
comment from Amaranthine is 5.0
comment from hisoka1977 is 20.0
comment from tristangdf is 12.0
comment from ohker is 12.0
comment from leflic03 is 12.0
comment from Pharos_Zap is 0.0
comment from zhywen30 is 0.0
comment from Adiego-danrrun is 0.0
comment from flodino is 0.0
comment from ModoWoke7 is 3.0
comment from kiki261 is 17.0
comment from DraiiZ is 20.0
comment from Raptor-jazz77 is 8.0
comment from yTxmv is 0.0

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

#### Sentiment analysis with transformer

In [17]:
# https://huggingface.co/tblard/tf-allocine and https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
# Limited to 512 tokens
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine")

nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

All the layers of TFCamembertForSequenceClassification were initialized from the model checkpoint at tblard/tf-allocine.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertForSequenceClassification for predictions without further training.


In [18]:
#[print('comment from', dataset['player'][i], 'is', nlp(comment)) for i, comment in enumerate(dataset['comment'])]
print('comment from', data[0][1], 'is', nlp(data[0][0])[0]['label'])

print(nlp("Juste whoaaahouuu !")) # POSITIVE
print(nlp("NUL...A...CHIER ! FIN DE TRANSMISSION.")) # NEGATIVE

comment from Platon_Socrate is POSITIVE
[{'label': 'POSITIVE', 'score': 0.9862489104270935}]
[{'label': 'NEGATIVE', 'score': 0.9974060654640198}]
