### Create some dummy comments

In [2]:
# Hand-written sample comments used as input for the rest of the notebook.
# The list mixes English and Spanish sentences and includes one German
# sentence, which ends up in the neutral fallback branch of the sentiment
# step because only "en" and "es" analyzers are created there.
# One entry contains an @-mention ("@jose").
comments = ["This was not a good game",
            "No hubo ningún espíritu deportivo",
            "@jose no tuvo ninguna oportunidad hoy",
            "Ich Spreche Deutch",
            "La competencia estuvo reñida",
            "This is one of the worst football players in the season",
            "I've never seen someone that skillful",
            "La jugada del final estuvo muy cerca de hacer gol",
            "Solo faltaba que le metieran otra roja",
            "He is probably going to make it",
            "Sucks for them",
            "She had an injury",
            "No puedo esperar a ver el siguiente partido",
            "El será el ganador de la vuelta",
            "Se nota que estuvo entrenando en verano",
            "Ojalá lo compre el otro equipo",
            "Su condición ha desmejorado desde la última temporada",
            "No ha conseguido el título en más de tres años"]

### Preprocess the comments

In [3]:
from pysentimiento.preprocessing import preprocess_tweet


# Run every raw comment through pysentimiento's tweet preprocessing,
# keeping the same order as `comments`.
preprocessed_comments = list(map(preprocess_tweet, comments))

  from .autonotebook import tqdm as notebook_tqdm


### Detect the language

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [5]:
# Tokenizer and classifier are loaded from the same checkpoint so that
# the token ids line up with what the model expects.
LANGUAGE_DETECTION_CHECKPOINT = "papluca/xlm-roberta-base-language-detection"

tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_DETECTION_CHECKPOINT)
language_detector = AutoModelForSequenceClassification.from_pretrained(
    LANGUAGE_DETECTION_CHECKPOINT
)

In [6]:
# Detect the language of each preprocessed comment, one comment per
# forward pass, and store the predicted label (e.g. "en", "es", "de").
languages_detected = []
for comment in preprocessed_comments:
    # truncation=True caps the input at the tokenizer's maximum length, so an
    # unusually long comment is shortened instead of failing in the model.
    inputs = tokenizer(comment, return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = language_detector(**inputs).logits
    # A single text is scored per call, so the global argmax is the class id.
    predicted_class_id = logits.argmax().item()
    languages_detected.append(language_detector.config.id2label[predicted_class_id])

### Predict the sentiment

In [None]:
from pysentimiento import create_analyzer

# One sentiment analyzer per supported language, created in the order
# English first, Spanish second.
analyzer_en, analyzer_es = (
    create_analyzer(task="sentiment", lang=lang_code) for lang_code in ("en", "es")
)

In [8]:
# Route every comment to the analyzer that matches its detected language.
# Languages without an analyzer get a fully neutral probability dict.
analyzers_by_language = {"en": analyzer_en, "es": analyzer_es}

sentiment_probas = []
for language, text in zip(languages_detected, preprocessed_comments):
    analyzer = analyzers_by_language.get(language)
    if analyzer is None:
        estimation = {'NEG': 0, 'NEU': 1, 'POS': 0}
    else:
        estimation = analyzer.predict(text).probas
    sentiment_probas.append(estimation)

In [9]:
import numpy as np


def rescale_probs(proba_dict):
    """Collapse a label -> probability mapping into a single sentiment score.

    The score depends on which label has the highest probability:

    * "NEG": ``1 - p_top`` (a confident negative lands near 0).
    * "POS": ``p_top`` (a confident positive lands near 1).
    * anything else (e.g. "NEU"): ``0.5`` nudged towards the stronger of
      the "POS" / "NEG" probabilities, scaled by ``1 - p_top``.

    "NEG" and "POS" are looked up by key, so the result does not depend on
    the order in which the labels were inserted into the mapping.

    Args:
        proba_dict: Mapping from label names to probabilities. Labels "NEG"
            and "POS" are treated as 0 if absent.

    Returns:
        The rescaled score as a number.

    Raises:
        ValueError: If ``proba_dict`` is empty.
    """
    if not proba_dict:
        raise ValueError("proba_dict must contain at least one label")

    labels = list(proba_dict)
    probs = [proba_dict[label] for label in labels]
    # np.argmax returns the first maximum, so ties go to the earliest label.
    top_index = int(np.argmax(probs))
    top_label, top_prob = labels[top_index], probs[top_index]

    if top_label == "NEG":
        return 1 - top_prob
    if top_label == "POS":
        return top_prob

    # Look the polar classes up by name rather than by position.
    neg_value = proba_dict.get("NEG", 0)
    pos_value = proba_dict.get("POS", 0)

    # Signed pull away from the midpoint; an exact POS/NEG tie leans negative.
    add_val = pos_value if pos_value > neg_value else -neg_value

    return 0.5 + (1 - top_prob) * add_val / 2

In [10]:
# One rescaled score per comment, in the same order as `sentiment_probas`.
sentiment = list(map(rescale_probs, sentiment_probas))

In [11]:
import pandas as pd

# Put the raw comment, its detected language and its score side by side.
columns_by_name = {
    "Comment": comments,
    "Language": languages_detected,
    "Sentiment": sentiment,
}
df = pd.DataFrame(columns_by_name)
df

Unnamed: 0,Comment,Language,Sentiment
0,This was not a good game,en,0.018816
1,No hubo ningún espíritu deportivo,es,0.378782
2,@jose no tuvo ninguna oportunidad hoy,es,0.005745
3,Ich Spreche Deutch,de,0.5
4,La competencia estuvo reñida,es,0.494178
5,This is one of the worst football players in t...,en,0.016167
6,I've never seen someone that skillful,en,0.969795
7,La jugada del final estuvo muy cerca de hacer gol,es,0.625783
8,Solo faltaba que le metieran otra roja,es,0.015776
9,He is probably going to make it,en,0.696235


In [12]:
# Average score over every row of the table.
df.loc[:, "Sentiment"].mean()

0.3193434249047682