In [1]:
# Change to the root directory of the project. Relative paths used further
# down (e.g. the key file under ``src/keys``) are resolved against it.
# The location can be overridden with the ``TWITTER_ANALYSIS_ROOT``
# environment variable; the default is the original hard-coded checkout, so
# existing setups keep working unchanged.
import os
os.chdir(
    os.environ.get(
        'TWITTER_ANALYSIS_ROOT',
        '/home/tm/sciebo/corona/twitter_analysis/',
    )
)

from bld.project_paths import project_paths_join as ppj

In [2]:
from IPython.display import display

In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#import requests
#import json
#import argparse

#from google.cloud import language
#from google.oauth2 import service_account
#from google.cloud.language import enums
#from google.cloud.language import types

## Data management

In [20]:
# Raw (cleaned, translated) tweets; the first CSV column is discarded.
data = pd.read_csv(
    ppj("IN_DATA", "training_data/data_clean_translated.csv")
).iloc[:, 1:]

# Pre-processed version of the same tweets; first CSV column discarded too.
data_processed = pd.read_csv(
    ppj("IN_DATA", "training_data/data_processed_translated.csv"),
).iloc[:, 1:]

# Working frame: the raw data plus the processed text and a numeric version
# of the sentiment label; rows containing any missing value are dropped.
df = data.assign(
    processed=data_processed.text,
    sentiment_score=data.sentiment.replace(
        {'neutral': 0, 'negative': -1, 'positive': 1}
    ),
).dropna()

## Functions

In [22]:
def classify_sentiment(list_of_text, method):
    """Compute a sentiment score for every entry of ``list_of_text``.

    The work is delegated to the analyzer function that
    ``return_sentiment_analyzer`` selects for ``method``.

    Args:
        list_of_text (list): Strings whose sentiment should be scored.

        method (str): Name of the back-end to use. Possible values are
            'google', 'vader', 'textblob'.

    Returns:
        sentiments (list): One sentiment score per item of
            ``list_of_text``, in the same order.

    """
    return return_sentiment_analyzer(method)(list_of_text)

In [23]:
def return_sentiment_analyzer(method):
    """Return the sentiment analyzer function registered under ``method``.

    Args:
        method (str): Name of method that should be used. Possible
            values are 'google', 'vader', 'textblob'.

    Returns:
        analyzer (function): Function which returns a list of sentiment
            scores given a list of texts. Inner workings depend on
            ``method``.

    Raises:
        KeyError: If ``method`` is not one of the registered names. The
            message lists the accepted values.

    """
    functions = {
        'google': analyze_google,
        'textblob': analyze_textblob,
        'vader': analyze_vader,
    }

    try:
        return functions[method]
    except KeyError:
        valid = ", ".join(sorted(functions))
        # Still a KeyError (as before), but one that tells the caller what
        # went wrong and which names are accepted.
        raise KeyError(
            f"Unknown sentiment method {method!r}; expected one of: {valid}."
        ) from None

In [24]:
def analyze_google(list_of_text):
    """Score each text in ``list_of_text`` with Google's Cloud Natural Language API.

    NOTE(review): this function uses the names ``language``, ``types`` and
    ``enums``; the corresponding ``google.cloud`` imports are commented out
    at the top of this notebook and have to be restored before it can run.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): The document-level sentiment score reported by
            the API for each item in ``list_of_text``, in input order.

    """
    # The client authenticates with a service-account key file whose path is
    # relative to the current working directory.
    client = language.LanguageServiceClient.from_service_account_json(
        'src/keys/ose-twitter-analysis-8508806b2efb.json'
    )

    def _score(text):
        # One API request per text, sent as a plain-text document.
        document = types.Document(
            content=text,
            type=enums.Document.Type.PLAIN_TEXT
        )
        response = client.analyze_sentiment(document=document)
        return response.document_sentiment.score

    return [_score(text) for text in list_of_text]

In [25]:
def analyze_textblob(list_of_text):
    """Score each text in ``list_of_text`` with the package ``textblob``.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): The ``sentiment.polarity`` value that
            ``TextBlob`` reports for each item in ``list_of_text``, in
            input order.

    """
    polarities = []
    for text in list_of_text:
        blob = TextBlob(text)
        polarities.append(blob.sentiment.polarity)
    return polarities

In [26]:
def analyze_vader(list_of_text):
    """Score each text in ``list_of_text`` with the package ``vaderSentiment``.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): The ``'compound'`` entry of the analyzer's
            polarity scores for each item in ``list_of_text``, in input
            order.

    """
    # A single analyzer instance is created up front and reused for all texts.
    scorer = SentimentIntensityAnalyzer()

    compound_scores = []
    for text in list_of_text:
        scores = scorer.polarity_scores(text)
        compound_scores.append(scores['compound'])
    return compound_scores

## Analysis

In [27]:
# Sentiment back-ends evaluated in this notebook; the 'google' back-end is
# currently switched off.
analyzers = [
    'textblob',
    'vader',
    # 'google',
]

In [28]:
# Score the raw and the processed text with every enabled analyzer; results
# are stored in columns named "<analyzer>_<text column>".
for text_column in ('text', 'processed'):
    texts = df[text_column].to_list()
    for analyzer_name in analyzers:
        df[analyzer_name + "_" + text_column] = classify_sentiment(
            texts, method=analyzer_name
        )

In [29]:
def continuous_to_class(score, threshold=0.33):
    """Convert continuous sentiment scores to class labels.

    Scores strictly below ``-threshold`` become 'negative', scores strictly
    above ``threshold`` become 'positive', everything else (including
    missing values) becomes 'neutral'.

    Args:
        score (pd.Series or array-like): One-dimensional collection of
            numeric sentiment scores.

        threshold (float): Non-negative cut-off separating 'neutral' from
            the two polar classes. Defaults to 0.33.

    Returns:
        labels (pd.Series): Class label for each entry of ``score``. If
            ``score`` is a ``pd.Series`` its index is kept, so the result
            can be aligned with the original data; otherwise a default
            index is used.

    Raises:
        ValueError: If ``threshold`` is negative.

    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    values = np.asarray(score, dtype=float)

    # NaN compares False against both bounds and therefore ends up 'neutral'.
    labels = np.where(
        values < -threshold,
        'negative',
        np.where(values > threshold, 'positive', 'neutral'),
    )

    # Keep the caller's index so that the labels stay aligned with the rows
    # they were computed from (e.g. after rows were dropped from a frame).
    index = score.index if isinstance(score, pd.Series) else None
    return pd.Series(labels, index=index)

In [30]:
def confusion_matrix_to_readable(cmat, labels):
    """Wrap a confusion matrix in a labelled data frame.

    Args:
        cmat (array-like): Confusion matrix values.

        labels (iterable of str): Class names, in the order used by the
            rows and columns of ``cmat``.

    Returns:
        pd.DataFrame: ``cmat`` with rows named ``'true_<label>'`` and
            columns named ``'pred_<label>'``.

    """
    row_names = ['true_' + name for name in labels]
    column_names = ['pred_' + name for name in labels]
    return pd.DataFrame(cmat, index=row_names, columns=column_names)

In [31]:
def absolute_to_freq(cmat):
    """Normalise each row of a count matrix so that it sums to one.

    Args:
        cmat (array-like): Two-dimensional matrix of absolute counts, e.g.
            a confusion matrix.

    Returns:
        np.ndarray: Float matrix of the same shape in which every entry is
            divided by the sum of its row. Rows whose sum is zero are
            returned as all zeros instead of NaN.

    """
    counts = np.asarray(cmat, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)

    freq = np.zeros_like(counts)
    # Only divide where the row total is non-zero; the remaining entries keep
    # the zero they were initialised with (avoids 0/0 -> NaN and the warning).
    np.divide(counts, row_totals, out=freq, where=row_totals != 0)
    return freq

In [33]:
# Encode the true sentiment labels as integers.
le = LabelEncoder()
y_true = le.fit_transform(df["sentiment"])

# Score columns produced above: one per (analyzer, text variant) pair.
columns = [
    'textblob_text',
    'vader_text',
    'textblob_processed',
    'vader_processed'
]

# Discretise every score column and encode it like the true labels.
predictions = [
    le.transform(continuous_to_class(scores))
    for _, scores in df[columns].items()
]

# Absolute confusion matrices, their row-normalised versions and finally the
# labelled data frames used for display.
cmats = [confusion_matrix(y_true, y_pred) for y_pred in predictions]

cmats_freq = list(map(absolute_to_freq, cmats))

df_cmats = [
    confusion_matrix_to_readable(freq, le.classes_) for freq in cmats_freq
]

## Benchmark

In [34]:
# Share of each true class in the data, ordered as negative, neutral, positive.
class_counts = pd.Series(y_true).value_counts()
weights = (class_counts / len(y_true)).reindex(
    le.transform(['negative', 'neutral', 'positive'])
)
weights

0    0.148318
1    0.626758
2    0.224924
dtype: float64

### Evaluation

In [35]:
for col, df_tmp in zip(columns, df_cmats):
    print(col)
    display(df_tmp)
    # The diagonal of the row-normalised confusion matrix holds the share of
    # correctly classified items per true class; weighting it by the class
    # shares gives the overall share of correct classifications.
    accuracy = df_tmp.values.diagonal().dot(weights)
    # Formatted as a percentage so the value matches the "Percent" label
    # (previously the raw fraction, e.g. 0.6278, was printed).
    print(f"Percent correctly classified: {accuracy:.2%}")

textblob_text


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.203093,0.695876,0.101031
true_neutral,0.036838,0.79312,0.170041
true_positive,0.020394,0.532291,0.447315


Percent correctly classified: 0.62782874617737
vader_text


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.41134,0.41134,0.17732
true_neutral,0.159063,0.529397,0.311539
true_positive,0.055744,0.2862,0.658056


Percent correctly classified: 0.5408256880733945
textblob_processed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.13299,0.740206,0.126804
true_neutral,0.034155,0.807758,0.158087
true_positive,0.027872,0.67709,0.295037


Percent correctly classified: 0.5923547400611621
vader_processed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.301031,0.53299,0.165979
true_neutral,0.134911,0.593071,0.272018
true_positive,0.056424,0.497621,0.445955


Percent correctly classified: 0.5166666666666666
