In [9]:
# Change to the root directory of the project. The location can be
# overridden with the TWITTER_ANALYSIS_ROOT environment variable so the
# notebook also runs on other machines; if the variable is unset, the
# original hard-coded path is used.
import os

PROJECT_ROOT = os.environ.get(
    'TWITTER_ANALYSIS_ROOT', '/home/tm/sciebo/corona/twitter_analysis/'
)
os.chdir(PROJECT_ROOT)

from bld.project_paths import project_paths_join as ppj

In [225]:
from IPython.display import display

In [210]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#import requests
#import json
#import argparse

#from google.cloud import language
#from google.oauth2 import service_account
#from google.cloud.language import enums
#from google.cloud.language import types

## Data management

In [211]:
# Raw training data: tab-separated and read without a header row, so the
# column names are supplied explicitly.
data = pd.read_csv(
    ppj("IN_DATA", "training_data/data.tsv"),
    sep="\t",
    header=None,
    names=['id', 'sentiment', 'hexcode', 'NA', 'text'],
)

# Keep only text and label, treat the 'Not Available' placeholder as
# missing, drop incomplete rows and renumber the remaining rows from zero.
df = (
    data[['text', 'sentiment']]
    .replace({'Not Available': np.nan})
    .dropna()
    .reset_index(drop=True)
)

In [212]:
# Load the processed data set and keep only the text and label columns.
df_processed = pd.read_csv(
    ppj("OUT_DATA", "data_processed.tsv"), sep="\t"
).loc[:, ['text', 'sentiment']]

In [213]:
# Attach the processed text (assignment aligns on the row index). The
# frame loaded above is called ``df_processed``; the previous reference to
# ``data_processed`` was an undefined name.
df['text_processed'] = df_processed.text

# Numeric encoding of the labels: negative -> -1, neutral -> 0, positive -> 1.
df['sentiment_score'] = df.sentiment.replace({'neutral': 0, 'negative': -1, 'positive': 1})

# Drop rows with any missing value (e.g. no matching processed text).
df = df.dropna()

## Functions

In [214]:
def classify_sentiment(list_of_text, method):
    """Compute a sentiment score for every text with the chosen method.

    Args:
        list_of_text (list): Strings whose sentiment should be scored.

        method (str): Name of the analyzer to use; it is resolved via
            ``return_sentiment_analyzer``. Possible values are 'google',
            'vader', 'textblob'.

    Returns:
        sentiments (list): One sentiment score per item of
            ``list_of_text``.

    """
    return return_sentiment_analyzer(method)(list_of_text)

In [215]:
def return_sentiment_analyzer(method):
    """Look up the analyzer function that belongs to ``method``.

    Args:
        method (str): Name of the method that should be used. Possible
            values are 'google', 'vader', 'textblob'.

    Returns:
        analyzer (function): Function that maps a list of texts to a list
            of sentiment scores; which one depends on ``method``.

    Raises:
        KeyError: If ``method`` is not one of the known names.

    """
    dispatch = dict(
        google=analyze_google,
        textblob=analyze_textblob,
        vader=analyze_vader,
    )
    return dispatch[method]

In [216]:
def analyze_google(list_of_text):
    """Score each text in ``list_of_text`` with Google's Cloud Natural Language API.

    One ``analyze_sentiment`` request is issued per text, in input order.

    NOTE(review): this function uses the names ``language``, ``types`` and
    ``enums``; in the visible import cell the corresponding imports are
    commented out, so they must be re-enabled before calling it.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): Document sentiment score reported by the API
            for each item in ``list_of_text``.

    """
    # Credentials are read from a service-account key file given relative
    # to the current working directory.
    client = language.LanguageServiceClient.from_service_account_json(
        'src/keys/ose-twitter-analysis-8508806b2efb.json'
    )

    def _score(text):
        # Wrap the raw string as a plain-text document and request its sentiment.
        document = types.Document(
            content=text,
            type=enums.Document.Type.PLAIN_TEXT
        )
        response = client.analyze_sentiment(document=document)
        return response.document_sentiment.score

    return [_score(text) for text in list_of_text]

In [217]:
def analyze_textblob(list_of_text):
    """Score each text in ``list_of_text`` with the ``textblob`` package.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): The ``sentiment.polarity`` value that
            ``TextBlob`` reports for each item in ``list_of_text``.

    """
    sentiments = []
    for text in list_of_text:
        blob = TextBlob(text)
        sentiments.append(blob.sentiment.polarity)
    return sentiments

In [218]:
def analyze_vader(list_of_text):
    """Score each text in ``list_of_text`` with the ``vaderSentiment`` package.

    Args:
        list_of_text (list): List of strings for which the sentiment
            should be classified.

    Returns:
        sentiments (list): The 'compound' entry of the polarity scores
            that ``SentimentIntensityAnalyzer`` reports for each item in
            ``list_of_text``.

    """
    # A single analyzer instance is shared by all texts.
    scorer = SentimentIntensityAnalyzer()

    sentiments = []
    for text in list_of_text:
        scores = scorer.polarity_scores(text)
        sentiments.append(scores['compound'])
    return sentiments

## Analysis

In [219]:
# Names of the analyzers to benchmark; 'google' is currently left out.
analyzers = ['textblob', 'vader'] #, 'google']

In [220]:
# Score both the raw and the processed text with every analyzer; each
# result is stored in a column named '<analyzer>_<variant>'.
text_columns = {'nonprocessed': 'text', 'processed': 'text_processed'}

for variant, source_col in text_columns.items():
    texts = df[source_col].to_list()
    for name in analyzers:
        df[name + "_" + variant] = classify_sentiment(texts, method=name)

In [221]:
def continuous_to_class(score, threshold=0.33):
    """Map continuous sentiment scores to the three sentiment classes.

    Scores strictly below ``-threshold`` become 'negative', scores strictly
    above ``threshold`` become 'positive', and everything else (including
    NaN, which compares false against both bounds) becomes 'neutral'.

    Args:
        score (array-like): One-dimensional scores, e.g. a ``pd.Series``,
            a numpy array or a list of floats.

        threshold (float): Symmetric cut-off separating 'neutral' from the
            two polar classes. Defaults to 0.33.

    Returns:
        new_score (pd.Series): Class label for each score. If ``score`` is
            a ``pd.Series`` its index is preserved, so the result can be
            aligned with the originating data frame; otherwise a default
            integer index is used.

    """
    # Work on a plain float array so lists and Series behave the same and
    # the boolean masks are applied by position.
    values = np.asarray(score, dtype=float)

    labels = np.full(values.shape, 'neutral', dtype=object)
    labels[values < -threshold] = 'negative'
    labels[values > threshold] = 'positive'

    index = score.index if isinstance(score, pd.Series) else None
    return pd.Series(labels, index=index)

In [258]:
def confusion_matrix_to_readable(cmat, labels):
    """Wrap a confusion matrix in a data frame with labelled axes.

    Args:
        cmat (array-like): Two-dimensional confusion matrix.

        labels (iterable of str): Class names, in the order used by the
            rows and columns of ``cmat``.

    Returns:
        pd.DataFrame: ``cmat`` with rows named 'true_<label>' and columns
            named 'pred_<label>'.

    """
    labels = list(labels)
    return pd.DataFrame(
        cmat,
        index=['true_' + name for name in labels],
        columns=['pred_' + name for name in labels],
    )

In [283]:
def absolute_to_freq(cmat):
    """Normalise each row of a count matrix by its row total.

    Args:
        cmat (np.ndarray): Two-dimensional array of absolute counts.

    Returns:
        np.ndarray: Array of the same shape in which every row has been
            divided by the sum of that row.

    """
    # keepdims leaves the totals as a column vector so the division
    # broadcasts across each row.
    row_totals = cmat.sum(axis=1, keepdims=True)
    return cmat / row_totals

In [284]:
# Encode the true sentiment labels as integers.
le = LabelEncoder().fit(df["sentiment"])
y_true = le.transform(df["sentiment"])

# Score columns to evaluate: every analyzer on both text variants.
columns = [
    f"{method}_{variant}"
    for variant in ('nonprocessed', 'processed')
    for method in ('textblob', 'vader')
]

# Discretise each continuous score and encode it like the true labels.
predictions = [
    le.transform(continuous_to_class(df[name])) for name in columns
]

# Absolute confusion matrices, their row-normalised versions, and the
# latter as labelled data frames for display.
cmats = [confusion_matrix(y_true, y_pred) for y_pred in predictions]
cmats_freq = list(map(absolute_to_freq, cmats))
df_cmats = [
    confusion_matrix_to_readable(freq, le.classes_) for freq in cmats_freq
]

## Benchmark

In [311]:
# Relative frequency of each encoded class among the true labels, ordered
# as negative, neutral, positive.
label_order = le.transform(['negative', 'neutral', 'positive'])
weights = (pd.Series(y_true).value_counts() / len(y_true)).reindex(label_order)
weights

0    0.148358
1    0.626738
2    0.224905
dtype: float64

### Evaluation

In [312]:
# For every score column, show its row-normalised confusion matrix and the
# overall share of correctly classified texts. The diagonal holds the
# per-class share of correct predictions; weighting it by the class
# frequencies in ``weights`` yields the overall share.
for col, df_tmp in zip(columns, df_cmats):
    print(col)
    display(df_tmp)
    accuracy = df_tmp.values.diagonal().dot(weights)
    # The label says "Percent", so format the share as a percentage
    # instead of printing the raw fraction.
    print(f"Percent correctly classified: {accuracy:.2%}")

textblob_nonprocessed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.098867,0.84655,0.054583
true_neutral,0.013896,0.933935,0.05217
true_positive,0.008832,0.703125,0.288043


Percent correctly classified: 0.6647822765469824
vader_nonprocessed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.237899,0.704428,0.057673
true_neutral,0.14822,0.784739,0.06704
true_positive,0.122283,0.594429,0.283288


Percent correctly classified: 0.5908326967150497
textblob_processed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.040165,0.92379,0.036045
true_neutral,0.011458,0.94198,0.046563
true_positive,0.013587,0.894701,0.091712


Percent correctly classified: 0.6169595110771581
vader_processed


Unnamed: 0,pred_negative,pred_neutral,pred_positive
true_negative,0.087539,0.869207,0.043254
true_neutral,0.066309,0.872989,0.060702
true_positive,0.05163,0.817935,0.130435


Percent correctly classified: 0.5894576012223071
