In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` width to 80%.
container_css = "<style>.container { width:80% !important; }</style>"
display(HTML(container_css))

In [None]:
import pandas as pd
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

from utils import *

In [None]:
# Load dataframe with all texts.
df = df_from_filings()
df.shape  # bare mid-cell expression: evaluated, but only a cell's final expression is rendered

# Drop the very short texts (a by-product of cleaning): keep rows whose text length exceeds 1000.
# Boolean filtering keeps the surviving rows' original index labels.
text_lengths = df['text'].map(len)
df = df[text_lengths > 1000]

## VADER ANALYSER

In [None]:
# One VADER analyzer instance, shared by the scoring cells that follow.
vader = SentimentIntensityAnalyzer()

### VADER on non-processed data (only the cleaning described in the previous notebook)

In [None]:
%%time
# Score every text with VADER; each call returns a dict of scores for one text.
vader_untrained_polarity = df['text'].map(vader.polarity_scores)

# Expand the per-text dicts into one column per score. Passing index=df.index keeps
# the rows labelled like `df`: `df` was filtered earlier, so its index can have gaps,
# and a fresh 0..n-1 index would pair scores with the wrong filings (and add NaN rows)
# when the frames are later concatenated column-wise.
vader_untrained_polarity = pd.DataFrame(vader_untrained_polarity.tolist(), index=df.index)
vader_untrained_polarity = vader_untrained_polarity.add_prefix('vader_polarity_')

### VADER on preprocessed data (stopwords, punctuation, and most frequent words removed)

In [None]:
%%time
# Run each text through `preprocess`, then score the result with VADER;
# each call returns a dict of scores for one text.
vader_untrained_preprocessed_polarity = df['text'].map(
    lambda text: vader.polarity_scores(preprocess(text))
)

# Expand the per-text dicts into one column per score. Passing index=df.index keeps
# the rows labelled like `df`: `df` was filtered earlier, so its index can have gaps,
# and a fresh 0..n-1 index would pair scores with the wrong filings (and add NaN rows)
# when the frames are later concatenated column-wise.
vader_untrained_preprocessed_polarity = pd.DataFrame(
    vader_untrained_preprocessed_polarity.tolist(), index=df.index
)
vader_untrained_preprocessed_polarity = vader_untrained_preprocessed_polarity.add_prefix(
    'vader_preprocessed_polarity_'
)

In [None]:
# Put the filing identifier columns next to both sets of VADER scores.
# pd.concat(axis=1) lines the frames up by their index labels.
vader_id_columns = df[['cik', 'report_type', 'report_identity']]
df_vader = pd.concat(
    [vader_id_columns, vader_untrained_polarity, vader_untrained_preprocessed_polarity],
    axis=1,
)

# Persist the merged VADER results as a pickle file.
vader_pickle_path = os.path.join('data', 'sentiment_analysis', 'df_vader.pkl')
df_vader.to_pickle(vader_pickle_path)

In [None]:
# Summary statistics of the merged VADER results.
df_vader.describe()

## TEXTBLOB ANALYSER

### TextBlob on non-processed data (only the cleaning described in the previous notebook)

In [None]:
%%time
# Run TextBlob once per text and keep its sentiment result, instead of analysing
# every text twice (once for polarity, once for subjectivity).
raw_sentiments = [TextBlob(text).sentiment for text in df['text']]

# Label both Series like `df` so they align with it when merged later.
textblob_sentiment_polarity = pd.Series(
    [sentiment.polarity for sentiment in raw_sentiments], index=df.index, dtype=float
)
textblob_sentiment_subjectivity = pd.Series(
    [sentiment.subjectivity for sentiment in raw_sentiments], index=df.index, dtype=float
)

# Two-column frame: polarity and subjectivity per text.
textblob_sentiment = pd.concat([textblob_sentiment_polarity, textblob_sentiment_subjectivity], axis=1)
textblob_sentiment.columns = ['textblob_sentiment_polarity', 'textblob_sentiment_subjectivity']

### TextBlob on preprocessed data (stopwords, punctuation, and most frequent words removed)

In [None]:
%%time
# Preprocess and analyse each text exactly once, instead of running both `preprocess`
# and TextBlob twice per text (once for polarity, once for subjectivity).
preprocessed_sentiments = [TextBlob(preprocess(text)).sentiment for text in df['text']]

# Label both Series like `df` so they align with it when merged later.
textblob_preprocessed_sentiment_polarity = pd.Series(
    [sentiment.polarity for sentiment in preprocessed_sentiments], index=df.index, dtype=float
)
textblob_preprocessed_sentiment_subjectivity = pd.Series(
    [sentiment.subjectivity for sentiment in preprocessed_sentiments], index=df.index, dtype=float
)

# Two-column frame: polarity and subjectivity per preprocessed text.
textblob_preprocessed_sentiment = pd.concat([textblob_preprocessed_sentiment_polarity, textblob_preprocessed_sentiment_subjectivity], axis=1)
textblob_preprocessed_sentiment.columns = ['textblob_preprocessed_sentiment_polarity', 'textblob_preprocessed_sentiment_subjectivity']

In [None]:
# Put the filing identifier columns next to both sets of TextBlob results.
# pd.concat(axis=1) lines the frames up by their index labels.
tb_id_columns = df[['cik', 'report_type', 'report_identity']]
df_tb = pd.concat(
    [tb_id_columns, textblob_sentiment, textblob_preprocessed_sentiment],
    axis=1,
)

# Persist the merged TextBlob results as a pickle file.
tb_pickle_path = os.path.join('data', 'sentiment_analysis', 'df_tb.pkl')
df_tb.to_pickle(tb_pickle_path)

In [None]:
# Summary statistics of the merged TextBlob results.
df_tb.describe()

## FLAIR - LSTM

In [None]:
import flair
# Load flair's 'en-sentiment' text classifier once; get_flair_sentiment uses this instance.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

def get_flair_sentiment(text):
    """Classify `text` with the module-level `flair_sentiment` model.

    Wraps `text` in a flair Sentence, runs the classifier on it, and returns
    the pair `(tag, score)` read from the sentence afterwards.
    """
    sentence = flair.data.Sentence(text)
    flair_sentiment.predict(sentence)
    prediction = (sentence.tag, sentence.score)
    return prediction

In [None]:
### FLAIR on non-processed data (only the cleaning described in the previous notebook)

In [None]:
%%time
# Classify every text with flair; result_type='expand' spreads each returned
# (tag, score) pair over two columns.
df_flair_sentiment = df.apply(
    lambda filing: get_flair_sentiment(filing['text']),
    axis=1,
    result_type='expand',
)
df_flair_sentiment.columns = ['flair_sentiment_tag', 'flair_sentiment_score']

# Save these results to their own backup pickle file.
flair_raw_backup_path = os.path.join('data', 'sentiment_analysis', 'df_flair_wo_prep_backup.pkl')
df_flair_sentiment.to_pickle(flair_raw_backup_path)

### FLAIR on preprocessed data (stopwords, punctuation, and most frequent words removed)

In [None]:
%%time
# Preprocess every text, then classify it with flair; result_type='expand' spreads
# each returned (tag, score) pair over two columns.
df_flair_sentiment_preprocessed = df.apply(
    lambda filing: get_flair_sentiment(preprocess(filing['text'])),
    axis=1,
    result_type='expand',
)
df_flair_sentiment_preprocessed.columns = [
    'flair_preprocessed_sentiment_tag',
    'flair_preprocessed_sentiment_score',
]

# Save these results to their own backup pickle file.
flair_prep_backup_path = os.path.join('data', 'sentiment_analysis', 'df_flair_w_prep_backup.pkl')
df_flair_sentiment_preprocessed.to_pickle(flair_prep_backup_path)

In [None]:
# Put the filing identifier columns next to both sets of flair results.
# pd.concat(axis=1) lines the frames up by their index labels.
flair_id_columns = df[['cik', 'report_type', 'report_identity']]
df_flair = pd.concat(
    [flair_id_columns, df_flair_sentiment, df_flair_sentiment_preprocessed],
    axis=1,
)

# Persist the merged flair results as a pickle file.
flair_pickle_path = os.path.join('data', 'sentiment_analysis', 'df_flair.pkl')
df_flair.to_pickle(flair_pickle_path)

In [None]:
# Flair results WITHOUT preprocessing: how often each tag occurs ...
raw_tag_counts = df_flair['flair_sentiment_tag'].value_counts()
print(raw_tag_counts)

# ... and summary statistics of the scores, shown as a one-column frame.
df_flair['flair_sentiment_score'].describe().to_frame()

In [None]:
# Flair results WITH preprocessing: how often each tag occurs ...
preprocessed_tag_counts = df_flair['flair_preprocessed_sentiment_tag'].value_counts()
print(preprocessed_tag_counts)

# ... and summary statistics of the scores, shown as a one-column frame.
df_flair['flair_preprocessed_sentiment_score'].describe().to_frame()