## Table of Contents

* [Chapter 1](#chapter1): Data Preprocessing
* [Chapter 2](#chapter2): Sentiment Analysis
    * [Section 2.1](#section_2_1): Defining Functions
    * [Section 2.2](#section_2_2): TextBlob
    * [Section 2.3](#section_2_3): SentiWS
* [Chapter 3](#chapter3): Smoothed Sentiment Analysis Over Time
    * [Section 3.1](#section_3_1): Defining Functions
    * [Section 3.2](#section_3_2): TextBlob
    * [Section 3.3](#section_3_3): SentiWS

## Chapter 1: <a class="anchor" id="chapter1"></a> Data Preprocessing

In [1]:
#Performing required installations
#nltk.download('stopwords')
#nltk.download('punkt')

#!python3 -m spacy download de_core_news_md

#pip install textblob_de

#pip install spacy
#pip install spacy_sentiws
#pip install spacy-transformers

#pip install polyglot

In [2]:
#Importing libraries
#Data processing
import pandas as pd
import numpy as np
import re

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches

#Sentiment analysis
from textblob_de import TextBlobDE
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk
import string as st
import spacy
import spacy.cli
from spacy_sentiws import spaCySentiWS

#Timeseries and date handling
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt 
from scipy import signal
from scipy.fftpack import fft, fftshift
from math import factorial
from astropy.convolution import convolve, Box1DKernel, Gaussian1DKernel, Trapezoid1DKernel

#Stats
from statistics import mean
import statsmodels.api as sm
import scipy.stats
import math

#Other
import os
from tqdm import tqdm
import warnings
tqdm.pandas()

In [3]:
#Suppressing warnings
#Every warning category is silenced for the rest of the session
warnings.simplefilter("ignore")

In [4]:
#Reading in CSVs
#Paths are built relative to the notebook folder (two levels below the project root) instead of
#changing the working directory, so a failed read cannot leave the kernel in the wrong folder
#NOTE: some of these files (e.g. cleaned_parsed_preprocessed_data.csv, textblob.csv, sentiws.csv and
#the *_timeseries.csv files) are written by later cells of this notebook and must exist already
project_root = os.path.join("..", "..")
data_dir = os.path.join(project_root, "Data", "Articles")
lexicon_dir = os.path.join(project_root, "Outputs", "Articles", "Lexicon")
inputs_dir = os.path.join(project_root, "Inputs", "Articles")
date_columns = ["pubtime", "pubday", "pubmonth"]

df_cleaned_parsed = pd.read_csv(os.path.join(data_dir, "cleaned_parsed_data.csv"), index_col = 0, parse_dates = date_columns)
df = pd.read_csv(os.path.join(data_dir, "cleaned_parsed_preprocessed_data.csv"), index_col = 0, parse_dates = date_columns)

df_textblob = pd.read_csv(os.path.join(lexicon_dir, "textblob.csv"), index_col = 0, parse_dates = date_columns)
df_textblob_agg = pd.read_csv(os.path.join(lexicon_dir, "textblob_agg.csv"), index_col = 0)
df_textblob_timeseries = pd.read_csv(os.path.join(lexicon_dir, "textblob_timeseries.csv"), index_col = 0)
df_textblob_timeseries.index = pd.to_datetime(df_textblob_timeseries.index)

df_sentiws = pd.read_csv(os.path.join(lexicon_dir, "sentiws.csv"), index_col = 0, parse_dates = date_columns)
df_sentiws_agg = pd.read_csv(os.path.join(lexicon_dir, "sentiws_agg.csv"), index_col = 0)
df_sentiws_timeseries = pd.read_csv(os.path.join(lexicon_dir, "sentiws_timeseries.csv"), index_col = 0)
df_sentiws_timeseries.index = pd.to_datetime(df_sentiws_timeseries.index)

df_entities = pd.read_csv(os.path.join(inputs_dir, "entities.csv"))
df_key_media = pd.read_csv(os.path.join(inputs_dir, "key_media.csv"), index_col = 0)
df_key_entities = pd.read_csv(os.path.join(inputs_dir, "key_entities.csv"), index_col = 0)

In [5]:
#Setting entities
#Keeping the distinct designed entity names of all rows flagged with selection == 1
is_selected = df_entities["selection"] == 1
entities = list(df_entities.loc[is_selected, "designed_entity"].unique())
entities

['Ueli_Maurer',
 'Guy_Parmelin',
 'Simonetta_Sommaruga',
 'Alain_Berset',
 'Bundesrat',
 'Tanja_Stadler',
 'Martin_Ackermann',
 'Taskforce',
 'Christoph_Berger',
 'EKIF',
 'Patrick_Mathys',
 'Marcel_Salathe',
 'Daniel_Koch',
 'BAG',
 'Swissmedic',
 'Lukas_Engelberger',
 'GDK',
 'SVP',
 'SP',
 'FDP',
 'Die_Mitte',
 'Die_Gruene',
 'Befuerworter',
 'Gegner',
 'Skeptiker',
 'Kritiker',
 'Opposition',
 'Demonstranten']

In [7]:
#Setting media
#Collecting the distinct medium names in order of first appearance
media = list(pd.unique(df_cleaned_parsed["medium_name"]))
media

['NZZ',
 'Zuercher_Unterlaender',
 'Blick',
 'Zuerichsee_Zeitung',
 'Berner_Zeitung',
 'Solothurner_Zeitung',
 'SRF',
 'Berner_Oberlaender',
 'Tages_Anzeiger',
 'Limmattaler_Zeitung',
 'Langenthaler_Tagblatt',
 'Cash',
 'Grenchner_Tagblatt',
 'BZ_Basel',
 'Handelszeitung',
 'Luzerner_Zeitung',
 'Landbote',
 'Werdenberger_&_Obertoggenburger',
 'Aargauer_Zeitung',
 '20_Minuten',
 'Der_Bund',
 'Basler_Zeitung',
 'Nidwaldner_Zeitung',
 'St._Galler_Tagblatt',
 'Thuner_Tagblatt',
 'Zofinger_Tagblatt',
 'Badener_Tagblatt',
 'Thurgauer_Zeitung',
 'Schweizer_Illustrierte',
 'Zuger_Zeitung',
 'Finanz_und_Wirtschaft',
 'Urner_Zeitung',
 'Die_Wochenzeitung',
 'Swissinfo',
 'Oltner_Tagblatt',
 'Obwaldner_Zeitung',
 'Appenzeller_Zeitung',
 'Das_Magazin',
 'Beobachter',
 'Bilanz',
 'Toggenburger_Tagblatt',
 'Thalwiler_Anzeiger',
 'Zuger_Presse',
 'TV_Star',
 'Schweizer_Familie',
 'Zugerbieter',
 'Glueckspost',
 'Tele']

In [9]:
#Instantiating nlp
#Loading the German spaCy model without the pipeline components listed below
disabled_components = ["tagger", "parser", "ner"]
nlp = spacy.load("de_core_news_md", disable = disabled_components)

In [10]:
#Linking to SentiWS
#The lexicon folder is given relative to the current working directory
sentiws_resource_path = "Resources/SentiWS/"
sentiws = spaCySentiWS(sentiws_path = sentiws_resource_path)

In [11]:
#Defining function to lemmatize tokens
def lemmatize(tokens):
    """
    This function returns one lemma per given token. Every token string is passed through the
    spaCy pipeline on its own and the lemma of the first resulting spaCy token is kept.
    """
    #Parsing every token string separately
    parsed_tokens = []
    for token in tokens:
        parsed_tokens.append(nlp(token))

    #Keeping the lemma of the first spaCy token of each parsed string
    return [parsed_token[0].lemma_ for parsed_token in parsed_tokens]

In [12]:
#Defining preprocessing function
def preprocess(df):
    """
    This function prepares the clause and passage texts for the lexicon-based sentiment analysis.
    It works on a copy of the given dataframe, which must contain the text columns "clause_ABSA"
    and "passage_ABSA". Each text is split on whitespace; German stopwords, tokens that consist of
    a single punctuation character, and "[NEG_ENT]" tokens are removed. The remaining tokens are
    stored as lists ("clause_tokens", "passage_tokens") and as space-joined strings ("clause",
    "passage"). The extended copy is returned.
    """
    #Copying the dataframe that was passed in (the global df_cleaned_parsed was copied here before,
    #so the argument was silently ignored)
    df = df.copy()
    
    #Creating new columns
    df["clause_tokens"] = df["clause_ABSA"]
    df["passage_tokens"] = df["passage_ABSA"]
        
    #Setting columns
    columns = ["clause_tokens", 
               "passage_tokens"]
    
    #Collecting everything that has to be removed once, as a set for fast membership tests
    stopword = set(stopwords.words("german"))
    punctuation = set(st.punctuation)
    removed_tokens = stopword | punctuation | {"[NEG_ENT]"}
      
    #Tokenizing and removing stopwords, punctuation, and [NEG_ENT] tokens in a single pass
    for column in columns:
        df[column] = df[column].apply(lambda x: [token for token in x.split() if token not in removed_tokens])
    
    #Lemmatizing on sentence- and passage-level (currently disabled)
    #df["clause_tokens_lemmatized"] = df["clause_tokens"].progress_apply(lambda x: lemmatize(x))
    #df["passage_tokens_lemmatized"] = df["passage_tokens"].progress_apply(lambda x: lemmatize(x))
    
    #Turning token list into token string
    columns = ["clause_tokens", 
               "passage_tokens", 
               #"clause_tokens_lemmatized", "passage_tokens_lemmatized"
              ]
    new_columns = ["clause", 
                   "passage", 
                   #"clause_lemmatized", "passage_lemmatized"
                  ]

    for column, new_column in zip(columns, new_columns):
        df[new_column] = df[column].apply(" ".join)
    
    return df

In [13]:
#Preprocessing
#Running the preprocessing on the cleaned and parsed articles
df = preprocess(df = df_cleaned_parsed)

In [15]:
#Saving to CSV
#Writing through a path relative to the notebook folder instead of changing the working
#directory, so a failed write cannot leave the kernel in the wrong folder
df.to_csv(os.path.join("..", "..", "Data", "Articles", "cleaned_parsed_preprocessed_data.csv"))

## Chapter 2: <a class="anchor" id="chapter2"></a> Sentiment Analysis

### Section 2.1: <a class="anchor" id="section_2_1"></a> Defining Functions

In [16]:
#Defining function to calculate sentiment and subjectivity for TextBlob
def calculate_sentiment_subjectivity_textblob(df, sentiment_col_name, subjectivity_col_name, text_col):
    """
    This function scores every text in text_col with TextBlobDE and writes the polarity to
    sentiment_col_name and the subjectivity to subjectivity_col_name. The dataframe is changed
    in place and nothing is returned. Each score is computed in its own pass over the texts.
    """
    def polarity_of(text):
        return TextBlobDE(text).polarity

    def subjectivity_of(text):
        return TextBlobDE(text).subjectivity

    #Scoring polarity first and subjectivity second, each with its own progress bar
    df[sentiment_col_name] = df[text_col].progress_apply(polarity_of)
    df[subjectivity_col_name] = df[text_col].progress_apply(subjectivity_of)

In [17]:
#Defining function to bin sentiment and subjectivity for TextBlob
def bin_sentiment_subjectivity_textblob(df, bins, sentiment_bin_col_name, subjectivity_bin_col_name, sentiment_col_name, subjectivity_col_name):
    """
    This function turns the sentiment and subjectivity scores into the classes -1, 0 and 1.
    The same bin edges are used for both scores, so bins must contain four edges. The class
    columns are stored as int64 in the given dataframe, which is changed in place.
    """
    class_labels = [-1, 0, 1]
    score_and_bin_columns = [(sentiment_col_name, sentiment_bin_col_name),
                             (subjectivity_col_name, subjectivity_bin_col_name)]

    for score_col, bin_col in score_and_bin_columns:
        #Cutting into categories first and converting the categories to integers afterwards
        df[bin_col] = pd.cut(df[score_col], bins, labels = class_labels)
        df[bin_col] = df[bin_col].astype("int64")

In [18]:
#Defining function to run TextBlob sentiment analysis
def sa_textblob(df, bins):
    """
    This function runs the TextBlob sentiment analysis on the "clause" and "passage" columns of
    the given dataframe. It first adds the raw sentiment and subjectivity scores and then the
    binned classes for both levels. The dataframe is changed in place.
    """
    levels = ("clause", "passage")

    #Calculating sentiment and subjectivity scores on clause- and passage-level
    for level in levels:
        calculate_sentiment_subjectivity_textblob(df, level + "_sentiment_score", level + "_subjectivity_score", level)

    #Binning sentiment and subjectivity scores
    for level in levels:
        bin_sentiment_subjectivity_textblob(df, bins, level + "_sentiment", level + "_subjectivity",
                                            level + "_sentiment_score", level + "_subjectivity_score")

In [19]:
#Defining function to calculate sentiment and subjectivity for SentiWS
def calculate_sentiment_sentiws(df, sentiment_col_name, text_col):
    """
    This function classifies every text in text_col with SentiWS and writes the class to
    sentiment_col_name. Words with a positive SentiWS weight count +1, words with a negative
    weight count -1, and words without weight (or with weight 0) are ignored. The sign of the
    total is the class (1, 0 or -1). The dataframe is changed in place.

    Texts that cannot be scored get a missing value. If all texts were scored, the column is
    stored as int64; otherwise it is stored as nullable Int64 so the failures stay visible.
    """
    def get_score(string):
        try:
            doc = nlp(string)
            weights = [token._.sentiws for token in doc]
            scores = [float(weight) for weight in weights if weight is not None]
            polarity = sum(1 if score > 0 else -1 for score in scores if score != 0)
            return int(np.sign(polarity))
        #Catching Exception instead of everything, so interrupting the kernel stops the loop
        except Exception:
            return np.nan
    
    sentiment = df[text_col].progress_apply(get_score)
    
    #int64 cannot hold missing values, so failed rows would make the conversion crash
    target_dtype = "Int64" if sentiment.isna().any() else "int64"
    df[sentiment_col_name] = sentiment.astype(target_dtype)

In [20]:
#Defining function to run SentiWS sentiment analysis
def sa_sentiws(df):
    """
    This function runs the SentiWS sentiment analysis on the "clause" and "passage" columns of
    the given dataframe and stores the classes in "clause_sentiment" and "passage_sentiment".
    The dataframe is changed in place.
    """
    #Calculating sentiment scores on clause-level first and on passage-level second
    for level in ("clause", "passage"):
        calculate_sentiment_sentiws(df, level + "_sentiment", level)

### Section 2.2: <a class="anchor" id="section_2_2"></a> TextBlob

In [21]:
#Creating copy of dataframe
#A deep copy keeps the TextBlob columns out of the preprocessed dataframe
df_textblob = df.copy(deep = True)

In [22]:
#Running TextBlob sentiment analysis
#Scores up to -0.5 become class -1, scores above 0.5 become class 1, everything between class 0
sentiment_bin_edges = [-1.1, -0.5, 0.5, 1.1]
sa_textblob(df_textblob, sentiment_bin_edges)

100%|██████████████████████████████████| 268001/268001 [11:53<00:00, 375.52it/s]
100%|██████████████████████████████████| 268001/268001 [15:49<00:00, 282.40it/s]
100%|██████████████████████████████████| 268001/268001 [35:46<00:00, 124.88it/s]
100%|██████████████████████████████████| 268001/268001 [28:38<00:00, 155.92it/s]


In [23]:
#Saving to CSV
#Writing through a path relative to the notebook folder instead of changing the working
#directory, so a failed write cannot leave the kernel in the wrong folder
df_textblob.to_csv(os.path.join("..", "..", "Outputs", "Articles", "Lexicon", "textblob.csv"))

### Section 2.3: <a class="anchor" id="section_2_3"></a> SentiWS

In [24]:
#Creating copy of dataframe
#A deep copy keeps the SentiWS columns out of the preprocessed dataframe
df_sentiws = df.copy(deep = True)

In [25]:
#Running SentiWS sentiment analysis
#The sentiment columns are added to df_sentiws in place
sa_sentiws(df = df_sentiws)

100%|██████████████████████████████████| 268001/268001 [08:50<00:00, 504.93it/s]
100%|██████████████████████████████████| 268001/268001 [13:52<00:00, 321.87it/s]


In [26]:
#Saving to CSV
#Writing through a path relative to the notebook folder instead of changing the working
#directory, so a failed write cannot leave the kernel in the wrong folder
df_sentiws.to_csv(os.path.join("..", "..", "Outputs", "Articles", "Lexicon", "sentiws.csv"))

## Chapter 3: <a class="anchor" id="chapter3"></a> Smoothed Sentiment Analysis Over Time

### Section 3.1: <a class="anchor" id="section_3_1"></a> Defining Functions

In [27]:
#Defining function to calculate moving average sentiment of given entity and medium
def calculate_moving_avg_std(df, entity, medium, window_size, sentiment_col):
    """
    This function calculates the moving average sentiment for a given entity and/or newspaper.
    If the user does not want to filter the sentiment by entity and/or newspaper, the parameters 
    should be set to False. If the user does not want to return a specific sentiment column,
    the parameter should be set to False.

    The numeric columns are averaged per publication day ("pubday") first. The moving average and
    the moving standard deviation are then taken over window_size consecutive daily averages, so
    the standard deviation describes the spread of the daily averages, not of single articles.
    Returns the whole dataframe (columns "<name>_avg", "<name>_std" and "sample_size") or the
    tuple (average, standard deviation) of sentiment_col.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]
    
    #Creating windows
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    windows = df_filtered.groupby("pubday").mean(numeric_only = True).rolling(window = window_size)
       
    #Calculating average and standard deviation per window (the first incomplete windows are cut off)
    moving_avgs = windows.mean().iloc[window_size-1:,:]
    moving_stds = windows.std().iloc[window_size-1:,:]
    df_moving = moving_avgs.join(moving_stds, lsuffix = "_avg", rsuffix = "_std") 
    
    #Calculating sample size
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    df_moving = df_moving.join(sample_size)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_moving.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_moving
    else:
        return df_moving[sentiment_col+"_avg"], df_moving[sentiment_col+"_std"]

In [28]:
#Defining function to calculate Savitzky Golay smoothed sentiment of given entity and medium
def calculate_savitzky_golay_smoothed_avg_std(df, entity, medium, interpolation_method, window_size, order, sentiment_col):
    """
    This function calculates the Savitzky Golay smoothed sentiment for a given entity and/or 
    newspaper. If the user does not want to filter the sentiment by entity and/or newspaper, the 
    parameters should be set to False. If the user does not want to return a specific sentiment 
    column, the parameter should be set to False.

    The numeric columns are aggregated per publication day ("pubday") to a daily average and a
    daily standard deviation. Days between the first and last publication day without a value are
    filled with interpolation_method, and days that still contain missing values are dropped.
    Every daily series is then smoothed with window_size and polynomial order. Returns the whole
    dataframe (columns "<name>_avg", "<name>_std" and "sample_size", which is 0 for days without
    articles) or the tuple (average, standard deviation) of sentiment_col.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]
    
    #Setting indices
    indices = pd.period_range(min(df_filtered["pubday"]), max(df_filtered["pubday"]))
    indices = indices.to_timestamp()
    
    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    avg = df_filtered.groupby("pubday").mean(numeric_only = True)
    avg.columns = [x + "_avg" for x in avg.columns]
    std = df_filtered.groupby("pubday").std(numeric_only = True)
    std.columns = [x + "_std" for x in std.columns]
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    
    #Creating timeseries with interpolation
    timeseries = pd.DataFrame(index = indices)
    timeseries = timeseries.join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method)
    timeseries = timeseries.dropna()
    
    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)
    
    #Performing Savitzky Golay smoothing
    for col in list(avg.columns) + list(std.columns):
        smoothed_values = signal.savgol_filter(timeseries[col], window_size, order)
        df_smoothed[col] = smoothed_values
    
    #Adding sample size
    df_smoothed = df_smoothed.join(sample_size).fillna(0)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_smoothed.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_smoothed
    else:
        return df_smoothed[sentiment_col+"_avg"], df_smoothed[sentiment_col+"_std"]

In [29]:
#Defining function to calculate kernel smoothed sentiment of given entity and medium
def calculate_kernel_smoothed_avg_std(df, entity, medium, interpolation_method, kernel, width, sentiment_col):
    """
    This function calculates the smoothed sentiment via a selected kernel for a given entity and/or 
    newspaper. If the user does not want to filter the sentiment by entity and/or newspaper, the 
    parameters should be set to False. If the user does not want to return a specific sentiment 
    column, the parameter should be set to False.

    The numeric columns are aggregated per publication day ("pubday") to a daily average and a
    daily standard deviation. Days between the first and last publication day without a value are
    filled with interpolation_method, and days that still contain missing values are dropped.
    Every daily series is then convolved with kernel(width). Returns the whole dataframe (columns
    "<name>_avg", "<name>_std" and "sample_size", which is 0 for days without articles) or the
    tuple (average, standard deviation) of sentiment_col.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]
    
    #Setting indices
    indices = pd.period_range(min(df_filtered["pubday"]), max(df_filtered["pubday"]))
    indices = indices.to_timestamp()
    
    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    avg = df_filtered.groupby("pubday").mean(numeric_only = True)
    avg.columns = [x + "_avg" for x in avg.columns]
    std = df_filtered.groupby("pubday").std(numeric_only = True)
    std.columns = [x + "_std" for x in std.columns]
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    
    #Creating timeseries with interpolation
    timeseries = pd.DataFrame(index = indices)
    timeseries = timeseries.join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method)
    timeseries = timeseries.dropna()
    
    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)
    
    #Calculating convolution (the kernel is built once and reused for every column)
    smoothing_kernel = kernel(width)
    for col in list(avg.columns) + list(std.columns):
        smoothed_values = convolve(timeseries[col], smoothing_kernel)
        df_smoothed[col] = smoothed_values
    
    #Adding sample size
    df_smoothed = df_smoothed.join(sample_size).fillna(0)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_smoothed.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_smoothed
    else:
        return df_smoothed[sentiment_col+"_avg"], df_smoothed[sentiment_col+"_std"]

In [30]:
#Defining function to calculate triangle smoothed sentiment of given entity and medium
def calculate_triangle_smoothed_avg_std(df, entity, medium, interpolation_method, degree, sentiment_col):
    """
    This function calculates the triangle smoothed sentiment for a given entity and/or 
    newspaper. If the user does not want to filter the sentiment by entity and/or newspaper, the 
    parameters should be set to False. If the user does not want to return a specific sentiment 
    column, the parameter should be set to False.

    The numeric columns are aggregated per publication day ("pubday") to a daily average and a
    daily standard deviation. Days between the first and last publication day without a value are
    filled with interpolation_method, and days that still contain missing values are dropped.
    Every daily series is then smoothed with a triangular window of 2 * degree + 1 days that is
    centred on the smoothed day. Returns the whole dataframe (columns "<name>_avg", "<name>_std"
    and "sample_size", which is 0 for days without articles) or the tuple (average, standard
    deviation) of sentiment_col. Raises a ValueError if degree is smaller than 1 or if the
    timeseries is shorter than one window.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]
    
    #Defining triangle average
    def triangle_smoothe(data, degree):
        if degree < 1:
            raise ValueError("degree must be at least 1")
        
        #Weights rise from 0 to degree and fall back to 0
        triangle = np.concatenate((np.arange(degree + 1), np.arange(degree)[::-1]))
        values = np.asarray(data, dtype = float)
        if len(values) < len(triangle):
            raise ValueError("the timeseries must contain at least 2 * degree + 1 days")
        
        #Weighted average of every complete window; entry j belongs to the day at position
        #j + degree (before, the result was stored degree / 2 days too early)
        centred = np.convolve(values, triangle / triangle.sum(), mode = "valid")

        #Handle boundaries by repeating the first and the last fully covered value
        return np.concatenate((np.full(degree, centred[0]), centred, np.full(degree, centred[-1])))

    #Setting indices
    indices = pd.period_range(min(df_filtered["pubday"]), max(df_filtered["pubday"]))
    indices = indices.to_timestamp()
    
    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    avg = df_filtered.groupby("pubday").mean(numeric_only = True)
    avg.columns = [x + "_avg" for x in avg.columns]
    std = df_filtered.groupby("pubday").std(numeric_only = True)
    std.columns = [x + "_std" for x in std.columns]
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    
    #Creating timeseries with interpolation
    timeseries = pd.DataFrame(index = indices)
    timeseries = timeseries.join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method)
    timeseries = timeseries.dropna()
    
    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)
    
    #Performing triangle smoothing
    for col in list(avg.columns) + list(std.columns):
        smoothed_values = triangle_smoothe(timeseries[col], degree)
        df_smoothed[col] = smoothed_values
    
    #Adding sample size
    df_smoothed = df_smoothed.join(sample_size).fillna(0)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_smoothed.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_smoothed
    else:
        return df_smoothed[sentiment_col+"_avg"], df_smoothed[sentiment_col+"_std"]

In [31]:
#Defining function to calculate LOWESS smoothed sentiment of given entity and medium
def calculate_lowess_smoothed_avg_std(df, entity, medium, interpolation_method, fraction, sentiment_col):
    """
    This function calculates the LOWESS smoothed sentiment for a given entity and/or 
    newspaper. If the user does not want to filter the sentiment by entity and/or newspaper, the 
    parameters should be set to False. If the user does not want to return a specific sentiment 
    column, the parameter should be set to False.

    The numeric columns are aggregated per publication day ("pubday") to a daily average and a
    daily standard deviation. Days between the first and last publication day without a value are
    filled with interpolation_method, and days that still contain missing values are dropped.
    Every daily series is then smoothed with LOWESS, where fraction is passed on as frac. Returns
    the whole dataframe (columns "<name>_avg", "<name>_std" and "sample_size", which is 0 for days
    without articles) or the tuple (average, standard deviation) of sentiment_col.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]

    #Setting indices
    indices = pd.period_range(min(df_filtered["pubday"]), max(df_filtered["pubday"]))
    indices = indices.to_timestamp()
    
    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    avg = df_filtered.groupby("pubday").mean(numeric_only = True)
    avg.columns = [x + "_avg" for x in avg.columns]
    std = df_filtered.groupby("pubday").std(numeric_only = True)
    std.columns = [x + "_std" for x in std.columns]
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    
    #Creating timeseries with interpolation
    timeseries = pd.DataFrame(index = indices)
    timeseries = timeseries.join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method)
    timeseries = timeseries.dropna()
    
    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)
    
    #Fitting LOWESS
    lowess = sm.nonparametric.lowess
    for col in list(avg.columns) + list(std.columns):
        smoothed_values = lowess(timeseries[col].values, timeseries[col].index, is_sorted = True, frac = fraction)
        #The second column of the result is kept as the smoothed series
        smoothed_values = smoothed_values[:,1]
        df_smoothed[col] = smoothed_values
    
    #Adding sample size
    df_smoothed = df_smoothed.join(sample_size).fillna(0)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_smoothed.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_smoothed
    else:
        return df_smoothed[sentiment_col+"_avg"], df_smoothed[sentiment_col+"_std"]

In [32]:
#Defining function to calculate exponentially smoothed sentiment of given entity and medium
def calculate_exp_smoothed_avg_std(df, entity, medium, exp_smoothing_method, interpolation_method, smoothing_level, sentiment_col):
    """
    This function calculates the exponentially smoothed average sentiment for a given entity and/or 
    newspaper. If the user does not want to filter the sentiment by entity and/or newspaper, the 
    parameters should be set to False. If the user does not want to return a specific sentiment 
    column, the parameter should be set to False.

    The numeric columns are aggregated per publication day ("pubday") to a daily average and a
    daily standard deviation. Days between the first and last publication day without a value are
    filled with interpolation_method, and days that still contain missing values are dropped.
    For every daily series, exp_smoothing_method is called with the series, fitted with the given
    smoothing_level, and its fitted values are kept. Returns the whole dataframe (columns
    "<name>_avg", "<name>_std" and "sample_size", which is 0 for days without articles) or the
    tuple (average, standard deviation) of sentiment_col.
    """
    #Filtering dataframe (False means that no filter is applied)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]
    
    #Setting indices
    indices = pd.period_range(min(df_filtered["pubday"]), max(df_filtered["pubday"]))
    indices = indices.to_timestamp()
    
    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly because pandas >= 2.0 raises on text columns instead of dropping them
    avg = df_filtered.groupby("pubday").mean(numeric_only = True)
    avg.columns = [x + "_avg" for x in avg.columns]
    std = df_filtered.groupby("pubday").std(numeric_only = True)
    std.columns = [x + "_std" for x in std.columns]
    sample_size = df_filtered.groupby("pubday").size()
    sample_size.name = "sample_size"
    
    #Creating timeseries with interpolation
    timeseries = pd.DataFrame(index = indices)
    timeseries = timeseries.join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method)
    timeseries = timeseries.dropna()
    
    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)
    
    #Fitting exponential smoothing and fetching values
    for col in list(avg.columns) + list(std.columns):
        model = exp_smoothing_method(timeseries[col]).fit(smoothing_level = smoothing_level)
        smoothed_values = model.fittedvalues
        df_smoothed[col] = smoothed_values
    
    #Adding sample size
    df_smoothed = df_smoothed.join(sample_size).fillna(0)
    
    #Dropping ID (ignored if the dataframe has no numeric "id" column)
    df_smoothed.drop(["id_avg", "id_std"], axis = 1, inplace = True, errors = "ignore")
    
    #Returning information
    if sentiment_col == False:
        return df_smoothed
    else:
        return df_smoothed[sentiment_col+"_avg"], df_smoothed[sentiment_col+"_std"]

### Section 3.2: <a class="anchor" id="section_3_2"></a> TextBlob

In [35]:
#Saving timeseries outputs for all smoothing methods
#All series use the clause-level sentiment of all entities and media; only the averages are kept
moving_avg, _ = calculate_moving_avg_std(
    df_textblob, entity = False, medium = False, window_size = 50, sentiment_col = "clause_sentiment")
sg_avg, _ = calculate_savitzky_golay_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", window_size = 201, order = 2,
    sentiment_col = "clause_sentiment")
box_avg, _ = calculate_kernel_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", kernel = Box1DKernel, width = 50,
    sentiment_col = "clause_sentiment")
gaussian_avg, _ = calculate_kernel_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", kernel = Gaussian1DKernel, width = 30,
    sentiment_col = "clause_sentiment")
trapezoid_avg, _ = calculate_kernel_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", kernel = Trapezoid1DKernel, width = 60,
    sentiment_col = "clause_sentiment")
triangle_avg, _ = calculate_triangle_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", degree = 50,
    sentiment_col = "clause_sentiment")
lowess_avg, _ = calculate_lowess_smoothed_avg_std(
    df_textblob, entity = False, medium = False, interpolation_method = "time", fraction = 0.1,
    sentiment_col = "clause_sentiment")
simple_exp_avg, _ = calculate_exp_smoothed_avg_std(
    df_textblob, entity = False, medium = False, exp_smoothing_method = SimpleExpSmoothing,
    interpolation_method = "time", smoothing_level = 0.025, sentiment_col = "clause_sentiment")
holt_avg, _ = calculate_exp_smoothed_avg_std(
    df_textblob, entity = False, medium = False, exp_smoothing_method = Holt,
    interpolation_method = "time", smoothing_level = 0.025, sentiment_col = "clause_sentiment")

In [36]:
#Creating timeseries dataframe
#One column per smoothing method, aligned to the daily index of the Savitzky Golay series
timeseries_labels = ["moving", "savitzky_golay", "box", "gaussian", "trapezoid", "triangle", "lowess", "simple_exp", "holt"]
timeseries = [moving_avg, sg_avg, box_avg, gaussian_avg, trapezoid_avg, triangle_avg, lowess_avg, simple_exp_avg, holt_avg]
df_textblob_timeseries = pd.DataFrame(data = dict(zip(timeseries_labels, timeseries)), index = sg_avg.index)

In [37]:
#Saving to CSV
#Writing through a path relative to the notebook folder instead of changing the working
#directory, so a failed write cannot leave the kernel in the wrong folder
df_textblob_timeseries.to_csv(os.path.join("..", "..", "Outputs", "Articles", "Lexicon", "textblob_timeseries.csv"))

### Section 3.3: <a class="anchor" id="section_3_3"></a> SentiWS

In [39]:
#Saving timeseries outputs for all smoothing methods
#All series use the clause-level sentiment of all entities and media; only the averages are kept
moving_avg, _ = calculate_moving_avg_std(df_sentiws, False, False, 50, "clause_sentiment")
sg_avg, _ = calculate_savitzky_golay_smoothed_avg_std(df_sentiws, False, False, "time", 201, 2, "clause_sentiment")
box_avg, _ = calculate_kernel_smoothed_avg_std(df_sentiws, False, False, "time", Box1DKernel, 50, "clause_sentiment")
gaussian_avg, _ = calculate_kernel_smoothed_avg_std(df_sentiws, False, False, "time", Gaussian1DKernel, 30, "clause_sentiment")
trapezoid_avg, _ = calculate_kernel_smoothed_avg_std(df_sentiws, False, False, "time", Trapezoid1DKernel, 60, "clause_sentiment")
triangle_avg, _ = calculate_triangle_smoothed_avg_std(df_sentiws, False, False, "time", 50, "clause_sentiment")
lowess_avg, _ = calculate_lowess_smoothed_avg_std(df_sentiws, False, False, "time", 0.1, "clause_sentiment")
#The result must be bound to simple_exp_avg: with the former name "imple_exp_avg" the dataframe built
#from simple_exp_avg afterwards silently reused the TextBlob series
simple_exp_avg, _ = calculate_exp_smoothed_avg_std(df_sentiws, False, False, SimpleExpSmoothing, "time", 0.025, "clause_sentiment")
holt_avg, _ = calculate_exp_smoothed_avg_std(df_sentiws, False, False, Holt, "time", 0.025, "clause_sentiment")

In [40]:
#Creating timeseries dataframe
#One column per smoothing method, aligned to the daily index of the Savitzky Golay series
timeseries_labels = ["moving", "savitzky_golay", "box", "gaussian", "trapezoid", "triangle", "lowess", "simple_exp", "holt"]
timeseries = [moving_avg, sg_avg, box_avg, gaussian_avg, trapezoid_avg, triangle_avg, lowess_avg, simple_exp_avg, holt_avg]
df_sentiws_timeseries = pd.DataFrame(data = dict(zip(timeseries_labels, timeseries)), index = sg_avg.index)

In [41]:
#Saving to CSV
#Writing through a path relative to the notebook folder instead of changing the working
#directory, so a failed write cannot leave the kernel in the wrong folder
df_sentiws_timeseries.to_csv(os.path.join("..", "..", "Outputs", "Articles", "Lexicon", "sentiws_timeseries.csv"))