## Table of Contents

* [Chapter 1](#chapter1): Generating Timeseries

## Chapter 1: <a class="anchor" id="chapter1"></a> Generating Timeseries

In [1]:
#Performing required installations

In [2]:
#Importing libraries
#Data processing
import pandas as pd
import numpy as np
import re

#Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches

#Timeseries and date handling
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt 
from scipy import signal
from scipy.fftpack import fft, fftshift
from math import factorial
from astropy.convolution import convolve, Box1DKernel, Gaussian1DKernel, Trapezoid1DKernel

#Stats
import statsmodels.api as sm
import scipy.stats
import math

#Other
import os
from tqdm import tqdm
import warnings
tqdm.pandas()

In [3]:
#Silencing all warnings for the rest of the session
#(the "ignore" action is installed without a category, so it applies to every warning)
warnings.simplefilter("ignore")

In [4]:
#Reading in CSVs
#The files are addressed relative to the notebook folder (<project root>/Notebooks/Articles)
#instead of changing the working directory back and forth: a failed read or a re-run of this
#cell can then no longer leave the kernel in a different folder
project_root = os.path.join("..", "..")
df = pd.read_csv(
    os.path.join(project_root, "Outputs", "Articles", "df_final.csv"),
    index_col = 0,
    parse_dates = ["pubtime", "pubday", "pubmonth"],
)

df_entities = pd.read_csv(os.path.join(project_root, "Inputs", "Articles", "entities.csv"))

In [5]:
#Collecting the distinct values of "designed_entity", in order of first appearance
entities = df_entities["designed_entity"].unique().tolist()
entities

['Ueli_Maurer',
 'Guy_Parmelin',
 'Ignazio_Cassis',
 'Karin_Keller_Sutter',
 'Simonetta_Sommaruga',
 'Alain_Berset',
 'Viola_Amherd',
 'Bundesrat',
 'Tanja_Stadler',
 'Marcel_Tanner',
 'Martin_Ackermann',
 'Matthias_Egger',
 'Taskforce',
 'Christoph_Berger',
 'EKIF',
 'Stefan_Kuster',
 'Pascal_Strupler',
 'Virginie_Masserey',
 'Anne_Levy',
 'Patrick_Mathys',
 'Marcel_Salathe',
 'Daniel_Koch',
 'BAG',
 'Swissmedic',
 'Lukas_Engelberger',
 'GDK',
 'SVP',
 'SP',
 'FDP',
 'Die_Mitte',
 'Die_Gruene',
 'Gruenliberale',
 'Juso',
 'Befuerworter',
 'Ja_Lager',
 'Gegner',
 'Leugner',
 'Skeptiker',
 'Kritiker',
 'Opposition',
 'Nein_Lager',
 'Demonstranten',
 'Freunde_Der_Verfassung',
 'Mass_Voll']

In [6]:
#Collecting the distinct values of "medium_name", in order of first appearance
media = df["medium_name"].unique().tolist()
media

['20_Minuten',
 'Basler_Zeitung',
 'Swissinfo',
 'SRF',
 'Tages_Anzeiger',
 'Berner_Zeitung',
 'NZZ',
 'Cash',
 'Zuercher_Unterlaender',
 'Blick',
 'Der_Bund',
 'Solothurner_Zeitung',
 'Grenchner_Tagblatt',
 'Aargauer_Zeitung',
 'Werdenberger_&_Obertoggenburger',
 'Urner_Zeitung',
 'Appenzeller_Zeitung',
 'Schweizer_Illustrierte',
 'Handelszeitung',
 'Thurgauer_Zeitung',
 'Zuerichsee_Zeitung',
 'Landbote',
 'Zuger_Zeitung',
 'Limmattaler_Zeitung',
 'Luzerner_Zeitung',
 'Langenthaler_Tagblatt',
 'Zofinger_Tagblatt',
 'Badener_Tagblatt',
 'BZ_Basel',
 'Oltner_Tagblatt',
 'St._Galler_Tagblatt',
 'Nidwaldner_Zeitung',
 'Thuner_Tagblatt',
 'Obwaldner_Zeitung',
 'Berner_Oberlaender',
 'Toggenburger_Tagblatt',
 'Finanz_und_Wirtschaft',
 'Thalwiler_Anzeiger',
 'Die_Wochenzeitung',
 'Beobachter',
 'Bilanz',
 'Schweizer_Familie',
 'Glueckspost',
 'Tele',
 'TV_Star',
 'Das_Magazin',
 'Zuger_Presse',
 'Zugerbieter']

In [13]:
#Defining function to calculate kernel LOWESS smoothed sentiment of given entity and medium
def calculate_kernel_lowess_smoothed_avg(df, entity, medium, interpolation_method, kernel, width, fraction, sentiment_col):
    """
    Calculates a daily timeseries of the numeric columns of df, smoothed first by a kernel
    convolution and then by LOWESS, optionally restricted to one entity and/or one medium.

    Steps: filter the rows, take the mean and standard deviation per "pubday", put them on a
    gap-free daily index, interpolate the missing days, convolve every column with kernel(width),
    fit LOWESS on the convolved values, and attach the number of rows per "pubmonth".

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "pubday" and "pubmonth"; "entity_name" / "medium_name" are needed when
        the matching filter is used. All numeric columns are aggregated; non-numeric columns
        are ignored.
    entity : str or False
        Value of "entity_name" to keep, or False to keep all entities.
    medium : str or False
        Value of "medium_name" to keep, or False to keep all media.
    interpolation_method : str
        Passed on as `method` to DataFrame.interpolate.
    kernel : callable
        Called as kernel(width); the result is handed to astropy's convolve as the kernel.
    width
        Argument for `kernel`.
    fraction : float
        Passed on as `frac` to statsmodels' lowess.
    sentiment_col : str or False
        Name of the (unsuffixed) column to return, or False to return every smoothed column.

    Returns
    -------
    pandas.DataFrame
        Indexed by day. If sentiment_col is False: all "<column>_avg" and "<column>_std" columns
        (without "id_avg" / "id_std") plus "monthly_sample". Otherwise only
        "<sentiment_col>_avg" and "monthly_sample".

    Raises
    ------
    ValueError
        If no rows are left after filtering.
    """
    #Filtering dataframe (False is the "no filter" value for both arguments)
    df_filtered = df
    if medium != False:
        df_filtered = df_filtered[df_filtered["medium_name"] == medium]
    if entity != False:
        df_filtered = df_filtered[df_filtered["entity_name"] == entity]

    #Failing early with a readable message instead of an opaque error from min()/max()
    if df_filtered.empty:
        raise ValueError(f"No rows left after filtering by entity={entity!r} and medium={medium!r}.")

    #Setting indices: one timestamp per day between the first and the last publication day
    indices = pd.period_range(df_filtered["pubday"].min(), df_filtered["pubday"].max())
    indices = indices.to_timestamp()

    #Taking daily average, standard deviation, and sample size
    #numeric_only is set explicitly: since pandas 2.0 the default no longer drops text/date
    #columns silently but raises a TypeError. The standard deviation is restricted to the very
    #same columns so that both aggregations always agree.
    grouped = df_filtered.groupby("pubday")
    avg = grouped.mean(numeric_only = True)
    numeric_cols = list(avg.columns)
    std = grouped[numeric_cols].std()
    avg.columns = [x + "_avg" for x in numeric_cols]
    std.columns = [x + "_std" for x in numeric_cols]
    monthly_sample = df_filtered.groupby("pubmonth").size().rename("monthly_sample")

    #Creating timeseries with interpolation
    #Rows that still contain a missing value after interpolation are removed
    timeseries = pd.DataFrame(index = indices).join(avg).join(std)
    timeseries = timeseries.interpolate(method = interpolation_method).dropna()

    #Creating smoothed dataframe
    df_smoothed = pd.DataFrame(index = timeseries.index)

    #Calculating convolution and fitting LOWESS on the convolved values, column by column
    lowess = sm.nonparametric.lowess
    for col in timeseries.columns:
        df_smoothed[col] = convolve(timeseries[col], kernel(width))
        fitted = lowess(df_smoothed[col].values, df_smoothed[col].index, is_sorted = True, frac = fraction)
        #Second column of the LOWESS result holds the fitted values
        df_smoothed[col] = fitted[:, 1]

    #Adding sample size: every day gets the number of rows of its month, 0 if the month has none
    df_smoothed["pubmonth"] = pd.to_datetime(df_smoothed.index.strftime("%Y-%m"))
    df_smoothed = df_smoothed.join(monthly_sample, on = "pubmonth").drop(columns = "pubmonth")
    df_smoothed["monthly_sample"] = df_smoothed["monthly_sample"].fillna(0)

    #Dropping ID (tolerating dataframes that have no numeric "id" column)
    df_smoothed = df_smoothed.drop(columns = ["id_avg", "id_std"], errors = "ignore")

    #Returning information
    if sentiment_col == False:
        return df_smoothed
    return df_smoothed[[sentiment_col + "_avg", "monthly_sample"]]

In [14]:
#Defining function to assemble the smoothed timeseries of the total and of every entity
def generate_timeseries(func, kernel, width, fraction, df, sentiment_col, entities):
    """
    Builds one dataframe holding the smoothed timeseries of the whole dataset and of each entity.

    func is called once without any filter and once per entity (always without a medium filter
    and with "time" interpolation). From every result the averaged column and "monthly_sample"
    are taken over.

    Parameters
    ----------
    func : callable
        Called as func(df, entity, medium, interpolation_method, kernel, width, fraction,
        sentiment_col); must return a dataframe containing "monthly_sample" and the averaged
        column described below.
    kernel, width, fraction
        Handed through to func unchanged.
    df : pandas.DataFrame
        Handed through to func unchanged.
    sentiment_col : str or False
        Handed through to func. The averaged column read from each result is
        sentiment_col + "_avg", or "sentiment_avg" if sentiment_col is False.
    entities : iterable of str
        Entities to add; each one yields the columns "<entity>" and "<entity>_sample".

    Returns
    -------
    pandas.DataFrame
        Columns "total", "total_sample", then two columns per entity. The index is the index of
        the unfiltered result; entity values are aligned to it (missing days become NaN).
    """
    #Deriving the column name from sentiment_col instead of hard-coding "sentiment_avg",
    #which raised a KeyError for every other sentiment column
    avg_col = "sentiment_avg" if sentiment_col == False else sentiment_col + "_avg"

    total_df = func(df, False, False, "time", kernel, width, fraction, sentiment_col)
    columns = {"total": total_df[avg_col], "total_sample": total_df["monthly_sample"]}
    for entity in entities:
        entity_df = func(df, entity, False, "time", kernel, width, fraction, sentiment_col)
        columns[entity] = entity_df[avg_col]
        columns[entity + "_sample"] = entity_df["monthly_sample"]

    #Building the dataframe in one go; passing the index aligns every series to the total index
    return pd.DataFrame(columns, index = total_df.index)

In [15]:
#Keeping only the first row of every (sentence, entity) combination
dedup_keys = ["sentence_ABSA", "entity_name"]
df = df.drop_duplicates(subset = dedup_keys, keep = "first")

In [16]:
#Generating the smoothed sentiment timeseries for the total and for every entity,
#using a box kernel of width 45 followed by LOWESS with frac = 0.075
kernel_lowess_smoothed_df = generate_timeseries(
    func = calculate_kernel_lowess_smoothed_avg,
    kernel = Box1DKernel,
    width = 45,
    fraction = 0.075,
    df = df,
    sentiment_col = "sentiment",
    entities = entities,
)

In [17]:
#Saving to CSV
#The target is addressed relative to the notebook folder (<project root>/Notebooks/Articles)
#instead of changing the working directory, so a failed write or a re-run of this cell
#cannot leave the kernel in a different folder
kernel_lowess_smoothed_df.to_csv(
    os.path.join("..", "..", "Outputs", "Articles", "df_kernel_lowess_timeseries_final.csv")
)