In [86]:
# import the packages
import ast

import pandas as pd

In [87]:
# import the corpus
text = pd.read_csv('cleaned_corpus.csv')
# The 'corpus' column is stored as the string repr of a Python list.
# ast.literal_eval only parses literals, so a malformed or malicious cell
# cannot execute arbitrary code the way the built-in eval() would.
text['corpus'] = text['corpus'].apply(ast.literal_eval)

In [88]:
# keep only the columns the scoring functions read: article date and tokens
df = text.loc[:, ['Date', 'corpus']]

## 1. SemAxis, pre-trained embedding

In [109]:
# define the functions to be used
def polar_dict1(path, word_col='Word', score_col='Sim'):
    """Load a polarity lexicon from a CSV file as a word -> score dictionary.

    Parameters
    ----------
    path : str or path-like
        CSV file to read with ``pd.read_csv``.
    word_col : str, default 'Word'
        Column holding the lexicon words (used as dictionary keys).
    score_col : str, default 'Sim'
        Column holding the score of each word (used as dictionary values).

    Returns
    -------
    dict
        Mapping word -> score. Rows containing a NaN in any column are
        dropped; if a word occurs more than once, the last row wins.
    """
    polar = pd.read_csv(path).dropna()  # remove rows with NaN values
    return dict(zip(polar[word_col], polar[score_col]))

# calculate the sentiment of the articles
def calculate_polar1(path, text, po=None):
    """Sum the lexicon scores of the tokens of one article.

    Parameters
    ----------
    path : str or path-like
        Lexicon CSV, loaded with ``polar_dict1`` when ``po`` is not given.
    text : iterable
        Tokens of the article.
    po : dict, optional
        Already-loaded word -> score mapping. Passing it avoids re-reading
        the CSV on every call (e.g. once per article); ``path`` is then
        ignored.

    Returns
    -------
    number
        Sum of the scores of all tokens found in the lexicon. Tokens that
        are not in the lexicon contribute 0; an empty ``text`` gives 0.
    """
    if po is None:
        po = polar_dict1(path)
    # Missing words are handled explicitly with a default of 0 instead of
    # catching the TypeError raised by adding None.
    return sum(po.get(token, 0) for token in text)

# create the dataframe of the sentiment series
def senti(path, Embedding, GType, BType, Method):
    """Build the daily sentiment series for one SemAxis lexicon.

    Reads the module-level ``df`` (columns 'Date' and 'corpus'), scores
    every article with the lexicon stored at ``path`` and averages the
    article scores per date.

    NOTE: a second function named ``senti`` is defined later in this
    notebook for the SentiProp lexicons; whichever definition cell ran last
    is the one in effect.

    Parameters
    ----------
    path : str or path-like
        Lexicon CSV understood by ``polar_dict1``.
    Embedding, GType, BType, Method
        Labels copied verbatim into the columns of the same name.

    Returns
    -------
    pandas.DataFrame
        One row per date with columns Date, Score, Embedding, GType, BType
        and Method.
    """
    # Load the lexicon once; the previous version re-read the CSV for every
    # single article inside ``apply``.
    po = polar_dict1(path)

    # sentiment of each article: sum of the scores of its known tokens
    scores = df['corpus'].apply(lambda tokens: sum(po.get(tok, 0) for tok in tokens))

    df1 = pd.DataFrame({'Date': df['Date'], 'Score': scores})
    df1 = df1.groupby(by=['Date'], as_index=False).mean()  # daily sentiment

    df1['Embedding'] = Embedding
    df1['GType'] = GType
    df1['BType'] = BType
    df1['Method'] = Method

    return df1
    
    

In [110]:
# F + F, pre-trained embedding
df11 = senti(path='SemAxis11.csv', Embedding='PT', GType='F', BType='F', Method='SemAxis')

In [111]:
# preview the first five rows of the daily series
df11.head(n=5)

Unnamed: 0,Date,Score,Embedding,GType,BType,Method
0,2018-10-01,0.118092,PT,F,F,SemAxis
1,2018-10-02,1.1627,PT,F,F,SemAxis
2,2018-10-03,1.357619,PT,F,F,SemAxis
3,2018-10-04,0.706133,PT,F,F,SemAxis
4,2018-10-05,0.697959,PT,F,F,SemAxis


In [112]:
# NF + F, pre-trained embedding
df12 = senti(path='SemAxis12.csv', Embedding='PT', GType='NF', BType='F', Method='SemAxis')

In [113]:
# preview the first five rows of the daily series
df12.head(n=5)

Unnamed: 0,Date,Score,Embedding,GType,BType,Method
0,2018-10-01,0.026723,PT,NF,F,SemAxis
1,2018-10-02,1.090884,PT,NF,F,SemAxis
2,2018-10-03,1.041153,PT,NF,F,SemAxis
3,2018-10-04,0.487735,PT,NF,F,SemAxis
4,2018-10-05,0.511493,PT,NF,F,SemAxis


In [114]:
# F + NF, pre-trained embedding
df13 = senti(path='SemAxis13.csv', Embedding='PT', GType='F', BType='NF', Method='SemAxis')

In [116]:
# NF + NF, pre-trained embedding
df14 = senti(path='SemAxis14.csv', Embedding='PT', GType='NF', BType='NF', Method='SemAxis')

## 2. SemAxis, self-trained embedding

In [117]:
# F + F, self-trained embedding
df21 = senti(path='SemAxis21.csv', Embedding='T', GType='F', BType='F', Method='SemAxis')

In [118]:
# NF + F, self-trained embedding
df22 = senti(path='SemAxis22.csv', Embedding='T', GType='NF', BType='F', Method='SemAxis')

In [119]:
# F + NF, self-trained embedding
df23 = senti(path='SemAxis23.csv', Embedding='T', GType='F', BType='NF', Method='SemAxis')

In [120]:
# NF + NF, self-trained embedding
df24 = senti(path='SemAxis24.csv', Embedding='T', GType='NF', BType='NF', Method='SemAxis')

## 3. SentiProp, pre-trained embedding

In [121]:
# define the functions to be used
def polar_dict2(path, word_col='words', score_col='polarity'):
    """Load a polarity lexicon from a CSV file as a word -> score dictionary.

    Parameters
    ----------
    path : str or path-like
        CSV file to read with ``pd.read_csv``.
    word_col : str, default 'words'
        Column holding the lexicon words (used as dictionary keys).
    score_col : str, default 'polarity'
        Column holding the score of each word (used as dictionary values).

    Returns
    -------
    dict
        Mapping word -> score. Rows containing a NaN in any column are
        dropped; if a word occurs more than once, the last row wins.
    """
    polar = pd.read_csv(path).dropna()  # remove rows with NaN values
    return dict(zip(polar[word_col], polar[score_col]))

# calculate the sentiment of the articles
def calculate_polar2(path, text, po=None):
    """Sum the lexicon scores of the tokens of one article.

    Parameters
    ----------
    path : str or path-like
        Lexicon CSV, loaded with ``polar_dict2`` when ``po`` is not given.
    text : iterable
        Tokens of the article.
    po : dict, optional
        Already-loaded word -> score mapping. Passing it avoids re-reading
        the CSV on every call (e.g. once per article); ``path`` is then
        ignored.

    Returns
    -------
    number
        Sum of the scores of all tokens found in the lexicon. Tokens that
        are not in the lexicon contribute 0; an empty ``text`` gives 0.
    """
    if po is None:
        po = polar_dict2(path)
    # Missing words are handled explicitly with a default of 0 instead of
    # catching the TypeError raised by adding None.
    return sum(po.get(token, 0) for token in text)


# create the dataframe of the sentiment series
def senti(path, Embedding, GType, BType, Method):
    """Build the daily sentiment series for one SentiProp lexicon.

    Reads the module-level ``df`` (columns 'Date' and 'corpus'), scores
    every article with the lexicon stored at ``path`` and averages the
    article scores per date.

    NOTE: this definition replaces the earlier ``senti`` used for the
    SemAxis lexicons. Once this cell has run, calling ``senti`` on a
    SemAxis CSV goes through ``polar_dict2`` and its 'words'/'polarity'
    columns, so the SemAxis cells must be executed before this one.

    Parameters
    ----------
    path : str or path-like
        Lexicon CSV understood by ``polar_dict2``.
    Embedding, GType, BType, Method
        Labels copied verbatim into the columns of the same name.

    Returns
    -------
    pandas.DataFrame
        One row per date with columns Date, Score, Embedding, GType, BType
        and Method.
    """
    # Load the lexicon once; the previous version re-read the CSV for every
    # single article inside ``apply``.
    po = polar_dict2(path)

    # sentiment of each article: sum of the scores of its known tokens
    scores = df['corpus'].apply(lambda tokens: sum(po.get(tok, 0) for tok in tokens))

    df1 = pd.DataFrame({'Date': df['Date'], 'Score': scores})
    df1 = df1.groupby(by=['Date'], as_index=False).mean()  # daily sentiment

    df1['Embedding'] = Embedding
    df1['GType'] = GType
    df1['BType'] = BType
    df1['Method'] = Method

    return df1

In [122]:
# F + F, pre-trained embedding
df31 = senti(path='SentiProp11.csv', Embedding='PT', GType='F', BType='F', Method='SentiProp')

In [123]:
# NF + F, pre-trained embedding
df32 = senti(path='SentiProp12.csv', Embedding='PT', GType='NF', BType='F', Method='SentiProp')

In [124]:
# F + NF, pre-trained embedding
df33 = senti(path='SentiProp13.csv', Embedding='PT', GType='F', BType='NF', Method='SentiProp')

In [125]:
# NF + NF, pre-trained embedding
df34 = senti(path='SentiProp14.csv', Embedding='PT', GType='NF', BType='NF', Method='SentiProp')

## 4. SentiProp, self-trained embedding

In [126]:
# F + F, self-trained embedding
df41 = senti(path='SentiProp21.csv', Embedding='T', GType='F', BType='F', Method='SentiProp')

In [127]:
# NF + F, self-trained embedding
df42 = senti(path='SentiProp22.csv', Embedding='T', GType='NF', BType='F', Method='SentiProp')

In [128]:
# F + NF, self-trained embedding
df43 = senti(path='SentiProp23.csv', Embedding='T', GType='F', BType='NF', Method='SentiProp')

In [129]:
# NF + NF, self-trained embedding
df44 = senti(path='SentiProp24.csv', Embedding='T', GType='NF', BType='NF', Method='SentiProp')

In [130]:
# concatenate all sixteen daily series into one long table
dfs = [df11, df12, df13, df14, df21, df22, df23, df24, df31, df32, df33, df34, df41, df42, df43, df44]
# ignore_index gives the combined frame a fresh 0..n-1 index; without it
# every input frame contributes its own 0..k labels, leaving duplicates.
result = pd.concat(dfs, ignore_index=True)

In [133]:
# order the combined table by date, then by method
res = result.sort_values(['Date', 'Method'])

In [135]:
# write the sorted table to disk without the index column
res.to_csv(path_or_buf='WSJ_sentiment.csv', index=False)