# Labeled text sentiment

This notebook scores the labeled sentences with lexicon keyword counts and with two Python-based sentiment packages: Stanza and Pattern.

## Load data

In [1]:
import pandas as pd
import numpy as np
# Read the hand-labeled sentences and print the raw column summary.
df = pd.read_csv("data-raw/sentence-labels.csv")
df.info()

# Keep only the rows that carry an overall sentiment label, then print the
# summary again so the effect of the filter is visible.
df = df.dropna(subset=['overall-sentiment'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               1499 non-null   object
 1   aspect             1387 non-null   object
 2   overall-sentiment  1493 non-null   object
 3   aspect1-sentiment  1386 non-null   object
 4   aspect2-sentiment  556 non-null    object
 5   aspect3-sentiment  207 non-null    object
 6   aspect4-sentiment  96 non-null     object
 7   aspect5-sentiment  45 non-null     object
dtypes: object(8)
memory usage: 93.8+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1493 entries, 0 to 1498
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               1493 non-null   object
 1   aspect             1382 non-null   object
 2   overall-sentiment  1493 non-null   object
 3   aspect1-sentiment  1381 non-null   ob

## Count keywords

In [2]:
# Superseded loader: use the next code cell instead.
# Everything below is wrapped in a triple-quoted string, so none of it runs;
# the only effect of this cell is to echo that string as its output. The
# snippet read both keyword lists from the Excel workbook rather than from
# the CSV files.
"""
kw_df = pd.read_excel('data-raw/Keywords for lexicon-based sentiment classifier.xlsx',
                      sheet_name = 'Word Delineations')

#neg_keywords=pd.read_csv('data-raw/negative_keywords.csv').iloc[::,0]
neg_keywords = kw_df['Negative '].dropna().str.strip().str.lower().tolist()

print(type(neg_keywords))
print(neg_keywords[:5])

#pos_keywords=pd.read_csv('data-raw/positive_keywords.csv').iloc[::,0]
pos_keywords = kw_df['Positive '].dropna().str.strip().str.lower().tolist()
print(pos_keywords[:5])

kw = {'pos': pos_keywords,
      'neg': neg_keywords,
      'all': neg_keywords + pos_keywords}

# We need to have the longer phrases first to ensure that nested keywords
# are not detected multiple times. Sort by descending word count.
kw['all'].sort(key = lambda x: len(x.split()), reverse = True)
"""
# Leftover debugging line, kept disabled.
#keywords['all']

"\nkw_df = pd.read_excel('data-raw/Keywords for lexicon-based sentiment classifier.xlsx',\n                      sheet_name = 'Word Delineations')\n\n#neg_keywords=pd.read_csv('data-raw/negative_keywords.csv').iloc[::,0]\nneg_keywords = kw_df['Negative '].dropna().str.strip().str.lower().tolist()\n\nprint(type(neg_keywords))\nprint(neg_keywords[:5])\n\n#pos_keywords=pd.read_csv('data-raw/positive_keywords.csv').iloc[::,0]\npos_keywords = kw_df['Positive '].dropna().str.strip().str.lower().tolist()\nprint(pos_keywords[:5])\n\nkw = {'pos': pos_keywords,\n      'neg': neg_keywords,\n      'all': neg_keywords + pos_keywords}\n\n# We need to have the longer phrases first to ensure that nested keywords\n# are not detected multiple times. Sort by descending word count.\nkw['all'].sort(key = lambda x: len(x.split()), reverse = True)\n"

In [3]:
# Load the negative and positive keyword lexicons and combine them.
# Files are downloaded from https://drive.google.com/drive/u/0/folders/1ZdED366Wa2Hrbd2VSz7zbvRJ2iJZ1S81
# Dated November 29, 2020


def _read_keyword_column(csv_path):
    """Return the first column of `csv_path` as stripped, lowercased strings.

    Missing values are dropped before the string clean-up.
    """
    first_column = pd.read_csv(csv_path).iloc[:, 0]
    return first_column.dropna().str.strip().str.lower().tolist()


def _word_count(phrase):
    """Number of whitespace-separated words in `phrase`."""
    return len(phrase.split())


neg_keywords = _read_keyword_column('data-raw/negative_keywords.csv')
pos_keywords = _read_keyword_column('data-raw/positive_keywords.csv')

# Nested [negative, positive] list, kept under its original name.
all_keywords = [neg_keywords, pos_keywords]

# Flat list of every keyword, phrases with the most words first.
keywords = sorted(
    (phrase.lower() for group in all_keywords for phrase in group),
    key=_word_count,
    reverse=True,
)

# Longer phrases must come first in 'all' so that a keyword nested inside a
# longer phrase is not detected a second time; the sort is by descending
# word count and is stable, so ties keep the negative-then-positive order.
kw = {
    'pos': pos_keywords,
    'neg': neg_keywords,
    'all': sorted(neg_keywords + pos_keywords, key=_word_count, reverse=True),
}
kw['all'][0:5]

['does not enjoy',
 'does not want',
 'not a candidate',
 'not well controlled',
 'not well treated']

In [4]:
import re

# Copied from MIMIC_003_keyword_labelling.ipynb
def find_keywords_final(text, keywords = kw['all']):
    """Find which keywords occur in `text` as whole words or phrases.

    Parameters
    ----------
    text : any
        Input to search. It is converted with ``str()`` and lowercased, so a
        non-string value (for example a list of sentences) is searched via
        its string representation.
    keywords : list of str
        Literal keywords/phrases, expected in lowercase and ordered with the
        longest phrases first. The default is evaluated once, when this
        function is defined.

    Returns
    -------
    list
        ``[lowercased_text, found_keywords]`` where ``found_keywords`` keeps
        the order in which the keywords were accepted.

    Notes
    -----
    A keyword is skipped when it is a substring of a keyword that was already
    accepted (e.g. "concern" after "concerning"); with longest-first ordering
    this stops nested keywords from being reported twice. The check is made
    against each accepted keyword separately, so a phrase is never suppressed
    by text that only exists across two neighbouring accepted keywords.
    """
    found_keywords = []
    text = str(text).lower()
    for keyword in keywords:
        # Keywords are literal phrases, not patterns: escape them so regex
        # metacharacters in a keyword cannot change or break the match.
        pattern = r'\b{}\b'.format(re.escape(keyword))
        if not re.search(pattern, text):
            continue
        if any(keyword in accepted for accepted in found_keywords):
            continue
        found_keywords.append(keyword)
    return [text, found_keywords]

In [5]:
# Enable IPython's autoreload extension in mode 2, so imported modules (such
# as the local `keywords` module imported in the next cell) are reloaded
# before code runs and edits to them take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [6]:
from keywords import find_keywords2

In [7]:
%%time

# Run the project's find_keywords2 helper on every text, passing the combined
# (positive + negative) lexicon. Judging by the preview in the next cell, each
# result is a dict mapping a matched keyword to a list of (start, end) tuples
# -- presumably character offsets; confirm against the keywords module.
df['find_keywords'] = df.text.apply(find_keywords2, keywords = kw['all'])

CPU times: user 4.47 s, sys: 3.2 ms, total: 4.47 s
Wall time: 4.47 s


In [8]:
# Preview the per-row match dictionaries stored by the previous step.
df['find_keywords'].head()

0                         {'distress': [(2068, 2075)]}
1    {'concerning': [(265, 274)], 'distress': [(243...
2                           {'outpatient': [(36, 45)]}
3                              {'concern': [(46, 52)]}
4                            {'unclear': [(566, 572)]}
Name: find_keywords, dtype: object

In [9]:
def _count_matches(matches):
    """Map each matched keyword to the length of its list of matches."""
    return {keyword: len(spans) for keyword, spans in matches.items()}


# Collapse every match dictionary to {keyword: number of occurrences}.
df['keyword_counts'] = df['find_keywords'].apply(_count_matches)

In [10]:
# Preview the per-row {keyword: count} dictionaries.
df.keyword_counts.head()

0                     {'distress': 1}
1    {'concerning': 1, 'distress': 1}
2                   {'outpatient': 1}
3                      {'concern': 1}
4                      {'unclear': 1}
Name: keyword_counts, dtype: object

In [11]:
# Turn the per-row count dictionaries into a wide table with one column per
# keyword. A keyword that is absent from a row comes out as NaN, which is
# zero-filled before the cast to int8 (int8 holds counts up to 127).
count_records = df['keyword_counts'].tolist()
df2 = (
    pd.DataFrame(count_records, index=df.index)
    .fillna(0)
    .astype(np.int8)
)
# Put the keyword columns in alphabetical order.
df2 = df2.loc[:, sorted(df2.columns)]

In [12]:
# Preview the wide keyword-count table (one column per keyword).
df2.head()
# Optional checks, kept disabled:
#df2.info()
#df2.max(axis = 0)

Unnamed: 0,advocate,agreeable,appropriate,appropriately,bad,better,catastrophic,challenging,comfortable,compensated,...,uneventful,unfortunate,unknown,unstable,well controlled,worrisome,worse,worsening,would not want,would want
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Keywords from each lexicon that have a column in df2.
# They are stored as sorted lists rather than sets: the printed order is then
# the same on every run, and a list is a valid DataFrame column indexer on all
# pandas versions (pandas 2.0+ raises TypeError for a set indexer).
cols_pos = sorted(set(df2.columns).intersection(kw['pos']))
cols_neg = sorted(set(df2.columns).intersection(kw['neg']))

print(f"Distinct positive keywords found ({len(cols_pos)}):", ", ".join(cols_pos), "\n")
print(f"Distinct negative keywords found ({len(cols_neg)}):",  ", ".join(cols_neg))

Distinct positive keywords found (37): well controlled, successful, resolution, unconcerned, would want, advocate, not concerned, low risk, good, outpatient, improving, straightforward, appropriately, great, stability, uneventful, resolved, reasonable, encouraged, compensated, encouraging, excellent, improved, optimized, treated, pleasant, stable, properly, comfortable, controlled, prefers, appropriate, improvement, better, routine, preferable, agreeable 

Distinct negative keywords found (47): bad, worrisome, inoperable, challenging, not improving, unfortunate, failed, unstable, no improvement, labile, concerning, frail, unable, decompensated, grave, poor, declining, difficult, distress, unclear, in distress, not controlled, worsening, does not want, uncontrolled, would not want, severe, instability, not treated, high risk, futile, unknown, catastrophic, grim, concerned, concern, despite, not resolved, not well controlled, poorly controlled, maximum, worse, refractory, not able, not a

In [14]:
#df2.idxmax()
#df2.loc[1039]

In [15]:
# Per-row totals of positive and of negative keyword occurrences.
# The column selectors are converted to sorted lists before indexing because
# pandas 2.0+ raises TypeError when a set is used as an indexer; the row sums
# do not depend on column order, so the totals are unchanged.
df['keyword_count_pos'] = df2[sorted(cols_pos)].sum(axis = 1)
df['keyword_count_neg'] = df2[sorted(cols_neg)].sum(axis = 1)

In [16]:
# Confirm that keyword_count_pos and keyword_count_neg were added to df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1493 entries, 0 to 1498
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               1493 non-null   object
 1   aspect             1382 non-null   object
 2   overall-sentiment  1493 non-null   object
 3   aspect1-sentiment  1381 non-null   object
 4   aspect2-sentiment  552 non-null    object
 5   aspect3-sentiment  205 non-null    object
 6   aspect4-sentiment  95 non-null     object
 7   aspect5-sentiment  45 non-null     object
 8   find_keywords      1493 non-null   object
 9   keyword_counts     1493 non-null   object
 10  keyword_count_pos  1493 non-null   int64 
 11  keyword_count_neg  1493 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 151.6+ KB


In [17]:
# Repeat of the previous cell's summary; df has not changed in between.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1493 entries, 0 to 1498
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               1493 non-null   object
 1   aspect             1382 non-null   object
 2   overall-sentiment  1493 non-null   object
 3   aspect1-sentiment  1381 non-null   object
 4   aspect2-sentiment  552 non-null    object
 5   aspect3-sentiment  205 non-null    object
 6   aspect4-sentiment  95 non-null     object
 7   aspect5-sentiment  45 non-null     object
 8   find_keywords      1493 non-null   object
 9   keyword_counts     1493 non-null   object
 10  keyword_count_pos  1493 non-null   int64 
 11  keyword_count_neg  1493 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 151.6+ KB


In [18]:
# Summary statistics of the per-row positive and negative keyword totals.
df[['keyword_count_pos', 'keyword_count_neg']].describe()

Unnamed: 0,keyword_count_pos,keyword_count_neg
count,1493.0,1493.0
mean,0.71065,0.620898
std,0.809213,0.833059
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,1.0,1.0
max,10.0,11.0


In [19]:
# Append the per-keyword count columns of df2 to the right of df.
# NOTE: df is overwritten with the result, so running this cell a second time
# appends the same columns again.
df = pd.concat(objs=[df, df2], axis="columns")

In [20]:
# Column summary after the per-keyword count columns were appended.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1493 entries, 0 to 1498
Data columns (total 96 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   text                 1493 non-null   object
 1   aspect               1382 non-null   object
 2   overall-sentiment    1493 non-null   object
 3   aspect1-sentiment    1381 non-null   object
 4   aspect2-sentiment    552 non-null    object
 5   aspect3-sentiment    205 non-null    object
 6   aspect4-sentiment    95 non-null     object
 7   aspect5-sentiment    45 non-null     object
 8   find_keywords        1493 non-null   object
 9   keyword_counts       1493 non-null   object
 10  keyword_count_pos    1493 non-null   int64 
 11  keyword_count_neg    1493 non-null   int64 
 12  advocate             1493 non-null   int8  
 13  agreeable            1493 non-null   int8  
 14  appropriate          1493 non-null   int8  
 15  appropriately        1493 non-null   int8  
 16  bad   

In [21]:
# Reference implementation extracted from MIMIC_003_keyword_labelling.ipynb.
# It is run here only so that its counts can be compared with the keyword
# counts computed above.

# Alias, not a copy: every column added below is written onto df itself.
all_patients = df


def _split_on_period(note):
    """Split a text on the literal '. ' separator."""
    return note.split('. ')


def _detected_keywords(sentences):
    """Return only the keyword list from find_keywords_final (default lexicon)."""
    return find_keywords_final(sentences)[1]


def _count_members(found, lexicon):
    """Count how many entries of `found` are present in `lexicon`."""
    return sum(1 for keyword in found if keyword in lexicon)


# Text split into pieces, one list per row.
all_patients['TEXT_SPLIT'] = all_patients['text'].map(_split_on_period)

# find_keywords_final receives the whole list and searches its str() form.
all_patients['FOUND_KEYWORDS'] = all_patients['TEXT_SPLIT'].map(_detected_keywords)

negative_keywords = [entry.strip().lower() for entry in neg_keywords]
positive_keywords = [entry.strip().lower() for entry in pos_keywords]

all_patients['NUM_POS_KEYWORDS'] = all_patients['FOUND_KEYWORDS'].map(
    lambda found: _count_members(found, positive_keywords)
)
all_patients['NUM_NEG_KEYWORDS'] = all_patients['FOUND_KEYWORDS'].map(
    lambda found: _count_members(found, negative_keywords)
)

all_patients['FOUND_STRING_KEYWORDS'] = all_patients['FOUND_KEYWORDS'].map(', '.join)

# CK: the original notebook went on to expand FOUND_STRING_KEYWORDS into one
# dummy column per keyword (str.get_dummies with sep=', '). That step is
# skipped here because the per-keyword counts already exist.
df = all_patients

## Stanza

In [22]:
# Set up Stanza: fetch the default English models (the recorded log shows the
# archive being reused when it already exists locally) and build a pipeline
# that runs only the tokenize and sentiment processors.
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-02-25 05:31:44 INFO: Downloading default packages for language: en (English)...
2022-02-25 05:31:45 INFO: File exists: /home/ck37/stanza_resources/en/default.zip.
2022-02-25 05:31:48 INFO: Finished downloading models and saved to /home/ck37/stanza_resources.
2022-02-25 05:31:48 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2022-02-25 05:31:48 INFO: Use device: gpu
2022-02-25 05:31:48 INFO: Loading: tokenize
2022-02-25 05:31:51 INFO: Loading: sentiment
2022-02-25 05:31:51 INFO: Done loading processors!


In [23]:
%%time
# The original note estimated 1-2 minutes; the recorded run took about 20 s
# of wall time (Stanza reported using the GPU).


def _mean_sentence_sentiment(text):
    """Average Stanza's per-sentence sentiment value over the sentences of `text`.

    Per the Stanza documentation the sentence value is 0, 1 or 2 for
    negative, neutral or positive.
    """
    doc = nlp(text)
    return np.mean([sentence.sentiment for sentence in doc.sentences])


# One averaged score per row of df, in row order.
overall_sent = [_mean_sentence_sentiment(text) for text in df.text.values]

CPU times: user 24.1 s, sys: 120 ms, total: 24.3 s
Wall time: 20.4 s


In [24]:
# First five averaged Stanza scores.
overall_sent[:5]

[0.0, 0.0, 2.0, 0.0, 0.6666666666666666]

In [25]:
# Store the averaged Stanza score of each text; overall_sent was built by
# iterating df.text in row order, so the list lines up with df's rows.
df['stanza_sent'] = overall_sent

## Pattern

In [26]:
%%time

from pattern.en import sentiment


def _pattern_first_score(text):
    """Return the first element of Pattern's sentiment() result for `text`.

    Presumably the polarity score (the second element being subjectivity);
    confirm against the Pattern documentation.
    """
    return sentiment(text)[0]


# One Pattern score per text.
sent_results = df['text'].apply(_pattern_first_score)

CPU times: user 2.22 s, sys: 76.2 ms, total: 2.29 s
Wall time: 2.39 s


In [27]:
# Store the Pattern score of each text; sent_results is a Series derived from
# df.text, so it aligns with df on the index.
df['pattern_sent'] = sent_results

In [28]:
# Summary statistics for every numeric column: the keyword totals, the
# per-keyword counts, NUM_POS/NUM_NEG_KEYWORDS, stanza_sent and pattern_sent.
df.describe()

Unnamed: 0,keyword_count_pos,keyword_count_neg,advocate,agreeable,appropriate,appropriately,bad,better,catastrophic,challenging,...,well controlled,worrisome,worse,worsening,would not want,would want,NUM_POS_KEYWORDS,NUM_NEG_KEYWORDS,stanza_sent,pattern_sent
count,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,...,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0
mean,0.71065,0.620898,0.00067,0.00067,0.022103,0.004019,0.00067,0.03349,0.00067,0.002009,...,0.016075,0.00067,0.010047,0.042867,0.004019,0.004019,0.673141,0.575352,0.33073,0.025301
std,0.809213,0.833059,0.02588,0.02588,0.147068,0.073115,0.02588,0.179972,0.02588,0.044796,...,0.125806,0.02588,0.099763,0.218538,0.063287,0.063287,0.681276,0.663473,0.638265,0.223917
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.7
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.053125
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.333333,0.069444
max,10.0,11.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,3.0,1.0,1.0,5.0,5.0,2.0,1.0


## Save results

In [29]:
# Write the complete frame (labels, keyword columns and both sentiment scores)
# to an Excel workbook. NOTE(review): this presumably requires the data/
# directory to exist already -- confirm on a fresh checkout.
df.to_excel("data/labels-python-predicted-sentiment.xlsx")