# Annotation agreement sample

This is run after 02_sentence-sentiment.ipynb.



## Setup

In [None]:
import os, re, time, sys
import pandas as pd, numpy as np
# Report the path of the Python interpreter this kernel is running under.
print("Python executable:", sys.executable)


## Import data

In [None]:
# Sentence-level data created in 02_sentence-sentiment.ipynb.
sent_df = pd.read_feather("data/mimic-sentences-sentiment.feather")

# DataFrame.info() prints its own summary and returns None, so call it directly;
# wrapping it in print() only adds a stray "None" line to the output.
sent_df.info()

In [None]:
# Keep only sentences with at least 22 characters and between 5 and 100
# words (both word bounds inclusive).
# Each comparison is built separately, then combined with element-wise `&`.
enough_chars = sent_df['chars'] >= 22
enough_words = sent_df['words'] >= 5
not_too_long = sent_df['words'] <= 100
sent_df = sent_df.loc[enough_chars & enough_words & not_too_long]

# Down to 23 MM sentences.
sent_df.shape

In [None]:
# Restrict to sentences with at least one keyword.
# Take an explicit copy: a 'sample' column is assigned onto kw_df later in the
# notebook, and adding a column to a frame obtained by filtering another frame
# can raise SettingWithCopyWarning.
kw_df = sent_df.loc[sent_df['keyword_count'] > 0].copy()

In [None]:
# 2.5 million sentences.
# info() prints its summary itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
kw_df.info()
kw_df.head()

In [None]:
# Show 20 randomly chosen keyword sentences (fixed seed) with their text,
# keywords and length columns.
preview_cols = ['text', 'keywords', 'words', 'chars']
kw_df[preview_cols].sample(n = 20, random_state = 1)

In [None]:
# Frequency of each keyword_count value among the keyword sentences.
kw_df['keyword_count'].value_counts()

In [None]:
# What is this really long one?
# Display the row(s) whose keyword_count is exactly 11.
has_eleven_keywords = kw_df['keyword_count'] == 11
kw_df[has_eleven_keywords]

## Sample 1

In [None]:
# Take a sample of 100
# The draw below is disabled by wrapping it in a string literal, so it does not
# execute; samp_df is instead read from data/annotation2-sample100.xlsx in a
# later cell.
"""
samp_df = kw_df.sample(100, random_state = 1)
print(samp_df.shape)
"""

In [None]:
# CK: disabled to avoid overwriting the sample that is already being used.
# The export is wrapped in a string literal, so nothing is written when this
# cell runs.
"""
samp_df.to_excel("data/annotation2-sample100.xlsx", index = False)
"""

In [None]:
# Read the previously exported sample 1 back from disk and summarise its columns.
sample1_path = "data/annotation2-sample100.xlsx"
samp_df = pd.read_excel(sample1_path)
samp_df.info()

In [None]:
# Note which obs were in sample1.
# 'sample' starts out as an empty string on every row; rows are labelled with
# the sample name as each sample is tagged.
kw_df['sample'] = ''

# Build the lookup used to flag sample 1 rows: indexed by the joining columns
# (row_id, sent_num) and carrying a single all-True 'sample1' column.
# It is built in one chained expression so that no in-place set_index or
# column assignment is performed on a slice of samp_df, which raises
# SettingWithCopyWarning.
join_keys = ['row_id', 'sent_num']
join_df = samp_df[join_keys].set_index(join_keys).assign(sample1 = True)

In [None]:
# Left-join the sample 1 flag onto the keyword sentences; rows that are not in
# the sample get NaN in 'sample1'.
kw_df2 = kw_df.join(join_df, on = ['row_id', 'sent_num'], how = 'left')
# Only the last expression of a cell is displayed, so intermediate counts are
# printed explicitly.
print(kw_df2['sample1'].value_counts())

# Update the main sample column to track the sample 1 rows.
# NaN == True evaluates to False, so unmatched rows keep their current label.
kw_df2.loc[kw_df2['sample1'].values == True, 'sample'] = 'sample1'

print(kw_df2['sample'].value_counts())

# Confirm that shapes are still good: the row counts should match, and kw_df2
# has the additional 'sample1' column.
print(kw_df2.shape)
print(kw_df.shape)

## Sample 2 (2022-03-13)

In [None]:
# Draw 100 rows (fixed seed) from the rows whose 'sample' label is still empty.
unlabelled = kw_df2['sample'].eq('')
samp_df2 = kw_df2[unlabelled].sample(n = 100, random_state = 1)
print(samp_df2.shape)

In [None]:
# CK: disabled to avoid overwriting the sample that is already being used.
# Both exports (Excel and tab-separated) are wrapped in a string literal, so
# nothing is written when this cell runs.
"""
samp_df2.to_excel("data/annotation2-sample100-v2.xlsx", index = False)
samp_df2.to_csv("data/annotation2-sample100-v2.tsv", sep = '\t', index = False)
"""

In [None]:
# Load from file then tag in df.
samp_df2 = pd.read_excel("data/annotation2-sample100-v2.xlsx")
samp_df2.info()

# Build the lookup used to flag sample 2 rows: indexed by the joining columns
# with a single all-True 'sample2' column. It is built in one chained
# expression so nothing is assigned in place onto a slice of samp_df2
# (which raises SettingWithCopyWarning).
join_keys = ['row_id', 'sent_num']
join_df = samp_df2[join_keys].set_index(join_keys).assign(sample2 = True)

# Left join: rows not in sample 2 get NaN in 'sample2'.
kw_df3 = kw_df2.join(join_df, on = join_keys, how = 'left')
print(kw_df3['sample2'].value_counts())

# Update the main sample column to track the sample 2 rows.
# NaN == True evaluates to False, so unmatched rows keep their current label.
kw_df3.loc[kw_df3['sample2'].values == True, 'sample'] = 'sample2'

print(kw_df3['sample'].value_counts())

# Confirm that shapes are still good: the row counts should match, and kw_df3
# has the additional 'sample2' column.
print(kw_df3.shape)
print(kw_df2.shape)

# Save df to disk for later usage.
# The index is reset (and dropped) before writing to feather.
kw_df3.reset_index(drop = True).to_feather("data/annotation2-post-sample2.feather")

## Sample 3 (2022-05-13)

In [None]:
# Load post-sample 2 file.
post_sample2_path = "data/annotation2-post-sample2.feather"
kw_df3 = pd.read_feather(post_sample2_path)
# Review value counts.
kw_df3['sample'].value_counts()

In [None]:
# Draw 100 rows (fixed seed) from the rows whose 'sample' label is still empty.
not_yet_sampled = kw_df3['sample'].eq('')
samp_df3 = kw_df3[not_yet_sampled].sample(n = 100, random_state = 1)
print(samp_df3.shape)

In [None]:
# CK: disabled to avoid overwriting the sample that is already being used.
# Both exports are wrapped in a string literal, so nothing is written when this
# cell runs. The variable name in the second line was corrected from `sam_df3`
# to `samp_df3` so the code works if it is ever re-enabled.
"""
samp_df3.to_excel("data/annotation2-sample100-v3.xlsx", index = False)
samp_df3.to_csv("data/annotation2-sample100-v3.tsv", sep = '\t', index = False)
"""

In [None]:
# Load from file then tag in df.
samp_df3 = pd.read_excel("data/annotation2-sample100-v3.xlsx")
samp_df3.info()

# Build the lookup used to flag sample 3 rows: indexed by the joining columns
# with a single all-True 'sample3' column. It is built in one chained
# expression so nothing is assigned in place onto a slice of samp_df3
# (which raises SettingWithCopyWarning).
join_keys = ['row_id', 'sent_num']
join_df = samp_df3[join_keys].set_index(join_keys).assign(sample3 = True)

# Left join: rows not in sample 3 get NaN in 'sample3'.
kw_df4 = kw_df3.join(join_df, on = join_keys, how = 'left')
print(kw_df4['sample3'].value_counts())

# Update the main sample column to track the sample 3 rows.
# NaN == True evaluates to False, so unmatched rows keep their current label.
kw_df4.loc[kw_df4['sample3'].values == True, 'sample'] = 'sample3'

print(kw_df4['sample'].value_counts())

# Confirm that shapes are still good: the row counts should match, and kw_df4
# has the additional 'sample3' column.
print(kw_df4.shape)
print(kw_df3.shape)

# Save df to disk for later usage.
# Currently disabled (wrapped in a string literal). The Sample 4 section reads
# data/annotation2-post-sample3.feather, so that file must already exist.
"""
kw_df4.reset_index(drop = True).to_feather("data/annotation2-post-sample3.feather")
"""

## Sample 4 (n = 750; 2022-07-07)

In [None]:
# Load post-sample 3 file.
post_sample3_path = "data/annotation2-post-sample3.feather"
kw_df4 = pd.read_feather(post_sample3_path)
# Review value counts.
kw_df4['sample'].value_counts()

We will create the sample in 4 steps:

1. Score all excerpts with keyword sentiment.
2. Among excerpts with keyword sentiment, create 4 bins over the keyword sentiment score (`pd.cut` with 4 bins, i.e. equal-width intervals).
3. Among excerpts with keyword sentiment, select 150 random excerpts per bin (4 bins, 600 excerpts in total).
4. Among excerpts without keyword sentiment, select 150 random excerpts

In [None]:
%%time

# Score keyword sentiment
from clinsent import KeywordFinder

kwf = KeywordFinder()
text = 'bp is improving, but o2 worsening'
hits, score = kwf.run(text)
print(score)

# This takes an hour+ with a single core.
scores = []
for excerpt in kw_df4.text.values:
    # First result is hits
    _, score = kwf.run(excerpt)
    scores.append(score)


In [None]:
# Store the per-excerpt scores as a new column, then print summary statistics.
kw_df4['kw_score'] = scores
kw_score_summary = kw_df4['kw_score'].describe()
print(kw_score_summary)

In [None]:
# Add back on sentences without any keywords.
# These are the rows of sent_df with keyword_count == 0; they have no kw_score,
# so that column is NaN for them after the concat.
kw_df_no_kws = sent_df.loc[sent_df['keyword_count'] == 0]

# ignore_index gives the combined frame a fresh 0..n-1 index. The two inputs
# would otherwise contribute overlapping index labels, leaving duplicate labels
# in the result.
kw_df4_v2 = pd.concat([kw_df4, kw_df_no_kws], ignore_index = True)

In [None]:
# Indicator column: 1 where kw_score is present, 0 where it is missing.
score_present = kw_df4_v2['kw_score'].notnull()
kw_df4_v2['any_kw_score'] = score_present.astype(int)
# 2.5 MM excerpts with keywords.
kw_df4_v2['any_kw_score'].value_counts()

In [None]:
# Bin the keyword score into 4 intervals.
# See also https://pandas.pydata.org/docs/reference/api/pandas.qcut.html
# NOTE(review): despite the column name, pd.cut with an integer bin count
# produces equal-width intervals over the score range, not quantile-based
# bins (that would be pd.qcut) - confirm which was intended.
# kw_df4 is the same object as kw_df4_v2 from here on, not a copy.
kw_df4 = kw_df4_v2
score_bins = pd.cut(kw_df4['kw_score'], 4)
kw_df4['kw_score_quartiles'] = score_bins
kw_df4['kw_score_quartiles'].value_counts()

In [None]:
# Stratify by keyword sentiment score bin: among rows that have a keyword
# score, draw 150 rows (fixed seed) from each kw_score_quartiles group.
# NOTE(review): neither draw here filters on the 'sample' column, so rows
# already labelled by an earlier sample are not excluded - confirm intended.
scored_rows = kw_df4.loc[kw_df4['any_kw_score'].eq(1)]
samp_df4_kws = scored_rows.groupby('kw_score_quartiles').sample(n = 150, random_state = 1)
print(samp_df4_kws.shape)

# Separately draw 150 rows from those without a keyword score.
unscored_rows = kw_df4.loc[kw_df4['any_kw_score'].eq(0)]
samp_df4_nokws = unscored_rows.sample(n = 150, random_state = 1)

# Combine the two samples
sample4_parts = [samp_df4_kws, samp_df4_nokws]
samp_df4 = pd.concat(sample4_parts)
print(samp_df4.shape)

In [None]:
# Randomise the row order with a fixed seed; sampling the full fraction
# returns every row exactly once, in shuffled order.
samp_df4 = samp_df4.sample(frac = 1.0, random_state = 2)

In [None]:
# Export sample 4 (Excel and tab-separated).
# Unlike the earlier export cells, this one was not actually disabled, so
# re-running the notebook would overwrite a sample that is already in use.
# The writes are therefore guarded: they only happen when the Excel file,
# which is the one read back in the next cell, does not exist yet.
sample4_xlsx = "data/annotation2-sample750-v4.xlsx"
sample4_tsv = "data/annotation2-sample750-v4.tsv"
if os.path.exists(sample4_xlsx):
    print("Sample 4 export already exists; not overwriting:", sample4_xlsx)
else:
    samp_df4.to_excel(sample4_xlsx, index = False)
    samp_df4.to_csv(sample4_tsv, sep = '\t', index = False)

In [None]:
# Load from file then tag in df.
samp_df4 = pd.read_excel("data/annotation2-sample750-v4.xlsx")
samp_df4.info()

# Build the lookup used to flag sample 4 rows: indexed by the joining columns
# with a single all-True 'sample4' column. It is built in one chained
# expression so nothing is assigned in place onto a slice of samp_df4
# (which raises SettingWithCopyWarning).
join_keys = ['row_id', 'sent_num']
join_df = samp_df4[join_keys].set_index(join_keys).assign(sample4 = True)

# Left join: rows not in sample 4 get NaN in 'sample4'.
kw_df5 = kw_df4.join(join_df, on = join_keys, how = 'left')
print(kw_df5['sample4'].value_counts())

# Update the main sample column to track the sample 4 rows.
# The mask must be read from kw_df5: 'sample4' only exists on the joined
# frame, so looking it up on kw_df4 raises KeyError.
# NaN == True evaluates to False, so unmatched rows keep their current label.
kw_df5.loc[kw_df5['sample4'].values == True, 'sample'] = 'sample4'

print(kw_df5['sample'].value_counts())

# Confirm that shapes are still good: the row counts should match, and kw_df5
# has the additional 'sample4' column.
print(kw_df5.shape)
print(kw_df4.shape)

# Save df to disk for later usage.
# The index is reset (and dropped) before writing to feather.
kw_df5.reset_index(drop = True).to_feather("data/annotation2-post-sample4.feather")