# Vectorize Outcomes

For now, we vectorize only the `outcome` field; the other fields are left for later.

In [10]:
# Load pico_cdsr.csv and keep only rows that have both an abstract and an outcome.
df = pd.read_csv('pico_cdsr.csv')
df = df[df.abstract.notnull() & df.outcome.notnull()].reset_index(drop=True)
# Mini training set: the first 20 rows. `.ix` is deprecated (and removed in
# pandas 1.0); because the index was just reset to 0..n-1, the positional
# slice `.iloc[:20]` selects exactly the rows the label slice `.ix[:19]` did.
df = df.iloc[:20]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
title           20 non-null object
pmid            20 non-null int64
cdno            20 non-null object
population      20 non-null object
intervention    20 non-null object
outcome         20 non-null object
method          20 non-null object
type            20 non-null object
abstract        20 non-null object
dtypes: int64(1), object(8)
memory usage: 1.5+ KB


### Sample Outcomes

In [11]:
def sample_outcomes(k=10, outcomes=None):
    """Print `k` randomly chosen outcome strings, each under a row of asterisks.

    Parameters
    ----------
    k : int
        Number of outcomes to draw. The draw uses `np.random.choice` with its
        default settings, so the same outcome may be printed more than once.
    outcomes : sequence of str, optional
        Strings to sample from. Defaults to the notebook-level `df.outcome`.
    """
    if outcomes is None:
        outcomes = df.outcome
    separator = '*' * 100
    for outcome in np.random.choice(outcomes, size=k):
        # Parenthesised single-argument prints behave the same under the
        # Python 2 print statement and the Python 3 print function.
        print('')
        print(separator)
        print('')
        print(outcome)
        
# Show a random handful of raw outcome strings before any preprocessing.
sample_outcomes()


****************************************************************************************************

Mean (SD) number of wet nights per week: A:3.3 (1.9), B:4.7 (1.7), C:3.3 (2.5) Number attaining cure: A:3, B:1, C:5 7 out of 8 children who were cured relapsed. The exception was treated with amitriptyline and DDAVP Follow up: mean (SD) number of wet nights per week: A:(n=10) 3.9 (2.9), B:(n=5) 3.8 (1.9), C:(n=8) 5.1 (3.2) Side effects: none reported Most parents said all the drugs were easy to use

****************************************************************************************************

Number not achieving 14 dry nights after 6 months: A: 26/61; B: 31/60; C: 38/61 All children improved psychologically, e.g. behaviour and self concept, regardless of outcome or treatment assignment Side effects: not mentioned

****************************************************************************************************

Mean % (SD) number of wet nights during combined periods: A:30.

### Preprocess Outcomes

- Tokenize into words with nltk and lowercase
- Remove tokens that contain a digit
- Remove English stopwords
- Strip punctuation characters

In [12]:
import string

from nltk import word_tokenize
from nltk.corpus import stopwords

# Rebinds the name `stopwords` from the imported nltk corpus reader to a plain
# set of English stopwords, which is what the membership test below uses.
stopwords = set(stopwords.words('english'))


def _clean_outcome(text):
    """Normalise one outcome string into space-separated word tokens.

    Steps, in order: nltk tokenisation and lowercasing, dropping tokens that
    contain a digit, dropping English stopwords, deleting punctuation
    characters, and discarding tokens left empty by that deletion.
    """
    cleaned = []
    for token in ' '.join(word_tokenize(text)).lower().split():  # basic tokenization
        if any(char.isdigit() for char in token):
            continue  # numbers
        if token in stopwords:
            continue
        # delete punctuation
        stripped = ''.join(char for char in token if char not in string.punctuation)
        if stripped:  # a token made only of punctuation leaves nothing behind
            cleaned.append(stripped)
    return ' '.join(cleaned)


df.outcome = df.outcome.map(_clean_outcome)

len(df)

20

### Narrow Down Output Space to the K Most Common Words

In [13]:
from vectorizer import Vectorizer

# Fit a vectorizer on the cleaned outcomes, then read the per-word counts
# from `vectorizer.tok.word_counts`.
vectorizer = Vectorizer()
vectorizer.fit(df.outcome)

# One-column frame of counts, indexed by word.
dff = pd.DataFrame({word: [count] for word, count in vectorizer.tok.word_counts.items()}).T
dff.columns = ['counts']

K = 20
# K = 5000
ranked_counts = dff.counts.sort_values(ascending=False)
target_words = set(ranked_counts.iloc[:K].index)


def _keep_target_words(text):
    """Drop every whitespace-separated token of `text` that is not in `target_words`."""
    return ' '.join(word for word in text.split() if word in target_words)


df.outcome = df.outcome.map(_keep_target_words)

sample_outcomes()


****************************************************************************************************

mean sd number wet nights dry follow side effects none reported

****************************************************************************************************

mean sd number wet nights number dry number wet nights follow children dry side effects none reported

****************************************************************************************************

wet nights trial mean sd b c number achieving dry nights b c number b wet nights trial mean sd b c side effects b c treatment

****************************************************************************************************

mean sd number wet nights dry follow side effects none reported

****************************************************************************************************

mean sd number wet nights per week number children follow mean sd number wet nights per week b c side effects none reported

****

### Convert Outcomes to Bag-of-Words (BoW)

In [14]:
from vectorizer import Vectorizer

# Re-fit a fresh vectorizer on the outcomes, which by now contain only the
# K target words.
vectorizer = Vectorizer()

vectorizer.fit(df.outcome)
# NOTE(review): the return value is discarded; presumably this call stores the
# bag-of-words representation on the vectorizer itself — confirm in vectorizer.py.
vectorizer.texts_to_BoW(df.outcome)

# Pickle data is binary: open the file in 'wb' (text mode fails on Python 3)
# and use a context manager so the handle is flushed and closed.
with open('outcomes-mini.p', 'wb') as handle:
    pickle.dump(vectorizer, handle)