In [34]:
import re

import numpy as np
import pandas as pd

# Load the tab-separated keyword export; row 0 of the file supplies the column names.
data = pd.read_csv("healthy.txt", sep="\t", header=0)

In [35]:
# Preview the first ten raw rows to check the parsed columns.
data.head(n=10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [36]:
# Integer code assigned to each search-intent label found in the raw Intent_1 column.
INTENT_CODES = {'Generic': 0, 'Direct Answer': 1, 'Guide': 2,
                'Navigational': 3, 'Resource': 4, 'Transactional': 5}

# Change Intent_1 to numerical values (labels missing from INTENT_CODES become NaN).
# Guarded on the column dtype so that re-running this cell does not re-map the
# already-numeric codes: none of them are keys of INTENT_CODES, so a second
# .map() would turn every value into NaN and the dropna() below would then
# discard every row.
if data['Intent_1'].dtype.kind not in 'iuf':
    data['Intent_1'] = data['Intent_1'].map(INTENT_CODES)

# Keep only the two columns used downstream, then drop null rows and duplicate
# keywords. Selecting by name (instead of dropping the other columns) avoids
# spelling out the BOM-prefixed CS_ID column, whose exact name depends on how
# the file was decoded.
health_data = (
    data[['Keyword', 'Intent_1']]
    .dropna()
    .drop_duplicates(subset='Keyword')
)

# Re-indexed copy (0..n-1) used for the summary and preview below.
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()
health_data_reindex.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,Keyword,Intent_1
0,signs of gallbladder problems,1
1,normal blood pressure chart,4
2,what is normal bloodpressure,1
3,what is hyperlipidemia,0
4,pill identifier,0
5,blood clots in leg symptoms,1
6,blood pressure range chart,4
7,ask webmd question,3
8,photos of bed bug bites,4
9,lyme disease and symptoms,1


In [37]:
# Raw keyword strings, taken before the cleaned version replaces the column.
kws = health_data.loc[:, 'Keyword']

# Words removed from every keyword. Built once here rather than on each call,
# since clean_keywords is applied to every row of the keyword column.
_KEYWORD_STOPWORDS = frozenset(
    'of in to for with on at from by about as into like through after over '
    'between out against during without before under around among '
    'i me you he she it they them my mine yours his hers theirs '
    'a an the that these those and but nor yet so'.split()
)

# Matches any single character that is not an ASCII letter.
_NON_LETTER = re.compile("[^a-zA-Z]")


def clean_keywords(keyword):
    """Strip non-letter characters and stopwords from one keyword string.

    Every character outside a-z / A-Z (digits, punctuation, accented letters)
    is replaced by a space, the result is split on whitespace, and tokens
    found in the stopword list are discarded.

    Parameters
    ----------
    keyword : str
        The raw search keyword.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.

    Notes
    -----
    Stopword matching is case-sensitive and the stopword list is lowercase,
    so a capitalised token such as "Of" is kept. Input case is not changed.
    """
    letters_only = _NON_LETTER.sub(" ", keyword)
    meaningful_words = [token for token in letters_only.split()
                        if token not in _KEYWORD_STOPWORDS]
    return " ".join(meaningful_words)

# Replace the raw keywords with their cleaned form, then preview the result.
cleaned_kws = kws.apply(clean_keywords)
health_data['Keyword'] = cleaned_kws
health_data['Keyword'].head(10)

0      signs gallbladder problems
1     normal blood pressure chart
2    what is normal bloodpressure
3          what is hyperlipidemia
4                 pill identifier
5        blood clots leg symptoms
6      blood pressure range chart
7              ask webmd question
8            photos bed bug bites
9           lyme disease symptoms
Name: Keyword, dtype: object

In [38]:
def get_intent_kw_counts(intent, top_n=20):
    """Return the most frequent keyword tokens for one intent class.

    Reads the notebook-level ``health_data`` frame, keeps the rows whose
    ``Intent_1`` equals ``intent``, and counts token occurrences across their
    ``Keyword`` strings with a default-configured ``CountVectorizer``.

    Parameters
    ----------
    intent : number
        Intent code compared against ``health_data['Intent_1']``.
    top_n : int, optional
        Maximum number of rows to return. Defaults to 20, the size the
        function originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``count``, sorted by ``count`` descending and
        truncated to ``top_n`` rows.
    """
    # NOTE(review): CountVectorizer is not imported in any visible cell; it is
    # presumably sklearn.feature_extraction.text.CountVectorizer -- confirm and
    # add the import to the notebook's import cell.
    # NOTE(review): an intent with no matching rows is not handled here; the
    # vectorizer is expected to raise on an empty selection -- confirm.
    intent_kws = health_data.loc[health_data['Intent_1'] == intent, 'Keyword']

    vect = CountVectorizer()
    intent_kw_counts = vect.fit_transform(intent_kws)

    # Newer scikit-learn releases expose get_feature_names_out() and no longer
    # provide get_feature_names(); use whichever this installation has.
    feature_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    intent_kw_features = feature_names()

    # Sum each token column directly on the sparse matrix instead of first
    # converting the full document-term matrix to a dense array.
    token_totals = np.asarray(intent_kw_counts.sum(axis=0)).ravel()

    intent_kw_token_counts = pd.DataFrame(
        {'token': intent_kw_features, 'count': token_totals}
    ).sort_values(by='count', ascending=False)
    return intent_kw_token_counts.head(top_n)

#get_intent_kw_counts(4)
#health_data[health_data['Intent_1']==4]

In [39]:
# Number of keywords per intent code, most frequent first.
health_data.Intent_1.value_counts()

0    42953
1    36731
2     9008
4     7064
5     1226
3     1158
Name: Intent_1, dtype: int64