In [23]:
%matplotlib inline  
from pytrends.request import TrendReq
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from queue import Queue
import seaborn as sns
import pandas as pd
import numpy as np
import pytrends
import requests
import vincent
import random
import spacy
import json
import time
import nltk

In [24]:
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV

# The goal of this project is to build a predictive model of depression prevalence using Google Trends data

## build ground truth for depression

In [25]:
def build_groud_truth(flag="c", years=range(2011, 2017)):
    '''
    Load the yearly prevalence CSV exports and stack them into one frame.

    Parameters
    ----------
    flag : str
        Suffix selecting the data set. Files are read from
        ``../data/prevalence_<flag>/<year>_<flag>.csv`` (semicolon separated).
        The notebook calls this with "c" and "a".
    years : iterable of int
        Years to load, in order. Defaults to 2011-2016.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``Response`` is "Yes", restricted to the columns
        ``LocationDesc``, ``Data_Value`` and ``Year``.
    '''
    frames = [
        pd.read_csv("../data/prevalence_{0}/{1}_{0}.csv".format(flag, year), sep=";")
        for year in years
    ]
    truth = pd.concat(frames)
    truth = truth[truth["Response"] == "Yes"]
    if flag == "c":
        # The per-file row labels survive the concat, so this removes the rows
        # labelled 0 and 2 of every yearly file.
        # NOTE(review): presumably non-state aggregate rows - confirm against the CSVs.
        truth = truth.drop([0,2],axis = 0)
    truth = truth[["LocationDesc","Data_Value","Year"]]
    return truth

In [26]:
def extract_prevalence(truth):
    '''
    Collect the prevalence values of the locations kept in the study.

    Rows whose ``LocationDesc`` is Guam, Puerto Rico or Virgin Islands are
    skipped; the ``Data_Value`` of every other row is returned as a list,
    in the original row order.
    '''
    excluded = {"Guam", "Puerto Rico", "Virgin Islands"}
    places = truth["LocationDesc"].values
    values = truth["Data_Value"].values
    return [value for place, value in zip(places, values) if place not in excluded]

In [27]:
def build_index(truth):
    '''
    Alphabetically sorted list of the distinct ``LocationDesc`` values in
    ``truth``, leaving out Guam, Puerto Rico and Virgin Islands.
    '''
    excluded = {"Guam", "Puerto Rico", "Virgin Islands"}
    kept = {name for name in truth["LocationDesc"].values if name not in excluded}
    return sorted(kept)

In [28]:
# Study years covered by the ground truth (2011 through 2016 inclusive).
years = list(range(2011, 2017))

### crude and age adjusted prevalence

In [29]:
# Load both ground-truth tables: "a" = age-adjusted, "c" = crude prevalence.
truth_a=build_groud_truth("a")
truth_c=build_groud_truth("c")

In [30]:
# Target vectors: prevalence values with Guam, Puerto Rico and Virgin Islands removed.
y_a=extract_prevalence(truth_a)
y_c=extract_prevalence(truth_c)

In [31]:
# Sorted list of the locations kept; used later as the per-year row index of the feature matrix.
index=build_index(truth_c)

In [32]:
# Sanity check: number of age-adjusted prevalence values kept.
len(y_a)

306

In [33]:
# Sanity check: number of crude prevalence values kept (should match len(y_a)).
len(y_c)

306

In [34]:
def data_sample(truth, year):
    '''
    Extract the prevalence rows of a single year in the shape the map needs.

    Parameters
    ----------
    truth : pandas.DataFrame
        Frame with ``LocationDesc``, ``Data_Value`` and ``Year`` columns.
    year : int
        Value of ``Year`` to keep.

    Returns
    -------
    pandas.DataFrame
        Freshly indexed frame with the columns ``Prevalence`` (from
        ``Data_Value``) and ``NAME`` (from ``LocationDesc``).
    '''
    # Build the mask from the frame that was passed in, not from a notebook
    # global, so the function works for any truth table and on a fresh kernel.
    data = truth[truth["Year"] == year]
    data_map = pd.DataFrame(data["Data_Value"].values, columns=["Prevalence"], index=None)
    data_map["NAME"] = data["LocationDesc"].values
    return data_map

## draw a prevalence map

In [35]:
def draw_map(data_map, year, flag):
    '''
    Draw a US state map of depression prevalence for one year.

    Parameters
    ----------
    data_map : pandas.DataFrame
        Frame with a ``Prevalence`` column (bound to the colour scale) and a
        ``NAME`` column (matched against ``properties.NAME`` of the state
        geometry).
    year : int
        Year shown in the legend title.
    flag : str
        "c" labels the legend as crude prevalence; any other value labels it
        as age adjusted prevalence.

    Side effects: initialises vincent for the notebook, writes the chart
    specification to ``vega.json`` in the working directory (overwritten on
    every call) and displays the chart.
    '''
    vincent.core.initialize_notebook()
    state_topo = "https://raw.githubusercontent.com/wrobstory/vincent_map_data/master/us_states.topo.json"
    geo_data = [{'name': 'states',
                 'url': state_topo,
                 'feature': 'us_states.geo'}]

    vis = vincent.Map(data=data_map, geo_data=geo_data, scale=1000,
                      projection='albersUsa', data_bind='Prevalence', data_key='NAME',
                      map_key={'states': 'properties.NAME'}, brew='YlGnBu')
    #Custom threshold scale
    vis.scales[0].type='threshold'
    # The breakpoints are fixed rather than derived from the data, so every
    # call uses the same colour bins.
    vis.scales[0].domain = [0, 9, 12, 15, 18, 21, 24, 27]
    if flag=="c":
        title = "Crude prevalence depression " + str(year) + " (%)"
    else:
        title = "Age adjusted prevalence depression " + str(year) + " (%)"
    vis.legend(title=title)
    vis.to_json('vega.json')
    vis.display()

In [36]:
def routine_map(truth, year, flag="c"):
    '''Select the rows of ``year`` from ``truth`` and draw their prevalence map.'''
    draw_map(data_sample(truth, year), year, flag)

In [37]:
# Draw the crude-prevalence map (routine_map's default flag) for every study year.
for y in years:
    routine_map(truth_c, y)

In [14]:
class Prevalence:
    '''
    Download prevalence tables from the CDC BRFSS prevalence export endpoint.

    Attributes
    ----------
    ground_truth : list of str
        The two ``DataValueType`` values sent to the endpoint:
        age adjusted first, crude second.
    topics : dict
        Maps a disease name to ``[topic code, indicator code]`` and,
        optionally, a third entry listing the years to request.
    '''
    # Years requested for a topic whose entry does not list its own.
    default_years = [2011, 2012, 2013, 2014, 2015, 2016]

    def __init__(self):
        self.ground_truth = ["AgeAdjusted", "Crude"]
        self.topics = {"arthritis": ["05", "_DRDXAR1"], "depression": ["17", "ADDEPEV2", [2011, 2012, 2013, 2014, 2015, 2016]]}

    def retrieve_prevalence(self, disease="depression", p="c"):
        '''
        Fetch one table per year for ``disease`` and stack them.

        Parameters
        ----------
        disease : str
            Key of ``self.topics``; an unknown key raises ``KeyError``.
        p : str
            "c" requests crude values, anything else age adjusted values.

        Returns
        -------
        pandas.DataFrame
            Row-wise concatenation of the yearly frames returned by
            ``routine`` (an empty frame when there are no years).
        '''
        entry = self.topics[disease]
        topic, code = entry[0], entry[1]
        # Not every topic entry carries a list of years; fall back to the
        # class default instead of failing with IndexError.
        years = entry[2] if len(entry) > 2 else self.default_years
        value = self.ground_truth[1] if p == "c" else self.ground_truth[0]

        frames = []
        for year in years:
            print(year)
            query = (
                "https://nccd.cdc.gov/BRFSSPrevalence/rdPage.aspx?rdReport=DPH_BRFSS.ExportData&DataType=StatesAndMMSA&ClassCode=CLASS03&TopicCode=TOPIC"
                + str(topic) + "&StratTypeCode=CAT1&StratCode=&LocationCode=&IndicatorCode="
                + code + "&ResponseCode=RESP046&QueryType=Chart&YearStart="
                + str(year) + "&YearEnd=&DataValueType="
                + str(value) + "&ShowMMSA=false&rdReportFormat=CSV&rdExportTableID=dtExport&rdExportFilename=ExportCSV"
            )
            frames.append(self.routine(query))

        # Concatenate once at the end instead of growing a frame in the loop.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, axis=0)

    def routine(self, query):
        '''
        Download one CSV export and return its location/value/year columns.

        Raises ``requests.HTTPError`` when the server answers with an error
        status, so an error page is not parsed as if it were data.
        '''
        response = requests.get(query, timeout=60)
        response.raise_for_status()
        # Naive CSV parsing: split on commas, keep the first 28 fields of each
        # line and strip the surrounding double quotes.
        # NOTE(review): a quoted field containing a comma would be split in two.
        rows = [
            [cell.strip("\"") for cell in line.split(",")[:28]]
            for line in response.text.split("\r\n")
        ]
        # Line 0 is the header; lines 3..53 (at most 51 rows) are used as data.
        # NOTE(review): lines 1-2 are skipped - presumably aggregate rows; confirm.
        data_df = pd.DataFrame(rows[3:54], columns=rows[0])
        return data_df[["LocationDesc", "Data_Value", "Year"]]

In [None]:
# Downloader for the CDC BRFSS prevalence exports.
prevalence=Prevalence()

In [None]:
# With the method defaults this requests crude depression prevalence, one HTTP call per year.
data_df=prevalence.retrieve_prevalence()

## generate features with Google Trends

In [21]:
class Trends:
    '''
    Build the features matrix using Google Trends data.

    Parameters
    ----------
    parser : callable
        Called with a string; must yield tokens exposing ``text`` and
        ``pos_`` (the notebook passes a spaCy pipeline).
    index : sequence
        Row labels for one year of the matrix (the notebook passes the sorted
        state names). Its length is the number of regions expected from every
        Google Trends answer.
    features : iterable of str, optional
        Features to start from. Copied, so the caller's object is not modified.
    explored : iterable of str, optional
        Queries already sent. Copied as well.
    years : iterable of int, optional
        Years to build the matrix for. Defaults to 2011-2016.
    '''

    def __init__(self, parser, index, features=None, explored=None, years=None):
        # The defaults are None rather than []/set() literals: a mutable
        # default is created once and would be shared by every instance.
        self.index = index
        self.years = list(years) if years is not None else [2011, 2012, 2013, 2014, 2015, 2016]
        self.parser = parser
        self.features = list(features) if features is not None else []
        self.explored = set(explored) if explored is not None else set()
        self.matrix_features={}
        self.matrix_df=pd.DataFrame()

    def getFeatures(self):
        '''Return the current list of features.'''
        return self.features

    def getExplored(self):
        '''Return the set of queries already sent to Google Trends.'''
        return self.explored

    def getMatrix_df(self):
        '''Return the feature matrix (empty until ``build_matrix`` has run).'''
        return self.matrix_df

    def routine_features(self, text):
        '''
        Add every noun found in ``text`` to ``self.features``.

        ``text`` is a sequence of rows whose first element is a query string.
        The strings are joined with spaces, parsed, and the tokens tagged
        ``NOUN`` are merged into the de-duplicated feature list.
        '''
        joined = ''.join(row[0] + " " for row in text)
        nouns = [token.text for token in self.parser(joined) if token.pos_ == "NOUN"]
        self.features = list(set(self.features + nouns))

    def build_features(self, queries=("depression",), n=150):
        '''
        Generate a list of about ``n`` nouns related with the initial queries.

        Breadth-first expansion: each query is sent to Google Trends, the
        nouns of its related queries become features, and unexplored features
        are queued as further queries. Stops once ``n`` features are collected
        or the queue is empty; the initial queries are then added as features.
        Prints each query sent and sleeps 10 seconds between requests.
        '''
        queue = Queue()
        for q in queries:
            if q not in self.explored:
                queue.put(q)

        while (len(self.features) < n) and (not queue.empty()):
            q = queue.get()
            if q in self.explored:
                # The same term can be queued several times before it is
                # explored; do not spend another request (and sleep) on it.
                continue
            self.explored.add(q)
            print(q)
            pytrends = TrendReq(hl='en-US', tz=360)
            # NOTE(review): the original comment said "cat 46 health" while the
            # code passes cat=45 - confirm against the Google Trends category list.
            pytrends.build_payload([q], cat=45, timeframe='today 5-y', geo='US', gprop='')
            related_queries = pytrends.related_queries()

            for table in related_queries[q].values():
                if table is None:
                    # No related queries were returned for this key.
                    continue
                self.routine_features(table.values)
                for f in set(self.features).difference(self.explored):
                    queue.put(f)
            time.sleep(10)

        self.features = list(set(self.features + list(queries)))

    def build_matrix(self):
        '''
        Compute the google trend for every feature and every year.

        Fills ``self.matrix_df`` with one row per (year, region), one column
        per feature and a ``year`` column. A feature whose answer does not
        have exactly ``len(self.index)`` regions is filled with ``None`` for
        that year. Prints each year and sleeps 10 seconds after each one.
        '''
        self.matrix_features = {f: [] for f in self.features}
        self.matrix_features["year"] = []
        pytrends = TrendReq(hl='en-US', tz=360)
        n_regions = len(self.index)
        for year in self.years:
            print(year)
            # NOTE(review): the window ends on Dec 31 of year+1, i.e. it spans
            # two calendar years - confirm this is intended.
            timeframe = str(year) + "-01-01 " + str(year+1) + "-12-31"
            for f in self.features:
                pytrends.build_payload([f], cat=0, timeframe=timeframe, geo='US', gprop='')
                x = pytrends.interest_by_region(resolution='COUNTRY').values
                x = [i[0] for i in x]
                if len(x) == n_regions:
                    self.matrix_features[f].extend(x)
                else:
                    self.matrix_features[f].extend([None] * n_regions)
            self.matrix_features["year"].extend([year] * n_regions)
            time.sleep(10)

        # Row labels: the region index repeated once per year.
        index_all = list(self.index) * len(self.years)

        self.matrix_df = pd.DataFrame(self.matrix_features, index=index_all)

In [16]:
# Load the small English pipeline by package name; the 'en' shortcut link
# is no longer accepted by spaCy 3.
parser = spacy.load('en_core_web_sm')

In [17]:
# Seed search terms: the first row of the semicolon-separated ./queries.csv (read without a header).
queries=list(pd.read_csv("./queries.csv", sep=";", header=None).values[0])

# The choice of the terms was based on the idea that we should take into account not only directly associated terms (such as depression itself) but also some similar conditions (panic attacks), symptoms (anxiety), drugs (prozac). Moreover, to get not only positively correlated features we also chose some positive terms such as happiness, family, and friends.

In [18]:
# Show the seed terms.
queries

['depression',
 'anxiety',
 'panic',
 'friends',
 'family',
 'psychology',
 'prozac',
 'happiness']

In [22]:
# Feature builder: the parser extracts nouns, `index` gives the per-year row labels.
trends=Trends(parser, index)

### generate list of features

In [23]:
# Expand the seed terms into noun features via related Google Trends queries (prints each term queried).
trends.build_features(queries)

depression
anxiety
panic
friends
family
psychology
prozac


In [24]:
# Number of features collected.
len(trends.features)

162

### build features matrix

In [25]:
# Query Google Trends for every feature and year (prints each year as it is processed).
trends.build_matrix()

2011
2012
2013
2014
2015
2016


In [27]:
# Rows: one per (year, location); columns: one per feature plus "year".
trends.getMatrix_df().shape

(306, 163)

In [79]:
# Pull the results out of the builder. These are the builder's own objects, not copies,
# so later changes to matrix_df also change trends.matrix_df.
matrix_df=trends.getMatrix_df()
features=trends.getFeatures()
explored=trends.getExplored()

### add crude and adjusted prevalence to the matrix data

In [80]:
# Lists are assigned by position, not by label.
# NOTE(review): assumes the ground-truth rows are ordered like the matrix rows
# (year by year, locations in index order) - TODO confirm.
matrix_df["prevalence_a"] = y_a
matrix_df["prevalence_c"] = y_c

## process income, insurance, unemployment and position data

In [98]:
def process_data():
    '''
    Load the position, unemployment, insurance and income tables.

    All files are read from fixed paths below ``../data``.

    Returns
    -------
    tuple
        ``(insurance_df, income_df, unemployment_df, position_df)`` where

        * ``insurance_df`` has exactly the columns "2011" to "2016", with
          "2011" and "2012" set to None;
        * ``income_df`` holds the raw income values under the column names
          ``median_<year>`` / ``st_<year>`` for 2016 down to 2000;
        * the other two frames are returned as read.
    '''
    position_df = pd.read_csv("../data/positions.csv")
    unemployment_df = pd.read_csv("../data/unemployment/unemployment.csv")
    insurance_df = pd.read_csv("../data/insurance/insurance_clean.csv", index_col=0)
    income_raw = pd.read_csv("../data/income/income_clean.csv", index_col=0, header=None)

    # Give 2011 and 2012 an all-None column so every study year has one.
    for empty_year in ("2012", "2011"):
        insurance_df[empty_year] = None
    insurance_df = insurance_df[[str(year) for year in range(2011, 2017)]]

    # The income file has no header: name its columns pairwise, newest year first.
    income_columns = []
    for year in range(2016, 1999, -1):
        income_columns += ["median_" + str(year), "st_" + str(year)]

    income_df = pd.DataFrame(income_raw.values, index=income_raw.index, columns=income_columns)
    return insurance_df, income_df, unemployment_df, position_df

def update_matrix(insurance_df, income_df, unemployment_df, position_df, matrix_df, year_list=None):
    '''
    Update the matrix with census data, state indices and insurance rate.

    Parameters
    ----------
    insurance_df : pandas.DataFrame
        One column per year, named by the year as a string.
    income_df : pandas.DataFrame
        Must contain a ``median_<year>`` column for every year used.
    unemployment_df : pandas.DataFrame
        Must contain a ``Rate`` column with one value per matrix row.
    position_df : pandas.DataFrame
        Must contain ``Latitude`` and ``Longitude`` columns with one value
        per matrix row.
    matrix_df : pandas.DataFrame
        Feature matrix whose rows are the locations repeated year by year.
        It is modified in place.
    year_list : iterable of int, optional
        Years covered by the matrix, in row order. Defaults to the
        notebook-level ``years`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``matrix_df`` object, with the new columns added and every
        column holding more than 200 missing values removed.
    '''
    if year_list is None:
        year_list = years
    year_list = list(year_list)

    n_states = len(set(matrix_df.index))

    # Stack the per-year columns in year order so they line up with the rows.
    insurance = pd.concat([insurance_df[str(y)] for y in year_list], axis=0)
    income = pd.concat([income_df["median_" + str(y)] for y in year_list], axis=0)

    # update matrix
    matrix_df["states"] = list(range(n_states)) * len(year_list)
    matrix_df["income"] = income.values
    matrix_df["insurance"] = insurance.values
    # Read the frame that was passed in, not an undefined notebook global.
    matrix_df["unemployment"] = unemployment_df["Rate"].values
    matrix_df["latitude"] = position_df["Latitude"].values
    matrix_df["longitude"] = position_df["Longitude"].values

    # remove features with more than 200 nan
    too_sparse = list(matrix_df.columns[matrix_df.isnull().sum() > 200])
    matrix_df.drop(too_sparse, axis=1, inplace=True)

    return matrix_df

In [101]:
# Load the socio-economic tables, then attach them to the feature matrix.
insurance_df, income_df, unemployment_df, position_df = process_data()

# update_matrix modifies matrix_df in place and returns the same object.
matrix_df=update_matrix(insurance_df, income_df, unemployment_df, position_df, matrix_df)

In [102]:
# Shape after adding the extra columns and dropping columns with more than 200 missing values.
matrix_df.shape

(306, 163)

In [103]:
# Preview the first rows of the assembled matrix.
matrix_df.head()

Unnamed: 0,addiction,alcohol,america,animal,antidepressants,anxiety,app,ashwagandha,association,attack,...,years,zoloft,prevalence_a,prevalence_c,income,insurance,unemployment,states,latitude,longitude
Alabama,72,76,45,73,85,76,84,48,65,87,...,91,79,21.1,21.2,42590,,,0,32.806671,-86.79113
Alaska,74,93,65,83,68,71,75,69,94,95,...,82,53,16.5,16.5,57431,,,1,61.370716,-152.404419
Arizona,76,77,84,71,71,78,81,98,51,85,...,86,64,17.3,17.5,48621,,,2,33.729759,-111.431221
Arkansas,73,73,62,73,82,79,83,51,74,84,...,89,77,22.9,22.8,41302,,,3,34.969704,-92.373123
California,64,77,88,64,56,68,91,62,40,78,...,84,49,12.1,12.3,53367,,,4,36.116203,-119.681564
