In [90]:
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, TimeoutError, LocalCluster
import pandas as pd
import hvplot.dask
import re


#code found on stack exchange
#makes a new client if one is not running, runs 6 workers
try:
    client = Client('tcp://localhost:8787', timeout='2s')
except OSError or TimeoutError:
    cluster = LocalCluster(scheduler_port=8787, n_workers=4, ip='localhost')
    client = Client(cluster)
#

client


0,1
Connection method: Direct,
Dashboard: http://localhost:8787/status,

0,1
Comm: tcp://127.0.0.1:8787,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: 13 minutes ago,Total memory: 15.95 GiB

0,1
Comm: tcp://127.0.0.1:57283,Total threads: 2
Dashboard: http://127.0.0.1:57284/status,Memory: 3.99 GiB
Nanny: tcp://127.0.0.1:57253,
Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-10arivvv,Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-10arivvv
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 3.2%,Last seen: Just now
Memory usage: 153.37 MiB,Spilled bytes: 0 B
Read bytes: 4.32 kiB,Write bytes: 5.72 kiB

0,1
Comm: tcp://127.0.0.1:57286,Total threads: 2
Dashboard: http://127.0.0.1:57287/status,Memory: 3.99 GiB
Nanny: tcp://127.0.0.1:57255,
Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-5ghykyun,Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-5ghykyun
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 140.17 MiB,Spilled bytes: 0 B
Read bytes: 13.50 kiB,Write bytes: 5.92 kiB

0,1
Comm: tcp://127.0.0.1:57289,Total threads: 2
Dashboard: http://127.0.0.1:57290/status,Memory: 3.99 GiB
Nanny: tcp://127.0.0.1:57254,
Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-w2l3mj55,Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-w2l3mj55
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 182.71 MiB,Spilled bytes: 0 B
Read bytes: 14.54 kiB,Write bytes: 6.52 kiB

0,1
Comm: tcp://127.0.0.1:57280,Total threads: 2
Dashboard: http://127.0.0.1:57281/status,Memory: 3.99 GiB
Nanny: tcp://127.0.0.1:57252,
Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-0n8e9xka,Local directory: C:\Users\derek\AppData\Local\Temp\dask-worker-space\worker-0n8e9xka
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 144.07 MiB,Spilled bytes: 0 B
Read bytes: 3.96 kiB,Write bytes: 5.48 kiB


In [91]:
keyWords = [
    'mental health', 
    'depression',
    'depressed', 
    'anxiety',
    'anxious', 
    'ADHD', 
    'attention deficit',
    'OCD',
    'obsessive-compulsive disorder',
    'PTSD',
    'posttraumatic stress disorder',
    'trauma',
    'bipolar',
    'autism',
    'schizophrenia',
    'eating disorder',
    'anorexia',
    'bulimia',
    'psychosis',
    'psychologist',
    'psychotherapist',
    'psychiatrist',
    'psychotherapy',
    'depressant'
]

In [92]:
#read and print the dataframes

#contains tweets (and time of the tweet) that contain mental health key word
#polled ever 1 minute
tweetdf = dd.read_csv(urlpath='mental_health_tweets.csv', sep='\t' )

#contains counts of tweets for every hour
countdf = dd.read_csv(urlpath='mental_health_tweet_count.csv', sep='\t')

print(tweetdf)

print(countdf)

Dask DataFrame Structure:
                tweet    time
npartitions=1                
               object  object
                  ...     ...
Dask Name: read-csv, 1 tasks
Dask DataFrame Structure:
              timestamp  count
npartitions=1                 
                 object  int64
                    ...    ...
Dask Name: read-csv, 1 tasks


In [93]:

#specify column types
tweetdf['time'] = dd.to_datetime(tweetdf['time'])
countdf['timestamp'] = dd.to_datetime(countdf['timestamp'])

#separate the components of the time for grouping
tweetdf['year'] = tweetdf['time'].dt.year
tweetdf['month'] = tweetdf['time'].dt.month
tweetdf['day'] = tweetdf['time'].dt.day
tweetdf['hour'] = tweetdf['time'].dt.hour
tweetdf['minute'] = tweetdf['time'].dt.minute

countdf['year'] = countdf['timestamp'].dt.year
countdf['month'] = countdf['timestamp'].dt.month
countdf['day'] = countdf['timestamp'].dt.day
countdf['hour'] = countdf['timestamp'].dt.hour

In [94]:
#sort the dataframes based on ascening tme
countdf = countdf.sort_values(ascending=True, by=['month', 'day', 'hour'])
tweetdf = tweetdf.sort_values(ascending=True, by=['month', 'day', 'hour', 'minute'])

#make a display string for the x axis of the graph
countdf['ts_display'] = countdf['month'].astype('str') + '/' + countdf['day'].astype('str') + '\t' + countdf['hour'].astype('str') + ':00'

#TODO MAKE THE GRAPH LOOK BETTER
countdf.hvplot(x='ts_display', y='count').opts(xrotation=90, fontscale=0.5)


In [95]:
#pre process

def preprocess(text):
    text = text[1]
    text = text.lower()
    return re.sub(string=text, pattern='[^a-z ]', repl='')

tweetdf['text'] = tweetdf['tweet'].map_partitions(preprocess)

In [96]:
#determine key words in tweet
def containsKey(text, key):
    return text.str.contains(key)

for i in keyWords:
    tweetdf[i] = tweetdf['tweet'].map_partitions(containsKey, i)


In [97]:
#count how many tweets each key appeared in
keyCount = pd.DataFrame(columns=keyWords)
keyCount = dd.from_pandas(keyCount, npartitions=1)

for i in keyWords:
    keyCount[i] = tweetdf[i].value_counts()

keyCount.compute()

# MAYBE remove?
# keyCount.persist()

Unnamed: 0,mental health,depression,depressed,anxiety,anxious,ADHD,attention deficit,OCD,obsessive-compulsive disorder,PTSD,...,schizophrenia,eating disorder,anorexia,bulimia,psychosis,psychologist,psychotherapist,psychiatrist,psychotherapy,depressant
False,41984,44764,53697,51902,53810,53539,53904,53820,53903,51491,...,53495,53486,53486,53738,53275,52997,53849,52967,53763,53749
True,11921,9141,208,2003,95,366,1,85,2,2414,...,410,419,419,167,630,908,56,938,142,156


In [98]:
kcBagArr = []

for i in keyWords:
    kcBagArr.append(keyCount[i])

kcBagArr

#CONTINUE WORKING ON DASK BAG TO SHIFT DF


[Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: mental health, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: depression, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: depressed, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: anxiety, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: anxious, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: ADHD, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: attention deficit, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Series Structure:
 npartitions=1
     int64
       ...
 Name: OCD, dtype: int64
 Dask Name: getitem, 193 tasks,
 Dask Seri