In [1]:
import dask.dataframe as dd
import dask
from dask.distributed import Client, TimeoutError, LocalCluster, progress
from dask.diagnostics import ProgressBar
import pandas as pd
import hvplot.dask
import hvplot.pandas
import re
from textblob import TextBlob
from langdetect import detect
from scipy.stats import f_oneway, ttest_ind
from bokeh.models.formatters import DatetimeTickFormatter
from random import randint

#code found on stack exchange
#makes a new client if one is not running, runs 6 workers
try:
    client = Client('tcp://localhost:8787', timeout='5s')
except OSError or TimeoutError:
    cluster = LocalCluster(scheduler_port=8787, n_workers=6, ip='localhost')
    client = Client(cluster)


client


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 6
Total threads: 12,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:8787,Workers: 6
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:63173,Total threads: 2
Dashboard: http://127.0.0.1:63179/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63159,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-fqplw3yg,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-fqplw3yg

0,1
Comm: tcp://127.0.0.1:63172,Total threads: 2
Dashboard: http://127.0.0.1:63175/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63160,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-1nphjvck,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-1nphjvck

0,1
Comm: tcp://127.0.0.1:63174,Total threads: 2
Dashboard: http://127.0.0.1:63186/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63161,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-1h56il6l,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-1h56il6l

0,1
Comm: tcp://127.0.0.1:63177,Total threads: 2
Dashboard: http://127.0.0.1:63182/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63162,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-m5t2q0lm,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-m5t2q0lm

0,1
Comm: tcp://127.0.0.1:63178,Total threads: 2
Dashboard: http://127.0.0.1:63181/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63163,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-zs1fjht3,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-zs1fjht3

0,1
Comm: tcp://127.0.0.1:63184,Total threads: 2
Dashboard: http://127.0.0.1:63188/status,Memory: 2.67 GiB
Nanny: tcp://127.0.0.1:63164,
Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-dsrsrluc,Local directory: /var/folders/3t/s31r6vgj44n_g5t4csrvklwh0000gn/T/dask-scratch-space/worker-dsrsrluc


In [2]:
# reference variables
keyWords = [
    'mental health', 
    'depression',
    'depressed', 
    'anxiety',
    'anxious', 
    'ADHD', 
    'attention deficit',
    'OCD',
    'obsessive-compulsive disorder',
    'PTSD',
    'posttraumatic stress disorder',
    'trauma',
    'bipolar',
    'autism',
    'schizophrenia',
    'eating disorder',
    'anorexia',
    'bulimia',
    'psychosis',
    'psychologist',
    'psychotherapist',
    'psychiatrist',
    'psychotherapy',
    'depressant'
]

# categories each key belongs to
keyCats = {
    'gen' : ['mental health', 'trauma', 'psychosis', 'psychologist', 'psychotherapist', 'psychiatrist', 'psychotherapy'],
    'dep' : ['depression', 'depressed', 'depressant'],
    'anx' : ['anxiety', 'anxious'],
    'ocd' : ['OCD', 'obsessive-compulsive disorder'],
    'adhd' : ['ADHD', 'attention deficit'],
    'ptsd' : ['PTSD', 'posttraumatic stress disorder'],
    'bip' : ['bipolar'],
    'asd' : ['autism'],
    'schiz' : ['schizophrenia'],
    'ed' : ['eating disorder', 'anorexia', 'bulimia']
}

# mapping of category abbreviated name to full name
abbrCats = {
    'gen' : 'General',
    'dep' : 'Depression',
    'anx' : 'Anxiety',
    'ocd' : 'OCD',
    'adhd' : 'ADHD',
    'ptsd' : 'PTSD',
    'bip' : 'Bipolar',
    'asd' : 'Autism',
    'schiz' : 'Schizophrenia',
    'ed' : 'Eating Disorder'
}

sentCols = ['subjectivity', 'polarity']

In [3]:
#contains counts of tweets for every hour
countdf = dd.read_csv(urlpath='mental_health_tweet_count.csv', sep='\t', lineterminator='\n')

#try reading in the preprocessed tweets
# if the csv does not exist, create it
try:
    # read the preprocessed csv
    tweetdf = dd.read_parquet(path='mental_health_tweets_preprocessed_en.parquet')
except:
    #contains tweets (and time of the tweet) that contain mental health key word
    #polled every 2 minutes
    tweetdf = dd.read_csv(urlpath='mental_health_tweets.csv', sep='\t', lineterminator='\n')
    
    #pre process
    # preprocess instructions found at https://youtu.be/ujId4ipkBio
    def preprocess(text):
        # remove mentions
        text = re.sub('@[a-zA-Z0-9]+', '', text)
        
        #  remove hashtag symbol
        text = re.sub('#', '', text)
        
        # remove retweet identifier
        text = re.sub('RT[\s]+', '', text)
        
        # remove links
        text = re.sub('https?:\/\/\S+', '', text)


        text = re.sub('[^a-zA-Z\\s]', '', text)

        text = text.strip()

        return text

    tweetdf['text'] = tweetdf['tweet'].map(preprocess)

    # remove any tweets that are either empty, or not in english
    tweetdf = tweetdf.loc[tweetdf['text'] != '']
    tweetdf['language'] = tweetdf['text'].map(lambda x: detect(x))
    tweetdf = tweetdf.loc[tweetdf['language'] == 'en']

    # drop original tweet and language columns
    tweetdf = tweetdf.drop(columns=['tweet', 'language'])

    #write pre processed text to parquet (dask recommends parquet)
    dd.to_parquet(tweetdf, path='mental_health_tweets_preprocessed_en.parquet')
    
    

FileNotFoundError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: [Errno 2] No such file or directory: '/Users/derek/Documents/Dater/TWMHML/Twitter-Mental-Health-Analysis/mental_health_tweet_count.csv'

In [None]:

#specify column types
tweetdf['time'] = dd.to_datetime(tweetdf['time'])
countdf['timestamp'] = dd.to_datetime(countdf['timestamp'])

#separate the components of the time for grouping
tweetdf['year'] = tweetdf['time'].dt.year
tweetdf['month'] = tweetdf['time'].dt.month
tweetdf['day'] = tweetdf['time'].dt.day
tweetdf['hour'] = tweetdf['time'].dt.hour
tweetdf['minute'] = tweetdf['time'].dt.minute

# only keep tweets from november 18th-30th
tweetdf = tweetdf.loc[tweetdf['month'] == 11]
tweetdf = tweetdf.loc[tweetdf['day'] > 17]

countdf['year'] = countdf['timestamp'].dt.year
countdf['month'] = countdf['timestamp'].dt.month
countdf['day'] = countdf['timestamp'].dt.day
countdf['hour'] = countdf['timestamp'].dt.hour

# only keep counts from november 18th-30th
countdf = countdf.loc[countdf['month'] == 11]
countdf = countdf.loc[countdf['day'] > 17]

In [None]:
#sort the dataframes based on ascening tme
countdf = countdf.sort_values(ascending=True, by=['month', 'day', 'hour'])
tweetdf = tweetdf.sort_values(ascending=True, by=['month', 'day', 'hour', 'minute'])

#make a display string for the x axis of the graph
countdf['ts_display'] = countdf['month'].astype('str') + '/' + countdf['day'].astype('str') + '\t' + countdf['hour'].astype('str') + ':00'

# count the total number of house in the dataframe
numTicks = len(tweetdf.groupby(['month', 'day', 'hour']).count().compute())

# divide by 8 in order to reduce the number of ticks that will be plotted
numTicks /= 8
numTicks = int(numTicks)

# format for the ticker timestamp
formatter = DatetimeTickFormatter(months='%m/%d %H:00', days='%m/%d %H:00', hours='%m/%d %H:00')


hvplot.save(countdf.hvplot(x='timestamp', y='count', title='Tweet Count', xlabel='date', ylabel='# tweets', xformatter=formatter).opts(xrotation=90, fontscale=0.75, xticks=numTicks),\
    'figures/tweets-over-time.png')

In [None]:
#determine key words in tweet
def containsKey(text, key):
    return text.str.contains(key)

# create a new column for each key, with values of whether or not
# the tweet in that row contains that key
for i in keyWords:
    tweetdf[i] = tweetdf['text'].map_partitions(containsKey, i)


In [None]:
# given key k and dataframe df, find the number of times k appears in the df
def countKeys(k, df):
    return df[k].sum()

#client submit and gather implementation modified from stackexchange
# create a list of futures that will run countKeys
futures = []

for k in keyWords:
    futures.append(client.submit(countKeys, k, tweetdf))

# gather the tasks
keyCount = client.gather(futures)

# compute the tasks
keyCount = dask.compute(*keyCount)

# create a data frame with columns key and count
# to create the data, the keyWords and the keyCounts are zipped together
keyCountdf = pd.DataFrame(columns=['key', 'count'], data=list(zip(keyWords, keyCount)))

keyCountdf = keyCountdf.sort_values(by='count', axis=0)

hvplot.save(keyCountdf.hvplot.bar(y='count', x='key', title='Key Count', xlabel='key', ylabel='# tweets').opts(fontscale=0.75, invert_axes=True),\
    'figures/individual-key-count-bar.png')

In [None]:
#count tweets each keyword appears in
#if a tweet contains multiple keywords, then they are added 
#to the result df as |word1|word2| etc.

def keysInTweet(df):
    keys = ''
    for k in keyWords:
        if df[k]:
            keys = keys + '|' + k
    keys = keys + '|'
    return keys

tweetdf['keys'] = tweetdf.apply(keysInTweet, axis=1, meta=('keys', 'object'))

# drop tweets with no keys
tweetdf = tweetdf.loc[tweetdf['keys'] != '|']

# convert the value counts to its own df
multiKeyCount = tweetdf['keys'].value_counts().to_frame()
# copy the key column (which actually holds counts), rename it and then drop the original
multiKeyCount['count'] = multiKeyCount['keys']
multiKeyCount = multiKeyCount.drop(labels='keys', axis=1)

multiKetCount = multiKeyCount.sort_values(by='count', ascending=False)

# only get the top 10 key groups, since there are too many to plot
multiKeyCount = multiKeyCount.head(10)
hvplot.save(multiKeyCount.head(10).hvplot.bar(y='count', use_index=True, title='Top 10 Topic Combinations', xlabel='topic combinations', ylabel='# tweets').opts(xrotation=90),\
    'figures/key-groups-count-bar.png')

In [None]:
# determine whether each tweet has keywords related to each category
def getCategories(row, keyList):
    for k in keyList:
        if row[k]:
            return True
        
    return False


# create a new column for each category
for c in keyCats.keys():
    tweetdf[c] = tweetdf.apply(getCategories, args=([keyCats[c]]), axis=1, meta=(c, 'boolean'))

# drop the original keyword columns
tweetdf = tweetdf.drop(columns=keyWords)


In [None]:
# count how many tweets each category has
def getCatCount(c, df):
    return df[c].sum()

#client submit and gather implementation modified from stackexchange
# similar to previous key group code, except it counts catgeories
futures = []
for c in keyCats.keys():
    futures.append(client.submit(getCatCount, c, tweetdf))

# gather the tasks
catCount = client.gather(futures)

catCount = dask.compute(*catCount)
catCountdf = pd.DataFrame(columns=['category', 'count'], data=list(zip(keyCats.keys(), catCount)))

catCountdf = catCountdf.sort_values(by='count', axis=0, ascending=False)

# convert the elements in category from their abbreviated name to the full name
catCountdf['category'] = catCountdf['category'].map(lambda x: abbrCats[x])

hvplot.save(catCountdf.hvplot.bar(x='category', y='count', xlabel='topic', ylabel='# tweets', title='Tweets per Topic').opts(xrotation=90),\
    'figures/topic-count-bar.png')


In [None]:
# get subjectivity of each tweet [0, 1]
def compSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# get polarity of tweet [-1, 1]
def compPolarity(text):
    return TextBlob(text).sentiment.polarity

# add subjectivity and polarity as new columns
tweetdf['subjectivity'] = tweetdf['text'].map(compSubjectivity)
tweetdf['polarity'] = tweetdf['text'].map(compPolarity)

# speeds up computation of mean and std of sub and pol for each cat
tweetdf.compute()
tweetdf = tweetdf.persist()

tweetdf

Unnamed: 0_level_0,time,text,year,month,day,hour,minute,keys,gen,dep,anx,ocd,adhd,ptsd,bip,asd,schiz,ed,subjectivity,polarity
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,datetime64[ns],object,int64,int64,int64,int64,int64,object,boolean,boolean,boolean,boolean,boolean,boolean,boolean,boolean,boolean,boolean,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
# can store results in a pandas dataframe since we know it will fit in memory
mlres = pd.DataFrame()

# for each category, make a series of sub and pol mean and std
# then add it to the output pandas df
for c in keyCats.keys():
    tmp = tweetdf.loc[tweetdf[c]]
    data = (tmp['subjectivity'].mean().compute(),\
            tmp['polarity'].mean().compute(),\
            tmp['subjectivity'].std().compute(),\
            tmp['polarity'].std().compute()
            )
    ser = pd.Series(data=data, index=['subjectivity mean', 'polarity mean', 'subjectivity std', 'polarity std'], name=c)
    mlres = pd.concat([mlres, ser], axis=1)

mlres

Unnamed: 0,gen,dep,anx,ocd,adhd,ptsd,bip,asd,schiz,ed
subjectivity mean,0.397486,0.418807,0.446457,0.40119,0.389963,0.394128,0.411973,0.381295,0.375805,0.404271
polarity mean,0.024546,0.058445,0.000866,0.042355,0.06695,0.046806,0.020703,0.051647,0.012839,0.024946
subjectivity std,0.22261,0.29291,0.342674,0.303571,0.283639,0.285857,0.30399,0.298897,0.295384,0.303036
polarity std,0.221095,0.283889,0.24524,0.259828,0.24697,0.250086,0.27694,0.250761,0.261636,0.278271


In [None]:
hvplot.save(mlres.drop(index=['subjectivity std', 'polarity std', 'polarity mean']).hvplot.bar(use_index=True, title='Subjectivity Means', xlabel='Topic', ylabel='Subjectivity', ylim=(0, 1)),\
    'figures/subjectivity-mean-bar.png')

In [None]:
hvplot.save(mlres.drop(index=['subjectivity std', 'polarity std', 'subjectivity mean']).hvplot.bar(use_index=True, title='Polarity Means', xlabel='Topic', ylabel='Polarity', ylim=(-1, 1)),\
    'figures/polarity-mean-bar.png')

In [None]:
# print random tweets that fall on each end and of the sentiment analysis
neg = list(tweetdf.loc[tweetdf['polarity'] == -1]['text'].sample(frac=0.01))[0]
neut = list(tweetdf.loc[tweetdf['polarity'] == 0]['text'].sample(frac=0.01))[0]
pos = list(tweetdf.loc[tweetdf['polarity'] == 1]['text'].sample(frac=0.01))[0]
subj = list(tweetdf.loc[tweetdf['subjectivity'] == 0]['text'].sample(frac=0.01))[0]
eqSO = list(tweetdf.loc[tweetdf['subjectivity'] == 0.5]['text'].sample(frac=0.01))[0]
obj = list(tweetdf.loc[tweetdf['subjectivity'] == 1]['text'].sample(frac=0.01))[0]

print(f"Polarity = -1:\n{neg}\n")
print(f"Polarity = 0:\n{neut}\n")
print(f"Polarity = 1:\n{pos}\n")
print(f"Subjectivity = 0:\n{subj}\n")
print(f"Subjectivity = 0.5:\n{eqSO}\n")
print(f"Subjectivity = 1:\n{obj}")

Polarity = -1:
andADHD This is one of the worst takes Ive ever heard

Polarity = 0:
Treating PTSD With Ecstasy I The FeedCheck on YouTube    Smoothie Expert

Polarity = 1:
As teachers we should do our best to minimize student anxiety in our classes TeachingCoyotes

Subjectivity = 0:
Thankfully weed repairs that trauma damage to the brain

Subjectivity = 0.5:
ppl who hate on others for now beginning to boycott bbc are so strangei get you love loona and me  but likewould u rather they left the abusive company theyre already in or stay and endure even more abuse and trauma

Subjectivity = 1:
Taking Xanax during pregnancy does NOT increase risk of autism in babies key study rules   LatestNews


In [None]:
# counts and prints the number of tweets that fall into each sentiment analysis category
numMoreObj = len(tweetdf.loc[tweetdf['subjectivity'] < 0.5].compute())
numMoreSubj = len(tweetdf.loc[tweetdf['subjectivity'] > 0.5].compute())
numEquallySubjObj = len(tweetdf.loc[tweetdf['subjectivity'] == 0].compute())

numPos = len(tweetdf.loc[tweetdf['polarity'] > 0].compute())
numNeg = len(tweetdf.loc[tweetdf['polarity'] < 0].compute())
numNeut = len(tweetdf.loc[tweetdf['polarity'] == 0].compute())

print(f"Objective leaning: {numMoreObj}")
print(f"Subjective leaning: {numMoreSubj}")
print(f"Equally subjective and objective: {numEquallySubjObj}")
print(f"Positive: {numPos}")
print(f"Negative: {numNeg}")
print(f"Neutral: {numNeut}")

Objective leaning: 71080
Subjective leaning: 47452
Equally subjective and objective: 23683
Positive: 51196
Negative: 43399
Neutral: 28535


In [None]:
# level of significance for stat tests
alpha = 0.05

sigOpts = ['significant', 'not significant']

# One way ANOVA on column
def owANOVA(col):
    catArrs = []
    for c in keyCats.keys():
        tmp = tweetdf.loc[tweetdf[c]]
        tmp = tmp[col]
        tmp = tmp.to_dask_array()
        catArrs.append(tmp)

    return f_oneway(*catArrs)

# run the OWANOVA on the subjectivity and polarity columns
owaSubj = owANOVA('subjectivity')
owaPol = owANOVA('polarity')

# determine whether the result of the ANOVA is significant
def owaIsSig(pval, alpha):
    if pval < alpha:
        return sigOpts[0]
    else:
        return sigOpts[1]

# sprint whether or not the tests are significant, as well as the p values
owaStr = f"Subjectivity: {owaIsSig(owaSubj.pvalue, alpha)}, {owaSubj[1]}\n"
owaStr = owaStr + f"Polarity: {owaIsSig(owaPol.pvalue, alpha)}, {owaPol[1]}"

print(owaStr)

Subjectivity: significant, 7.612887176705211e-161
Polarity: significant, 2.470843946377159e-226


In [None]:
# run a t test on columns a and b
def tTest(col, a, b):
    a = tweetdf.loc[tweetdf[a]]
    a = a[col]
    a = a.to_dask_array()
    b = tweetdf.loc[tweetdf[b]]
    b = b[col]
    b = b.to_dask_array()
    return ttest_ind(a=a, b=b, equal_var=False)

# compute t tests for all pairs of categories, return them as a list
def allTTests(col):
    out = []
    keys = list(keyCats.keys())
    for x in range(len(keys)):
        for y in range(x+1, len(keys)):
            out.append((keys[x], keys[y], tTest(col, keys[x], keys[y])))
    return out

# run all t tests for subjectivity and polarity
subjT = allTTests('subjectivity')
polT = allTTests('polarity')

# determine if the result is significant for each test in res
def statSigT(res, alpha):
    out = []
    for i in res:
        sig = i[2][1] < alpha
        isSig = (abbrCats[i[0]], abbrCats[i[1]], sig)
        out.append(isSig)
    return out

subjSig = statSigT(subjT, alpha)
polSig = statSigT(polT, alpha)

# divide the sub lists into two dictionary lists for significant or not
def sortRes(lst):
    out = {
        sigOpts[0] : [],
        sigOpts[1] : []
    }

    sig = None
    for i in lst:
        if i[2]:
            sig = sigOpts[0]
        else:
            sig = sigOpts[1]
        out[sig].append(f"{i[0]} and {i[1]}")
    return out

subjSig = sortRes(subjSig)
polSig = sortRes(polSig)

# creates a string to be printed for the statistics output
def sigStr(lst, title):
    outStr = '********************\n'
    outStr = outStr + f"{title.upper()}:\n"

    for i in sigOpts:
        outStr = outStr + '\n--------------------\n'
        outStr = outStr + i + ':'
        outStr = outStr + '\n--------------------\n'
        for j in lst[i]:
            outStr = outStr + j + '\n'

    outStr = outStr + '********************\n'
    return outStr

tStr = sigStr(subjSig, 'subjectivity') + sigStr(polSig, 'polarity')

print(tStr)

********************
SUBJECTIVITY:

--------------------
significant:
--------------------
General and Depression
General and Anxiety
General and Bipolar
General and Autism
General and Schizophrenia
Depression and Anxiety
Depression and ADHD
Depression and PTSD
Depression and Autism
Depression and Schizophrenia
Anxiety and OCD
Anxiety and ADHD
Anxiety and PTSD
Anxiety and Bipolar
Anxiety and Autism
Anxiety and Schizophrenia
Anxiety and Eating Disorder
OCD and Autism
ADHD and Bipolar
PTSD and Bipolar
PTSD and Autism
Bipolar and Autism
Bipolar and Schizophrenia
Autism and Eating Disorder
Schizophrenia and Eating Disorder

--------------------
not significant:
--------------------
General and OCD
General and ADHD
General and PTSD
General and Eating Disorder
Depression and OCD
Depression and Bipolar
Depression and Eating Disorder
OCD and ADHD
OCD and PTSD
OCD and Bipolar
OCD and Schizophrenia
OCD and Eating Disorder
ADHD and PTSD
ADHD and Autism
ADHD and Schizophrenia
ADHD and Eating Disor

In [None]:
# write the statistics result file
with open('figures/stat_results.txt', 'w') as filehandler:
    filehandler.write(f"One-Way ANOVA Results:\n{owaStr}\n\nT Tests Results:\n{tStr}")

# MACHINE LEARNING START

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

# reference
#https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

tmldf = tweetdf
max_words = 5000
max_len = 200

tknzr = Tokenizer(num_words=max_words)
