In [1]:
import numpy as np
import pandas as pd
from numpy import sign
import ftfy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
import re

In [3]:
#np.random.seed() is used to generate same set of numbers before rand() function is called
#random numbers work by starting with a number (the seed), multiplying it by a large number, 
#then taking modulo of that product. The resulting number is then used as the seed to generate the next "random" number.
#When you set the seed (every time), it does the same thing every time, giving you the same numbers.

# Loading Data

In [4]:
np.random.seed(1234)

In [5]:
depression_df=pd.read_csv('depressive_tweets.csv',sep=',',header=None,usecols=range(0,10),nrows=3200)
#file should be in same path as ipynb file
#sep is by which fields are seperated.
#if the first row of the file can act as a header or not.
#if header=None then usecols is used to give column names.
#nrows is used to pick no of rows from file.

In [6]:
random_df=pd.read_csv('random_tweets.csv',sep=',',header=None,usecols=range(0,10),nrows=12000)

In [7]:
random_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.040752e+18,2018-09-15,05:29:59,India Standard Time,8.580064e+17,CaiyaJae,@tori_schleicher I’m weak 😂,1,0,2
1,1.040752e+18,2018-09-15,05:29:59,India Standard Time,4.177066e+09,RyanL1872,"Watch out @DundeeFC YT, @Burns1872 is ready 🇬🇧...",0,2,3
2,1.040752e+18,2018-09-15,05:29:59,India Standard Time,2.557082e+09,SirLitness,Me vs the world.,1,0,2
3,1.040752e+18,2018-09-15,05:29:59,India Standard Time,2.281425e+08,oneofthe5daves,@Twitter I heard everyone knows at least 5 Dav...,0,0,0
4,1.040752e+18,2018-09-15,05:29:59,India Standard Time,2.858842e+08,ThatWeissGuy,"Paranoia about the ""death of print"" has become...",1,2,12
5,1.040752e+18,2018-09-15,05:29:59,India Standard Time,1.037502e+18,QUEENbAE408,"@ChrisV1988 Haha babe, it supposed to look lik...",0,0,1
6,1.040752e+18,2018-09-15,05:29:59,India Standard Time,1.923669e+08,shannapope,@simplyn2deep I was fortunate enough to have a...,1,0,0
7,1.040752e+18,2018-09-15,05:29:59,India Standard Time,3.311392e+09,creditbook68,@AdamRubinMedia Such a shame those 2 never got...,0,0,0
8,1.040752e+18,2018-09-15,05:29:59,India Standard Time,4.776908e+09,moonIightwitch,@technoviscera you’re so..... Thank You,0,0,1
9,1.040752e+18,2018-09-15,05:29:59,India Standard Time,2.536611e+07,writerslink,Stormy Daniels to Publish a Tell-All Book Abou...,0,0,0


In [8]:
depression_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.045021e+18,2018-09-27,00:12:11,India Standard Time,1.430323e+07,15dozentimes,It's always kind of weird when the depression ...,1,0,0
1,1.045021e+18,2018-09-27,00:12:06,India Standard Time,7.655485e+17,taIizzle,i was severely depressed during the summer of ...,0,0,0
2,1.045021e+18,2018-09-27,00:12:02,India Standard Time,1.043997e+18,DragonQueenHalo,What it's like to be me: I wake up in a panic ...,0,0,0
3,1.045021e+18,2018-09-27,00:12:00,India Standard Time,4.027673e+08,jnnfrsbdy,"@aquariusguts bpd, anxiety and depression 👻",0,0,0
4,1.045021e+18,2018-09-27,00:11:48,India Standard Time,8.204317e+17,MeganeMea,Depression makes you forget you were trying to...,0,0,0
5,1.045021e+18,2018-09-27,00:11:48,India Standard Time,2.677009e+08,heartCHANGMIN,can we really survive depression???? https://...,0,0,0
6,1.045021e+18,2018-09-27,00:11:37,India Standard Time,1.794834e+07,lesliekelly,@shilohwalker Much love to you and your family...,0,0,0
7,1.045021e+18,2018-09-27,00:11:37,India Standard Time,2.989073e+09,laursergui,CUTTING MY HAIR WONT CURE MY DEPRESSION,0,0,0
8,1.045021e+18,2018-09-27,00:11:31,India Standard Time,1.034209e+18,caitmev,@HereForTheTea2 I knew that she felt bullied. ...,0,0,0
9,1.045021e+18,2018-09-27,00:11:29,India Standard Time,2.490582e+09,_flexinnnn,Not all mental health is depression jack ass ...,0,0,0


# Preprocessing of tweets

1. Removal of links, @, and hashtags and emojis from the tweets.
2. Corecting the encoding of the broken code using ftfy.
3. Expanding contracted text.
4. Removing of punctuations.
5. Removal of stopwords.
6. Stemming

This all contractions are taken from the given link https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions

In [9]:
clist= {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

In [14]:
c_re = re.compile('(%s)' % '|'.join(clist.keys()))
#'|'.join(clist.keys()) is used to join all list members returned by clist.keys() 
#%s is string formatter.

In [15]:
def expandContraction(text,c_re=c_re):
    def replace(match):
        return clist[match.group(0)]
    return c_re.sub(replace,text)
        

In [16]:
def cleanTweets(tweets):
    c_t=[]      #array that will hold all tweets after cleaning and will be returned
    #working on each tweet.
    for tweet in tweets:
        tweet=str(tweet)
        #if the tweets doesnt contain URLs
        if re.match("(\w+:\/\/\S+)", tweet) == None and len(tweet) > 10:
            #strings starting with https://
            #match function return a match object if the pattern is there in the stirng otherwise return None
            tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", tweet).split())
            #re.sub function is used to replace all occurences of a pattern in the given string
            #the property of hastags and tags that they are continuous after a @ or # sign is used to make RE.
        tweet=ftfy.fix_text(tweet)#fixing faulty encoded text
        tweet = expandContraction(tweet)
        tweet = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", tweet).split())#removing puctuations
        #removing stop words
        s_w=set(stopwords.words('english'))
        w_t=nltk.tokenize.word_tokenize(tweet)#creates a list of all words in tweet
        fil_sen=[w for w in w_t if not w in s_w]
        tweet=' '.join(fil_sen)
        #stemming
        tweet=PorterStemmer().stem(tweet)
        c_t.append(tweet)
        
    return c_t

In [17]:
dep_arr=[x for x in depression_df[6]]
ran_arr=[x for x in random_df[6]]
fin_dep=cleanTweets(dep_arr)
fin_ran=cleanTweets(ran_arr)


In [52]:
dep_arr

["It's always kind of weird when the depression manifests as actually feeling sad, rather than as a formless void",
 'i was severely depressed during the summer of 2017* that i seriously thought what i have isnt depression anymore it surpassed it that i’m the first holder of a new worse mental illness',
 "What it's like to be me: I wake up in a panic knowing all the things that have to get done today, already dreading the moment my feet touch the ground.   #MentalHealthIssues #Anxienty #Depression",
 '@aquariusguts bpd, anxiety and depression 👻',
 'Depression makes you forget you were trying to not gain weight',
 'can we really survive depression????  https://twitter.com/depressionarmy/status/1045015996321476608\xa0…',
 '@shilohwalker Much love to you and your family. Depression and mental illness are killers....I fight it every day. Your mind and heart need rest and peace. Books can wait.',
 'CUTTING MY HAIR WONT CURE MY DEPRESSION',
 '@HereForTheTea2 I knew that she felt bullied. Jus

In [53]:
fin_dep

['it always kind weird depression manifests actually feeling sad rather formless void',
 'severely depressed summer 2017 seriously thought isnt depression anymore surpassed first holder new worse mental il',
 'what like i wake panic knowing things get done today already dreading moment feet touch ground',
 'bpd anxiety depress',
 'depression makes forget trying gain weight',
 'really survive depression https twitter com depressionarmy status 1045015996321476608',
 'much love family depression mental illness killers i fight every day your mind heart need rest peace books wait',
 'cutting my hair wont cure my depress',
 'i knew felt bullied just think one tweet one video could one makes someone take life depression real there quest',
 'not mental health depression jack ass https twitter com jessicahardy 0 status 1044564980249563136',
 'i blind see read also i listened plenty literature depression know several people i is promotion book i stated depression illness choice https twitter com

In [54]:
fin_ran

['schleicher i weak',
 'watch yt readi',
 'me vs world',
 'i heard everyone knows least 5 dave david',
 'paranoia death print become self fulfilling prophecy anxious editors writers abandoned pretext sound judgement favor mainlining rat poison cut drain cleaner clickbait form',
 'haha babe supposed look like thank baby fiiinnnnneeee',
 'i fortunate enough amazing 50 something boss early 20 taught value older friends mentors she knew meant truly pay dues tried makes things easier cam',
 'such shame 2 never got raise champ flag i thought got jose 2nd go round paired coming together fab 5 sp setup nicely win 2 outa 3 maybe 3 outa 5 especially 5 fab 5 locked contractually awhil',
 'thank you',
 'stormy daniels publish tell all book about trump just before midterms https nym ag 2n5bwvr',
 'pretty lame tbh',
 'got a first english pap',
 'lucktastic could provide month free groceries https lucktastic com twsharefunnel',
 '20 00 00 temp 86 7 f heat index 93 7 f dew point 72 6 f rain today 0 00

# Tokenizing
