In [1]:
import tarfile,sys
import pandas as pd
import os
import re as regex
import numpy as np
import random

## Read from CSV file containing features

In [2]:
traindata = pd.read_csv('train')
traindata

Unnamed: 0.1,Unnamed: 0,class,user_name,id,tweet,avgsentiment,favorite_count,firstperson,geo,possibly_sensitive,retweet_count,retweeted,tsugawa,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis
0,0,depression,k1Tgty92JWM,6195046850825297023,"RT @FpJgkXU6: My cat is sad because, having be...",0.112638,0,2670,,0.0,249,False,0,2,0,0,0
1,0,control,f8Da_Rvi,8522289883066978092,Running Boston for Children's Hospital again. ...,0.118159,0,142,,0.0,1,False,0,0,0,0,0
2,0,depression,hehQgSVVRpv,322688550056773303,My mom is just swell like that. :)####In other...,0.047492,0,2562,,,0,False,0,1,0,0,0
3,0,control,qPQRyK39gnG,2184602040112857282,Ain't nothing like starting your morning with ...,0.099925,0,2592,,,0,False,0,1,1,0,0
4,0,depression,j_1H5RgR2_6,2834216959061162834,RT @GAndbcDpitkbVU: #ThePinkPrintCoverTomorrow...,0.211423,0,2891,,,10,False,0,2,0,0,0
5,0,control,vlwXqvDcXHd8vR,2835019316888866733,RT @hdoQYECq: Dzhokhar Tsarnaev landed the cov...,0.138308,0,496,,,959,False,0,1,0,0,0
6,0,depression,oTz15zT,3959835890448092091,Today #AtTheMovies the clips come from my daug...,0.27874,0,1375,,,0,False,0,0,1,0,0
7,0,control,i_y79Pl4VIi7k,5666978709914902022,The Cross Keys Endell St Dutty Dutty London ht...,0.078549,0,453,,0.0,0,False,0,0,0,0,0
8,0,depression,xGzqP93lsEjiaG,1578519607921696117,"AND TEEEELLLLL ME WE BELONG TOGETHERRRE####""No...",0.066448,0,2058,,,0,False,0,6,0,0,0
9,0,control,gHgnIT,4042618482248831958,@lUnvYAewx3J Thanks haha! http://t.co/ilF1VFy4...,0.017524,3,2145,,0.0,2,False,0,0,1,0,0


In [3]:
data_test = pd.read_csv('testing_random')
print(data_test.shape)

(30000, 4)


## Separate each tweet in its own row

In [4]:
b = pd.DataFrame(traindata['tweet'].str.split('####').tolist(), index=[traindata['user_name'], traindata['class']]).stack()
b = b.reset_index()[[0, 'user_name','class']] # var1 variable is currently labeled 0
b.columns = ['tweet', 'user_name','class'] # renaming var1
final_train_data = b

## Clean Up data

In [5]:
def remove_by_regex(tweets, regexp):
    tweets.loc[:, "tweet"].replace(regexp, "", inplace=True)
    return tweets

def remove_urls(tweets):
    return remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

def remove_na(tweets):
    return tweets[tweets["tweet"] != "Not Available"]

def remove_special_chars(tweets):  # it unrolls the hashtags to normal words
    for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                 "@", "%", "^", "*", "(", ")", "{", "}",
                                                                 "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                 "!", "?", ".", "'",
                                                                 "--", "---", "#"]):
        tweets.loc[:, "tweet"].replace(remove, "", inplace=True)
    return tweets

def remove_usernames(tweets):
    return remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

def remove_numbers(tweets):
    return remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [6]:
data = final_train_data
data = remove_urls(data)
data = remove_na(data)
data = remove_usernames(data)
data = remove_special_chars(data)
data = remove_numbers(data)

## Remove empty tweets or tweets with one word

In [7]:
x = data.query('tweet != ""')
x.shape
data=x
data = data[data.tweet.str.split(' ').str.len() > 1]
data=data.reset_index(drop=True)

In [8]:
data['class'].value_counts()

control       42519
depression    33446
ptsd          10450
Name: class, dtype: int64

## Batching Function 

In [10]:
# Batch Iterator
import torch
def prepare_data(data):
  # pad data
    #maxlen = max(map(len, data))
    #maxlen=max(maxlen,10)
    maxlen = 100
    data = [ex + [0] * (maxlen-len(ex)) for ex in data]
    data = np.array(data)[:,0:20]

  # wrap in tensor
    return torch.LongTensor(data)


def prepare_labels(labels):
    try:
        return torch.LongTensor(labels)
    except:
        return labels


def batch_iterator(dataset, batch_size, forever=False):
    dataset_size = len(dataset)
    order = None
    nbatches = dataset_size // batch_size

    def init_order():
        return random.sample(range(dataset_size), dataset_size)

    def get_batch(start, end):
        batch = [dataset.iloc[ii] for ii in order[start:end]]
        data = prepare_data([ex.tweet for ex in batch])
        labels = prepare_labels([ex['class'] for ex in batch])
        return data, labels

    order = init_order()

    while True:
        for i in range(nbatches):
            start = i*batch_size
            end = (i+1)*batch_size
            yield get_batch(start, end)

        if nbatches*batch_size < dataset_size:
            yield get_batch(nbatches*batch_size, dataset_size)

        if not forever:
            break

        order = init_order()


## Utility Functions for Model

In [11]:
def copy_state(state):
    if isinstance(state, tuple):
    	return (Variable(state[0].data), Variable(state[1].data))
    else:
    	return Variable(state.data) 

def batchify(data, bsz):
    nbatch = len(data) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data        


## Build a vocabulary of words and construct the embedding matrix from pre-trained glove vectors 

In [12]:
PAD_TOKEN = '_PAD_'
UNK_TOKEN = '_UNK_'
def build_vocab(datasets):
    vocab = dict()
    vocab[PAD_TOKEN] = len(vocab)
    vocab[UNK_TOKEN] = len(vocab)
    for data in datasets:
        for tweet in data['tweet']:
            tokens = tweet.split(' ')
            for word in tokens:
                if word not in vocab:
                    vocab[word] = len(vocab)

    print('Vocab size: {}'.format(len(vocab)))

    return vocab

class TokenConverter(object):
    def __init__(self, vocab):
        self.vocab = vocab
        self.unknown = 0

    def convert(self, token):
        if token in self.vocab:
            id = self.vocab.get(token.lower())
        else:
            id = self.vocab.get(UNK_TOKEN)
            self.unknown += 1
        return id


def load_embeddings(path, vocab, cache=False, cache_path=None):
    rows = []
    new_vocab = [UNK_TOKEN]

    if cache_path is None:
        cache_path = path + '.cache'

  # Use cache file if it exists.
    if os.path.exists(cache_path):
        path = cache_path

    print("Reading embeddings from {}".format(path))

  # first pass over the embeddings to vocab and relevant rows
    with open(path) as f:
        for line in f:
            word, row = line.split(' ', 1)
            if word == UNK_TOKEN:
                raise ValueError('The unk token should not exist w.in embeddings.')
            if word in vocab:
                #print(word)
                rows.append(line)
                new_vocab.append(word)

  # optionally save relevant rows to cache file.
    if cache and not os.path.exists(cache_path):
        with open(cache_path, 'w') as f:
            for line in rows:
                f.write(line)
            print("Cached embeddings to {}".format(cache_path))

  # turn vocab list into a dictionary
    new_vocab = {w: i for i, w in enumerate(new_vocab)}

    print('New vocab size: {}'.format(len(new_vocab)))

    assert len(rows) == len(new_vocab) - 1

  # create embeddings matrix
    embeddings = np.zeros((len(new_vocab), 25), dtype=np.float32)
    for i, line in enumerate(rows):
        embeddings[i+1] = list(map(float, line.strip().split(' ')[1:]))

    return new_vocab, embeddings


vocab = build_vocab([data])
vocab, embeddings = load_embeddings('glove.twitter.27B.25d.txt', vocab, cache=True)

Vocab size: 74993
Reading embeddings from glove.twitter.27B.25d.txt.cache
New vocab size: 25660


## Convert each word to its respective index in vocab

In [13]:
def convert2ids(id_data,vocab):
    converter = TokenConverter(vocab)
    x1 = id_data.tweet.str.lower()
    x1 = x1.str.split(' ')
    x1 = x1.apply(lambda x: list(map(converter.convert, x)))
    id_data['tweet'] = x1
    print('Found {} unknown tokens.'.format(converter.unknown))
    return id_data
id_data = data.copy(deep=True)
convert2ids(id_data, vocab)

Found 98471 unknown tokens.


Unnamed: 0,tweet,user_name,class
0,"[1, 11, 1117, 13, 572, 193, 431, 180, 10064, 1...",k1Tgty92JWM,depression
1,"[309, 14, 1478, 68, 220, 494]",k1Tgty92JWM,depression
2,"[82, 194, 129, 82, 2782, 128, 23, 4, 343, 856,...",k1Tgty92JWM,depression
3,"[616, 37, 3, 87, 264, 726, 397, 6, 14504, 806,...",k1Tgty92JWM,depression
4,"[95, 1416, 106, 156, 65, 2138, 2209, 209, 4, 3...",k1Tgty92JWM,depression
5,"[55, 42, 4, 6127, 19, 0, 5400, 159, 401, 107, ...",k1Tgty92JWM,depression
6,"[57, 273, 1152, 565, 10, 94, 34, 260, 631]",k1Tgty92JWM,depression
7,"[125, 129, 82, 263, 42, 153, 220, 90, 42, 5, 4...",k1Tgty92JWM,depression
8,"[125, 347, 14, 560, 4223, 9714, 237, 157, 3114...",k1Tgty92JWM,depression
9,"[1, 5, 71, 155, 5372, 36, 555, 45, 5, 71, 8409...",k1Tgty92JWM,depression


## Define an RNN model with hidden_states = 100, outputsize = len(vocab), GRU layer 2, dropout

In [14]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import time
class Net(nn.Module):
    def __init__(self, features, cls_size):
        super(Net, self).__init__()
        self.hidden_size = 100
        self.embedding = nn.Embedding(features, self.hidden_size)
        self.rnn1 = nn.GRU(input_size=features,
                            hidden_size=self.hidden_size,num_layers=2, batch_first=True, dropout=1)
        self.dense1 = nn.Linear(self.hidden_size, cls_size)
        #self.softmax = nn.LogSoftmax(dim=1)
        

    def forward(self, x, hidden):
        word = self.embedding.weight[x]
        emb = word.view((len(word),1,25))
        output, hidden = self.rnn1(emb, hidden)
        output = self.dense1(output)
        #output = self.softmax(self.softmax(output[0]))
        return output, hidden

    def init_hidden(self, batch_size, pretrained_word_vectors):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        weight = next(self.parameters()).data
        return Variable(weight.new(2, batch_size, self.hidden_size).zero_())


## Train the model

In [15]:
from tqdm import tqdm_notebook

In [16]:
def var(x):
    x = Variable(x)
    return x
    
model = Net(features=25, cls_size=len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
hiddens = []
y=[]
nbatches= len(id_data) // 1000
print ("Total training progresss:")



def train():
    pbar1 = tqdm_notebook(total=nbatches )
    model.train()
    batch_size = 1000
    hidden_init = model.init_hidden(batch_size,embeddings)
    loss_avg = 0
    s=0
    for x, labels in batch_iterator(id_data, batch_size, forever=False):
        s+=1
        bsize=len(labels)
        if bsize==batch_size: 
            model.zero_grad()
            hidden = hidden_init
            loss =0
            start = time.time()
            #print ('Batch '+str(s)+ ' progress: ')
            #pbar2= tqdm_notebook(total=19)
            for i in range(19):
                batch  = x[:,i]
                output, hidden = model(batch, var(hidden.data))
                output = output.view(bsize,output.shape[2])
                s1 = Variable(x[:,i+1])
                loss += criterion(output, s1)
            tempHidden=hidden.view(bsize,200)
            tempHidden=list(tempHidden.data.numpy())
            hiddens.append(tempHidden)
                #pbar2.update(1)
            y.append(labels)
            loss.backward()
            hidden_init = copy_state(hidden)
            optimizer.step()
            loss_avg = .99*loss_avg + .01*loss.data[0]/bsize
            pbar1.update(1)
            #print(epoch, loss_avg,  "Time in sec: " + str(time.time()-start))
            #print ()
            #pbar2.close()
    pbar1.close()
for epoch in range(10):
    print ('Epoch '+str(epoch))
    del hiddens[:]
    del y[:]
    train()

Total training progresss:
Epoch 0



Epoch 1



Epoch 2



Epoch 3



Epoch 4



Epoch 5



Epoch 6



Epoch 7



Epoch 8



Epoch 9





In [17]:
hiddens=np.array(hiddens)
hiddens.shape

(86, 1000, 200)

In [20]:
hiddens=np.reshape(hiddens, (86000, 200))
hiddens.shape

(86000, 200)

In [21]:
y=np.array(y)
y.shape

(86, 1000)

In [22]:
y=np.reshape(y, (86000, 1))

In [23]:
y.shape

(86000, 1)

## Encode Labels

In [27]:
#hiddens=np.squeeze(hiddens)

In [28]:
from sklearn.preprocessing import LabelEncoder
crisisEncoder = LabelEncoder()
y = crisisEncoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


## Logistic Regression via Cross Fold Validation

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
log = cross_val_score(LogisticRegression(), hiddens, y,cv=5)
print("Logisitic regression average accuracy : ",log.mean())

Logisitic regression average accuracy :  0.490639587589


## Naive Bayes via CV

In [30]:
# from sklearn.naive_bayes import GaussianNB
# nb = cross_val_score(GaussianNB(), hiddens, y,cv=5)
# print("Naive bayes average accuracy : ",nb.mean())

## Random Forest Via CV

In [31]:
from sklearn.ensemble import RandomForestClassifier
rf = cross_val_score(RandomForestClassifier(), hiddens, y,cv=5)
print("Random forest average accuracy : ",rf.mean())

Random forest average accuracy :  0.420487334915


## Convert to ids

In [33]:
id_data_test = data_test.copy(deep=True)
convert2ids(id_data_test, vocab)

Found 44441 unknown tokens.


Unnamed: 0.1,Unnamed: 0,tweet,user_name,class
0,0,"[4126, 0, 2397]",nm0LlMOxGaNT7,control
1,1,"[2276, 0, 37, 355, 464, 88, 14, 14655, 6, 237,...",yeXlVonSZgEPCc,control
2,2,"[6796, 56, 111, 832, 0, 1099, 1128, 0, 244, 11...",o3CoXuCrL1nQ,control
3,3,"[2, 40, 11, 678, 258, 2137, 6227]",uDJvT6i6ES6Tm93,control
4,4,"[0, 1517, 93, 35, 2233, 523, 578, 23, 202, 123...",kMWzLd,control
5,5,"[257, 0, 90, 358, 71, 27, 32, 0, 229, 31, 358,...",cnB_zqr_6,control
6,6,"[1, 263, 5, 208, 18, 4266, 0, 56, 1240]",jMFRQPg_eaHq,control
7,7,"[125, 31, 931, 49, 692, 55, 2, 93, 0]",jJOsHAge061B6ku,control
8,8,"[446, 2212, 188, 2467, 3956, 10, 5777, 2638, 1...",9n0aRlnk5n3Rr,control
9,9,"[11, 2519, 81, 391]",rSaU1hkUY,control


## Logistic Regresion Train

In [34]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(hiddens, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Run on test set

In [37]:
hiddens_test=[]
y_test = []
nbatches= len(id_data_test) // 1000


def test():
    pbar1 = tqdm_notebook(total=nbatches )
    model.eval()
    batch_size = 1000
    hidden_init = model.init_hidden(batch_size,embeddings)
    loss_avg = 0
    s=0
    for x, labels in batch_iterator(id_data_test, batch_size, forever=False):
        s+=1
        bsize=len(labels)
        if bsize==batch_size: 
            hidden = hidden_init
            loss =0
            start = time.time()
            #print ('Batch '+str(s)+ ' progress: ')
            #pbar2= tqdm_notebook(total=19)
            for i in range(19):
                batch  = x[:,i]
                output, hidden = model(batch, var(hidden.data))
                output = output.view(bsize,output.shape[2])
                s1 = Variable(x[:,i+1])
                loss += criterion(output, s1)
            tempHidden=hidden.view(bsize,200)
            tempHidden=list(tempHidden.data.numpy())
            hiddens_test.append(tempHidden)
                #pbar2.update(1)
            y_test.append(labels)
            hidden_init = copy_state(hidden)

            loss_avg = .99*loss_avg + .01*loss.data[0]/bsize
            pbar1.update(1)
            #print(epoch, loss_avg,  "Time in sec: " + str(time.time()-start))
            #print ()
            #pbar2.close()
    pbar1.close()
for epoch in range(1):
    print ('Epoch '+str(epoch))
    del hiddens_test[:]
    del y_test[:]
    test()


Epoch 0


In [38]:
hiddens_test=np.array(hiddens_test)
hiddens_test.shape


(30, 1000, 200)

In [39]:
hiddens_test=np.reshape(hiddens_test, (30000, 200))
hiddens_test.shape

(30000, 200)

In [40]:
y_test=np.array(y_test)
y_test.shape
y_test=np.reshape(y_test, (30000, 1))
y_test.shape

(30000, 1)

In [43]:

def color():
    hidden_init = model.init_hidden(1,embeddings)
    loss_avg = 0
    for x, labels in batch_iterator(id_data_test, 100, forever=False):
        bsize=len(labels)
        hidden = hidden_init
        loss =0
        start = time.time()
        for i in range(bsize-1):
            batch  = x[:,i]
            output, hidden = model(batch, var(hidden.data))
            tempHidden=hidden.view(100)
            #tempHidden=list(tempHidden.data.numpy())
            tempHidden = tempHidden.data.numpy()
            tempHidden=tempHidden.reshape(1,-1)
            label=logreg.predict(tempHidden)
            word = list(vocab.keys())[list(vocab.values()).index(batch)]
            if label==0: #control - green 
                text="\033[0;30;42m "+ word
                print (text, end=" ")
            if label==1: #depressed - red
                text="\033[1;30;41m "+ word
                print (text, end=" ")
            if label==2: #ptsd-blue
                text="\033[1;30;44m "+ word
                print (text, end=" ")
            if i+1 < len(tokens):
                s1 = Variable(torch.LongTensor([tokens[i+1]]))
                loss += criterion(output, s1)
        hidden_init = copy_state(hidden)
        loss_avg = .99*loss_avg + .01*loss.data[0]/bsize
        pbar.update(1)
        print(epoch, loss_avg,  "Time in sec: " + str(time.time()-start))
    pbar.close()
    
for epoch in range(1):
    hiddens_test = []
    color()

## Visualize hidden states predictions

In [36]:
def predict():
    hidden_init = model.init_hidden(1,embeddings)
    loss_avg = 0
    for s in range(9000,10000):
        hidden = hidden_init
        loss =0
        #tokens = tweet
        tokens = id_data_test.tweet[s]
        for i,word in enumerate(tokens):
            output, hidden = loaded_model(word, var(hidden.data))
            ## Send to Logistic
            colorWord=hidden.view(100)
            colorWord = colorWord.data.numpy()
            colorWord = colorWord.reshape(1,-1)
            label  = logreg.predict(colorWord)
            output = output.view(1,output.shape[2])
            #word = list(vocab.keys())[list(vocab.values()).index(word)]
            if label==0: #control - green 
                text="\033[0;30;42m "+ word
                print (text, end=" ")
            if label==1: #depressed - red
                text="\033[1;30;41m "+ word
                print (text, end=" ")
            if label==2: #ptsd-blue
                text="\033[1;30;44m "+ word
                print (text, end=" ")
            if i+1 < len(tokens):
                s1 = Variable(torch.LongTensor([tokens[i+1]]))
                loss += criterion(output, s1)
        print("/n/n")
        hidden_init = copy_state(hidden)
predict()

## Get test labels

In [44]:
y_test = crisisEncoder.transform(y_test)
print(y_test.shape)

(30000,)


  y = column_or_1d(y, warn=True)


## Testing Accuracy of Logistic Regression

In [None]:
y_pred = logreg.predict(hiddens_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(hiddens_test, y_test)))

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

import seaborn as sns
import numpy
import matplotlib.pyplot as plt
xcm=confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)
xcm=numpy.array(xcm)
ax = sns.heatmap(xcm)
plt.xlabel('Test Labels')
plt.ylabel('Predicted Labels')
plt.show()

## Testing Accuracy of Other Classifiers 

In [None]:
## Naive Bayes
from sklearn.naive_bayes import GaussianNB
#nb = cross_val_score(MultinomialNB(), X_new, y,cv=3)
#print("Naive Bayes average accuracy : ",nb.mean())
clf = GaussianNB().fit(hiddens, y)
y_pred_nb = clf.predict(np.array(X_new_test))
print('Accuracy of Naive Bayes classifier on test set: {:.2f}'.format(clf.score(np.array(X_new_test), y_test)))

rf = RandomForestClassifier().fit(hiddens, y)
y_pred_et = extra_tree.predict(np.array(X_new_test))
print('Accuracy of Random Forest classifier on test set: {:.2f}'.format(rf.score(X_new_test, y_test)))

extra_tree = ExtraTreesClassifier().fit(hiddens, y)
y_pred_et = extra_tree.predict(np.array(X_new_test))
print('Accuracy of Extra Tree classifier on test set: {:.2f}'.format(extra_tree.score(np.array(X_new_test), y_test)))