## Load Data

In [18]:
import tarfile,sys
import pandas as pd
import os
import simplejson
import re as regex
 
def untar(fname):
    if (fname.endswith("tar") or fname.endswith("tgz") ):
        tar = tarfile.open(fname)
        tar.extractall()
        tar.close()
        print "Extracted in Current Directory"
    else:
        print "Not a tar.gz file: '%s '" % sys.argv[0]

In [4]:
untar('shared_task_data.tar')

Extracted in Current Directory


In [5]:
for i in range(1,2): 
    fpath='final_training_data/'+str(i)+'.tgz'
    untar(fpath)

Extracted in Current Directory


In [6]:
userInfo=pd.read_csv('anonymized_user_info_by_chunk.csv')

## Function to extract features

In [None]:
def add_first_person(temp):
    import numpy as np
    #first person 
    first =  ["i", "i'd", "i'll", "i'm", "i've", "me", "my"]
    temp['firstperson'] =temp['text'].str.lower().str.split()
    count_fp = 0
    for i in temp['text']:
        count_fp += len(set(i) & set(first))
    temp['firstperson'] = count_fp
    
#tsugawa_terms 
def tsugawag(temp):
    tsugawa_terms = ["even if", "low fever", "very", "workplace", "hopeless", "disappear", "too much", "sickness", "bad", "hospital"]
    temp_words =temp['text'].str.lower().str.split()
    count_tsugawa = 0
    for i in temp['text']:
        count_tsugawa += len(set(i) & set(tsugawa_terms))
    count_final = count_tsugawa/len(temp["text"])
    temp['tsugawa'] = count_final
    
def avg_sentiment(temp):
    from textblob import TextBlob
    avg = 0
    for i in temp['text']:
        blob = TextBlob(i)
        sentiment = blob.sentiment.polarity
        avg += sentiment
    avg_sentiment = avg / len(temp['text'])
    temp['avgsentiment'] = avg_sentiment

def get_all_tweets(temp):
    all_text = ""
    for i in temp['text']:
        all_text+=i
    temp['tweet'] = all_text
    #print(all_text)

def count_by_lambda(expression, word_array):
    return len(list(filter(expression, word_array)))

def count_occurences(character, word_array):
    counter = 0
    for j, word in enumerate(word_array):
        for char in word:
            if char == character:
                counter += 1

    return counter

def count_by_regex(regex, plain_text):
    return len(regex.findall(plain_text))


def add_columns_punctuations(temp):
    temp["splitted_text"] = map(lambda txt: txt.split(" "), temp["text"])
    # number of uppercase words
    uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt),temp["splitted_text"]))
       
    temp["number_of_uppercase"] =  uppercase

    # number of !
    exclamations = list(map(lambda txt: count_occurences("!", txt),temp["splitted_text"]))

    temp["number_of_exclamation"] = exclamations

    # number of ?
    questions = list(map(lambda txt: count_occurences("?", txt),temp["splitted_text"]))

    temp["number_of_question"] = questions

    # number of ...
    ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt),temp["text"]))

    temp["number_of_ellipsis"] = ellipsis

## Load all tweets with the above features and construct a big feature matrix

In [20]:
columns = ['class','user_name','id','tweet','avgsentiment', 'favorite_count', 'firstperson','geo','possibly_sensitive','retweet_count','retweeted','tsugawa','number_of_uppercase','number_of_exclamation','number_of_question','number_of_ellipsis']
data=pd.DataFrame(columns=columns)
for index, row in userInfo.iterrows():
    user=row['anonymized_screen_name']+'.tweets'
    if os.path.isfile(user): 
        print 'Reading file '+user
        try: 
            temp=pd.read_json(str(user), lines=True)
            add_first_person(temp)
            tsugawag(temp)
            avg_sentiment(temp)
            get_all_tweets(temp)
            add_columns_punctuations(temp)
            temp['user_name'] = row['anonymized_screen_name']
            temp['class'] = row['condition']
            final_temp = temp[columns]
            if data.empty:
                data=final_temp.loc[0:0]
            else:
                data = data.append(final_temp.loc[0:0])
        except:
            print "Bad File!"

Reading file p1oWIeRy95.tweets
1064.26092142
0.354753640472
Reading file u6Q1cPYaegBS.tweets
118.556660476
0.155586168603
Reading file q5nbV1Dwkx3Uz8y.tweets
Bad File!
Reading file cpZb1zL.tweets
34.9936931067
0.08555915185
Reading file nC8trwq.tweets
278.282699199
0.0927608997331
Reading file mbKpnEfzJFe.tweets
344.904996417
0.114968332139
Reading file sdfBjrV9y.tweets
Bad File!
Reading file cMwx3Gi9RuskPTa.tweets
202.143428532
0.0941515736061
Reading file gnau5VYmZon.tweets
724.772792218
0.241590930739
Reading file w5zZbsisYcaA.tweets
192.546016189
0.201830205649
Reading file iBjhOmD.tweets
447.713145671
0.149237715224
Reading file q69HY7gQXtabz.tweets
502.975796395
0.167658598798
Reading file rHk1BCuEdYFyMB.tweets
274.205049695
0.0914016832316
Reading file ee7ABQXno.tweets
582.961258286
0.194320419429
Reading file g15QMjE.tweets
Bad File!
Reading file htBBgz9n8CLGzlp.tweets
356.688462186
0.118896154062
Reading file c_i5fdp13L.tweets
828.788868285
0.276262956095
Reading file kOjd8viH

In [21]:
data

Unnamed: 0,class,user_name,id,tweet,avgsentiment,favorite_count,firstperson,geo,possibly_sensitive,retweet_count,retweeted,tsugawa,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis
0,depression,p1oWIeRy95,3858261610217213963,@b4a7LKoKrkpq goodnight chicken strip i love y...,0.354754,1,2227,,,0,False,0,1,0,0,0
0,control,u6Q1cPYaegBS,3270683617599762436,Dinner wth my gals yeeeeyyy!!@kPc5t4sovdlz @jx...,0.155586,0,645,,,0,False,0,0,2,0,0
0,control,cpZb1zL,5787167597109857946,Ask me a question | http://t.co/H~dGIlbtRmWhat...,0.085559,0,367,,0.0,0,False,0,1,0,0,0
0,depression,nC8trwq,4258046557791254797,RT @pvuoFh0N: Being a gentleman never goes out...,0.092761,0,2222,,,210,False,0,1,0,0,0
0,control,mbKpnEfzJFe,9036456942659112671,RT @nT9KP4M4GGud: An intelligent man is so att...,0.114968,0,2431,,,3,False,0,2,0,0,0
0,control,cMwx3Gi9RuskPTa,2135526198652413743,RT @ewczqMHQWKora: Wash the dirt off and shine...,0.094152,0,1835,,,3,False,0,1,0,0,0
0,depression,gnau5VYmZon,9109593059979792776,@mSU4tZcqIw70Bk praying for her safe return!RT...,0.241591,1,2620,,,0,False,0,0,1,0,0
0,control,w5zZbsisYcaA,5554764028281426930,"Today stats: One follower, No unfollowers via ...",0.20183,0,659,,0.0,0,False,0,0,0,0,0
0,ptsd,iBjhOmD,1008160895688102822,RT @g2zLMrKgT8FHn: Today is the day! lowest ra...,0.149238,0,2554,,0.0,3,False,0,2,2,0,0
0,control,q69HY7gQXtabz,5692898157399754606,#MondayBlogs You Owe It To Your Readers To Be ...,0.167659,3,2697,,0.0,9,False,0,1,1,0,0


## Clean Up data

In [22]:
def remove_by_regex(tweets, regexp):
    tweets.loc[:, "tweet"].replace(regexp, "", inplace=True)
    return tweets

def remove_urls(tweets):
    return remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

def remove_na(tweets):
    return tweets[tweets["tweet"] != "Not Available"]

def remove_special_chars(tweets):  # it unrolls the hashtags to normal words
    for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                 "@", "%", "^", "*", "(", ")", "{", "}",
                                                                 "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                 "!", "?", ".", "'",
                                                                 "--", "---", "#"]):
        tweets.loc[:, "tweet"].replace(remove, "", inplace=True)
    return tweets

def remove_usernames(tweets):
    return remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

def remove_numbers(tweets):
    return remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [24]:
data = data
data = remove_urls(data)
data = remove_na(data)
data = remove_special_chars(data)
data = remove_usernames(data)
data = remove_numbers(data)
#data.cleanup(TwitterCleanuper())
data.head(5)

Unnamed: 0,class,user_name,id,tweet,avgsentiment,favorite_count,firstperson,geo,possibly_sensitive,retweet_count,retweeted,tsugawa,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis
0,depression,p1oWIeRy95,3858261610217213963,baLKoKrkpq goodnight chicken strip i love you ...,0.354754,1,2227,,,0,False,0,1,0,0,0
0,control,u6Q1cPYaegBS,3270683617599762436,Dinner wth my gals yeeeeyyykPctsovdlz jxXTU ha...,0.155586,0,645,,,0,False,0,0,2,0,0
0,control,cpZb1zL,5787167597109857946,Ask me a question are you wearing right now —...,0.085559,0,367,,0.0,0,False,0,1,0,0,0
0,depression,nC8trwq,4258046557791254797,RT pvuoFhN Being a gentleman never goes out of...,0.092761,0,2222,,,210,False,0,1,0,0,0
0,control,mbKpnEfzJFe,9036456942659112671,RT nTKPMGGud An intelligent man is so attracti...,0.114968,0,2431,,,3,False,0,2,0,0,0


## Tokenize Words, Remove Stop Words, Lemmatize

In [25]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
def clean_data(ingredients):
    lmtzr = WordNetLemmatizer()
    def split_word(strOfWords):
        #print(strOfWords)
        tempWords  = []
        toRemove  = []
        for word in strOfWords.split():
            word = "".join(x for x in word if x.isalpha())
            word = "".join(lmtzr.lemmatize(word))
            if word.lower() not in toRemove:
                 tempWords.append(word)
        return tempWords
    return [split_word(y) for y in ingredients]

In [26]:
data["tweet"]= clean_data(data["tweet"])

In [27]:
data["total_words"] = data["tweet"].map(",".join)
data["total_words"] = data["total_words"].str.strip()

In [28]:
# Remove Na and change False to a number
data = data.fillna(0)
data.shape
data.drop('id', axis=1)
data['retweeted'] = data['retweeted']*1
data

Unnamed: 0,class,user_name,id,tweet,avgsentiment,favorite_count,firstperson,geo,possibly_sensitive,retweet_count,retweeted,tsugawa,number_of_uppercase,number_of_exclamation,number_of_question,number_of_ellipsis,total_words
0,depression,p1oWIeRy95,3858261610217213963,"[baLKoKrkpq, goodnight, chicken, strip, i, lov...",0.354754,1,2227,0.0,0.0,0,0,0,1,0,0,0,"baLKoKrkpq,goodnight,chicken,strip,i,love,you,..."
0,control,u6Q1cPYaegBS,3270683617599762436,"[Dinner, wth, my, gal, yeeeeyyykPctsovdlz, jxX...",0.155586,0,645,0.0,0.0,0,0,0,0,2,0,0,"Dinner,wth,my,gal,yeeeeyyykPctsovdlz,jxXTU,hah..."
0,control,cpZb1zL,5787167597109857946,"[Ask, me, a, question, are, you, wearing, righ...",0.085559,0,367,0.0,0.0,0,0,0,1,0,0,0,"Ask,me,a,question,are,you,wearing,right,now,,M..."
0,depression,nC8trwq,4258046557791254797,"[RT, pvuoFhN, Being, a, gentleman, never, go, ...",0.092761,0,2222,0.0,0.0,210,0,0,1,0,0,0,"RT,pvuoFhN,Being,a,gentleman,never,go,out,of,s..."
0,control,mbKpnEfzJFe,9036456942659112671,"[RT, nTKPMGGud, An, intelligent, man, is, so, ...",0.114968,0,2431,0.0,0.0,3,0,0,2,0,0,0,"RT,nTKPMGGud,An,intelligent,man,is,so,attracti..."
0,control,cMwx3Gi9RuskPTa,2135526198652413743,"[RT, ewczqMHQWKora, Wash, the, dirt, off, and,...",0.094152,0,1835,0.0,0.0,3,0,0,1,0,0,0,"RT,ewczqMHQWKora,Wash,the,dirt,off,and,shine,o..."
0,depression,gnau5VYmZon,9109593059979792776,"[mSUtZcqIwBk, praying, for, her, safe, returnR...",0.241591,1,2620,0.0,0.0,0,0,0,0,1,0,0,"mSUtZcqIwBk,praying,for,her,safe,returnRT,mSUt..."
0,control,w5zZbsisYcaA,5554764028281426930,"[Today, stats, One, follower, No, unfollowers,...",0.20183,0,659,0.0,0.0,0,0,0,0,0,0,0,"Today,stats,One,follower,No,unfollowers,via,st..."
0,ptsd,iBjhOmD,1008160895688102822,"[RT, gzLMrKgTFHn, Today, is, the, day, lowest,...",0.149238,0,2554,0.0,0.0,3,0,0,2,2,0,0,"RT,gzLMrKgTFHn,Today,is,the,day,lowest,rate,fo..."
0,control,q69HY7gQXtabz,5692898157399754606,"[MondayBlogs, You, Owe, It, To, Your, Readers,...",0.167659,3,2697,0.0,0.0,9,0,0,1,1,0,0,"MondayBlogs,You,Owe,It,To,Your,Readers,To,Be,I..."


## CountVectorizer

In [None]:
def wordTokenizer(text):
    return text.split(",")
#Use count vectorizer to create X = n*d where n = number of samples, d=dishes. 
cv = CountVectorizer(stop_words="english",lowercase=True,tokenizer=wordTokenizer,token_pattern="[A-Za-z]*",strip_accents='ascii',vocabulary=None)
X = cv.fit_transform(data["total_words"].values)
print("Unique ingredients after data cleaning: ", X.shape)

## TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True,stop_words='english', lowercase=True,tokenizer=wordTokenizer,token_pattern="[A-Za-z]*",strip_accents='ascii',vocabulary=None)
train_corpus_tf_idf = vectorizer.fit_transform(data["total_words"].values) 
#test_corpus_tf_idf = vectorizer.transform(X_test)
    

In [None]:
print("Unique tokens", train_corpus_tf_idf.shape)
X = train_corpus_tf_idf

## Add features to the word vectors

In [None]:
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
t = data.iloc[:,3:11]
t = sparse.csr_matrix(t)
print(t.shape)
X_new = sparse.hstack((X,t))
print(X_new.shape)

## Encode Labels 

In [None]:
from sklearn.preprocessing import LabelEncoder
crisisEncoder = LabelEncoder()
y = crisisEncoder.fit_transform(data['class'])

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
log = cross_val_score(LogisticRegression(), X_new, y,cv=3)
print("Logisitic regression average accuracy : ",log.mean())

## Naive Bayes

In [30]:
## Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb = cross_val_score(MultinomialNB(), X_new, y,cv=3)
print("Naive Bayes average accuracy : ",nb.mean())
#clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

NameError: name 'X_new' is not defined