# Analyzing ISIS tweets Dataset

A one-hour trainining of analysing ISIS tweet dataset from kaggle - https://www.kaggle.com/kzaman/how-isis-uses-twitter

This Notebook implements text analytics process:
* tokenizing
* stemming
* stop-words removal
* index and inverted index creation for token-count


In [2]:
import pandas as pd
import nltk
import string
import time
import math
from tqdm import tqdm
from sklearn.cross_validation import train_test_split



In [4]:
data = pd.read_excel('tweets.xlsx')
data


Unnamed: 0,name,username,description,location,followers,numberstatuses,time,tweets
0,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:07,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:27,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:29,ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:37,ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:45,ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...
5,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:51,THE SECOND CLIP IN A DA'WAH SERIES BY A SOLDIE...
6,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 22:04,ENGLISH TRANSCRIPT : OH MURABIT! : http://t.co...
7,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 22:06,ENGLISH TRANSLATION: 'A COLLECTION OF THE WORD...
8,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 22:17,Aslm Please share our new account after the pr...
9,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/10/2015 0:05,ENGLISH TRANSLATION: AQAP STATEMENT REGARDING ...


## extract Tokens from text

In [6]:
def tokenize(msg):
    #converts a text into a list of stemmed , non-stop word, tokens
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') # stop character sensitive tokenizer
    tok = nltk.word_tokenize(msg) # white space tokenizes
    stemmer = nltk.stem.porter.PorterStemmer() #Porter stemmer object
    out = [] # result list
    for i in tok:
        # for tokens contains digits - we'll take them as-is, while other tokens will be splitted using puctuation
        if any(c.isdigit() for c in i): #token contains numbers
            steminput = [i] # add the token to stemmer input
        else: # non-digits
            steminput = tokenizer.tokenize(i) #split using punctuation
        for mem in steminput:
            w = stemmer.stem(mem.lower()) # w is stemmed lowercase token
            if w in nltk.corpus.stopwords.words('english') or w in list(string.punctuation): continue
            # stop words and sole-punctuation tokens removal
            else: out.append(w) #add the final token to the output
    return out # list of stemmed , non-stop word, tokens

In [7]:
data['token'] = data.apply(lambda x: tokenize(x['tweets']) ,axis = 1)

In [10]:
tw= data.iloc[:,[1,8]].groupby('username').sum().reset_index() # user and his words

In [11]:
twavgstatuses =  data.iloc[:,[1,4,5]].groupby('username').max().reset_index()

In [13]:
tw['num'] =  twavgstatuses['numberstatuses']
tw['follow'] = twavgstatuses['followers']
tw.head()

Unnamed: 0,username,token,num,follow
0,04_8_1437,"[amaqag, martyrdom, oper, rig, vehicl, hit, ga...",56,71
1,06230550_IS,"[current, situat, map, wilayat, kheyr, close, ...",93,140
2,1515Ummah,"[asalam, alikum, la, familia, isi, dawlah, isl...",169,214
3,1Dawlah_III,"[wilayatninawa, tour, sukar, neighborhood, mos...",333,632
4,432Mryam,"[pleas, follow, support, http, //t.co/g2bqagw3...",70,169


# Build index for tf-idf

In [14]:
def createIndex(data):
    #builds inverted index based on documents
    index = {"$ N $":0} # amount of documents in corpus
    docs = {} # regular index of documents and included words
    print "initializing index of",len(data),"documents" ,time.ctime(), 
    for i in data:
        index["$ N $"]+=1
        tfdic = tf_calc(readDocument(path,i))
        docs[i] = tfdic # add document to documents index
        for w in tfdic:
            if index.has_key(w):
                index[w][i] = tfdic[w] # add document to the inverted index
            else:
                index[w] = {i:tfdic[w]}
                
    print "index created with",len(index),"words", time.ctime()
    return index,docs

In [15]:
def tf_calc(wordList):
    # reads list of words and returns their tf
    total = len(wordList)
    out = {}
    for w in wordList:
        if out.has_key(w):
            out[w]+= 1.0/total # add 1 word to the counter
        else:
            out[w] = 1.0/total
    return out # dictionary of tf rates

In [16]:
def createIndex(docs):
    #builds inverted index based on documents
    index = {"$ N $":0} # amount of documents in corpus
    print "initializing index of",len(docs),"users" ,time.ctime(), 
    for i in docs:
        index["$ N $"]+=1
        tfdic = docs[i]
        for w in tfdic:
            if index.has_key(w):
                index[w][i] = tfdic[w] # add document to the inverted index
            else:
                index[w] = {i:tfdic[w]}
    print    
    print "index created with",len(index),"words", time.ctime()
    return index

# Tf - idf implementation

In [17]:
def tfidf_calc(doc, word, index):
    # tf-idf calculation based on document and a word
    if doc.has_key(word) == False or index.has_key(word)== False:
        return 0 # if there are no suitable documents return 0
    tf = doc[word]*1.0 # tf based on the document
    df = len(index[word])
    idf = math.log(index['$ N $']*1.0/df,10) # idf based on global df
    return tf*idf

In [18]:
def getVectorSize(v, index):
    # calculates vector size (based on tf-idf)
    ans = 0
    for i in v:
        ans += math.pow(tfidf_calc(v,i,index),2)
    return math.sqrt(ans)

def getSimilarity(d,q, index):
    # calculate cosine similarity between a query and a document
    ans = 0
    for i in q:
        ans += tfidf_calc(q,i,index) * tfidf_calc(d,i,index)
    
    return ans/(getVectorSize(q, index)*getVectorSize(d, index))

In [19]:
# Find top K similar tweets

In [20]:
def TopKSimilarity(q,t,index,k):
    # get k most similar documents
    similarityDict = {}
    for doc in t:
        similarityDict[doc] = getSimilarity(t[doc],q,index)
    Topk =  sorted(similarityDict, key=similarityDict.get, reverse=True)[:k]
    return Topk # return list of relevant docs

## Popularity Prediction  - KNN regression

In [21]:
train, test = train_test_split(tw, test_size = 0.3)

In [22]:
# building index on the train
users = {}
for i in tqdm(train.iterrows()):
    users[i[1][0]] = tf_calc(i[1][1])

index = createIndex(users)

78it [00:00, 928.57it/s]


initializing index of 78 users Sun Jun 04 17:25:54 2017
index created with 22769 words Sun Jun 04 17:25:54 2017


In [23]:
def predict(user,users,index,k,train):
    topK = TopKSimilarity(user,users,index,k)
    scores = train[train['username'].isin(topK)].median()
    num,follow = scores['num'], scores['follow']    
    return (num,follow)

In [24]:
testusers = {}
for i in tqdm(test.iterrows()):
    testusers[i[1][0]] = tf_calc(i[1][1])

34it [00:00, 739.13it/s]


In [25]:
ks = [1,3,5,7,9]
for k in tqdm(ks):
    test.loc[:,'pnum'+str(k)] = test.apply(lambda x: predict(testusers[x['username']],users,index,k,train)[0] ,axis = 1)
    test.loc[:,'pfollow'+str(k)] = test.apply(lambda x: predict(testusers[x['username']],users,index,k,train)[1] ,axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:57<00:00, 23.44s/it]


# Results for differents K's

In [26]:
test.head()

Unnamed: 0,username,token,num,follow,pnum1,pfollow1,pnum3,pfollow3,pnum5,pfollow5,pnum7,pfollow7,pnum9,pfollow9
103,squadsquaaaaad,"[bismillah, acc, number, 45, follow, rt, jzk, ...",60,57,847.0,353.0,847.0,353.0,847.0,353.0,847.0,353.0,275.0,353.0
88,lNSlDEWAR,"[isi, know, ca, n, move, forward, fastli, larg...",259,528,8473.0,965.0,8473.0,7566.0,8473.0,2230.0,7374.0,2021.0,7374.0,2021.0
56,_IshfaqAhmad,"[game, come, espncricinfo, ellys, perri, beaut...",13960,1667,8473.0,965.0,7374.0,2230.0,7374.0,2230.0,5241.0,2021.0,807.0,1307.0
12,Abu_Ibn_Taha,"[rt, wikileak, hillari, email, egyptian, milit...",24,73,8473.0,965.0,8473.0,1482.0,8473.0,2230.0,7374.0,2021.0,5241.0,1482.0
105,thefIamesofhaqq,"[rt, eplc24, alert, gt, hijrahwit, plz, bewar,...",224,318,8473.0,965.0,8473.0,965.0,211.0,754.0,5241.0,965.0,5241.0,965.0


# RMSE evaluation

In [28]:
from sklearn.metrics import mean_squared_error
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true, y_pred)

0.375

In [41]:
res = {}
for k in ks:
    res[k]=  (mean_squared_error(test['num'],test['pnum'+str(k)])**0.5, mean_squared_error(test['follow'],test['pfollow'+str(k)])**0.5)
res

{1: (10381.872764016871, 886.27477149484253),
 3: (7493.1455854471351, 3808.2648125298406),
 5: (7044.245500865979, 1347.8700626849582),
 7: (6632.943169380791, 1121.900095164187),
 9: (6211.0973975902953, 1050.3175850245318)}

<b> best k for num (number of tweets) - k=9, RMSE = 6211

best k for follow - k=1, RMSE = 886

# Single user example

In [230]:
line = test.iloc[1,:]
print line

username                                              nvor85j
token       [dowlty__isi, jazakallah, khairan, akhi, break...
num                                                       133
follow                                                    238
Name: 97, dtype: object


In [259]:
top5 = TopKSimilarity(u,users,index,5)
print(top5)
#scores = train[train['username'].isin(topK)].mean()
#num,follow = scores['num'], scores['follow'] 

[u'ro34th', u'btt_ar', u'k_kid04', u'pleaoftheummah', u'al_zaishan10']


In [250]:
train[train['username'].isin(topK)].median()

num       179
follow    187
dtype: float64