# Demonstrate how to build a basic classifier driven by statistical features on crisis data.

In [1]:
# coding: utf8

# load required libraries
import spacy
import csv
import sys
import re
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
from nltk.tokenize import regexp_tokenize
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as prfs_score
from sklearn import cross_validation
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.externals import joblib
import pickle
import json

import codecs
import csv



# Upload Training and Test Data (if using a test file)

In [2]:
# Locations of the training data and of the optional test data
train_file, test_file = '../data/QFL.csv', '../data/CFL.csv'

In [3]:
# Upload a customised multi-lingual stopword dictionary

# Read the stopword dictionary through a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with codecs.open('../data/selected_lang_stopwords.json', 'r', 'utf-8') as stopword_file:
    data_json = json.load(stopword_file)

# Only the English ('en') entry is used; it is kept both as a set and as an
# immutable frozenset.
s = set(data_json['en'])
fs = frozenset(s)

In [4]:
# Define a tokenizer and stemming

# One shared Porter stemmer instance, reused for every token
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    """Turn a text into a list of lower-cased, stemmed tokens.

    The text is split on whitespace and on the punctuation characters listed
    in the pattern. Only pieces that are purely alphanumeric are kept; each
    one is lower-cased, stripped of surrounding punctuation and stemmed.

    Returns the list of stems in their original order.
    """
    # gaps=True: the pattern describes the separators, not the tokens
    pieces = regexp_tokenize(text, pattern=r"\s|[\.,:;'()?!]", gaps=True)

    stems = []
    for piece in pieces:
        # skip anything that is not purely alphanumeric
        if not piece.isalnum():
            continue
        normalised = piece.lower().strip(string.punctuation)
        stems.append(stemmer.stem(normalised))

    return stems


Load the language model in spaCy, the natural language processing library.

In [5]:
# Load the English spaCy model once. `nlp` is the name used by the feature
# extraction code; `nlp_en` keeps the language-specific name for the same object.
nlp = nlp_en = spacy.load('en')

# Define Statistical Feature extraction method

In [6]:
def extract_statfeatures(tweet, nlp_model=None):
    """Compute simple statistical features for one tweet.

    Parameters
    ----------
    tweet : str or bytes
        Raw tweet text. Byte strings are decoded as UTF-8 before tagging.
    nlp_model : callable, optional
        Tagger to use. Called with the text, it must return an iterable of
        tokens exposing a ``pos_`` attribute. Defaults to the notebook-level
        ``nlp`` object.

    Returns
    -------
    tuple
        ``(pronouns, nouns, verbs, tweet_length, token_count, numHashTag)``.
        ``nouns`` is the sum of NOUN and PROPN tokens.
    """
    tagger = nlp if nlp_model is None else nlp_model

    # Decode only real byte strings: calling .decode() on text that is already
    # unicode fails on Python 3 and, on Python 2, for non-ASCII text.
    text_for_tagging = tweet.decode('utf-8') if isinstance(tweet, bytes) else tweet

    # Tally the tokens per part-of-speech tag
    # (dict.get replaces dict.has_key, which no longer exists on Python 3).
    dict_pos = dict()
    for token in tagger(text_for_tagging):
        dict_pos[token.pos_] = dict_pos.get(token.pos_, 0) + 1

    pronouns = dict_pos.get('PRON', 0)
    nouns = dict_pos.get('NOUN', 0) + dict_pos.get('PROPN', 0)
    verbs = dict_pos.get('VERB', 0)

    # Surface statistics are taken from the input exactly as it was passed in,
    # so for a byte string the length is a byte count.
    tweet_length = len(tweet)
    token_count = len(re.findall(r'\w+', tweet))
    numHashTag = len([i for i in tweet.split() if i.startswith("#")])

    return pronouns, nouns, verbs, tweet_length, token_count, numHashTag


In [7]:
# Column names of the feature table, in storage order
t = [
    'Document',          # the tweet text itself
    'NumberOfNouns',
    'NumberOfVerbs',
    'NumberOfPronouns',
    'TweetLength',
    'NumberOfWords',
    'NumberOfHashTag',
]

In [8]:
# first pass loading the training data

# Read the whole tab-separated training file; only the read needs the handle.
with open(train_file) as fline:
    data_csv = list(csv.reader(fline, delimiter="\t", quoting=csv.QUOTE_NONE))

# All rows as a 2-D array of strings (column 1 is used as the document text,
# column 2 as the label).
data = np.array(data_csv)

# Feature table: one row per document, 7 columns (text + six statistics).
# dtype=object is used instead of a fixed-width '|S300' buffer, which would
# silently cut off any document longer than 300 bytes.
data_train = np.empty(shape=(len(data_csv), 7), dtype=object)

# the labels/class for each document
label_train = data[:, 2].astype(np.float32)


In [9]:
# creating the statistical features of the training data
for row in range(data.shape[0]):

    # column 1 of the raw data holds the document text
    document = data[row][1]
    pronouns, nouns, verbs, tweet_length, token_count, numHashTag = extract_statfeatures(document)

    data_train[row][0] = document

    # remaining columns follow the order of `t`: nouns, verbs, pronouns, ...
    ordered_stats = (nouns, verbs, pronouns, tweet_length, token_count, numHashTag)
    for col, value in enumerate(ordered_stats, 1):
        data_train[row][col] = value

In [10]:
# Wrap the feature table in a DataFrame so columns can be addressed by the names in `t`
frame_train = pd.DataFrame(data=data_train, columns=t)

In [11]:
# Quick check of what the data frame looks like

frame_train.head()

Unnamed: 0,Document,NumberOfNouns,NumberOfVerbs,NumberOfPronouns,TweetLength,NumberOfWords,NumberOfHashTag
0,RT if you are older than 9 #bigwet #getinvolved,4,2,1,47,9,2
1,#qldfloods lmao. Whoever put those things were...,5,3,1,122,22,1
2,#QLD Police: #bigwet Bruce H'Way near Mobil S...,12,1,0,107,18,2
3,RT @ACPMH: Check out @beyondblue looking after...,10,2,1,140,21,2
4,HANDY list of contacts and numbers. Have a loo...,7,6,1,125,23,2


# Transform the data to vectors

In [12]:
# Bag-of-words counts over unigrams produced by the custom tokenizer,
# capped at 40000 features
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize_and_stem,
    stop_words=fs,
    lowercase=True,
    ngram_range=(1, 1),
    max_features=40000,
)
doc_vectorize = vectorizer.fit_transform(frame_train['Document'])

# Re-weight the raw counts with TF-IDF
tf_transform = TfidfTransformer()
tf_vectorize = tf_transform.fit_transform(doc_vectorize)

In [13]:
# Dense block holding the six statistical feature columns, cast to float32
stat_features_train = frame_train[['NumberOfNouns', 'NumberOfVerbs', 'NumberOfPronouns', 'TweetLength',
                                   'NumberOfWords', 'NumberOfHashTag']].values.astype(np.float32)

# Append the statistical columns to the right of the TF-IDF matrix (CSR result)
data_train_stack = sparse.hstack((tf_vectorize, stat_features_train), format='csr')


In [14]:
# confirm the stack shape
# Single-argument print(...) produces the same text under Python 2 and
# Python 3, whereas the print statement is a syntax error on Python 3.
print('tf-vectorizer shape:  %s' % (tf_vectorize.shape,))
print('overall feature stack shape:  %s' % (data_train_stack.shape,))

tf-vectorizer shape:  (556, 1485)
overall feature stack shape:  (556, 1491)


In [15]:
# check the class-wise distribution of the labels
unique, counts = np.unique(label_train, return_counts=True)
# parenthesised single-argument print works on both Python 2 and Python 3
print(dict(zip(unique, counts)))

{0.0: 278, 1.0: 278}


# Declare the classifier and fit on the training data

In [16]:
# Define SVC kernel and fit

# Support vector classifier with a linear kernel
svc = SVC(
    kernel='linear',
    degree=3,
    gamma='auto',
    tol=0.001,
)

# Train on the stacked TF-IDF + statistical features; fit() is the last
# expression of the cell, so its return value is what the cell displays.
svc.fit(X=data_train_stack, y=label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Declare calling methods for the test data.

We create helper methods to classify the text of a single test tweet.

In [17]:
def get_test_tweet_features(test_tweet):
    """Build the one-row feature table for a single tweet.

    Returns a NumPy array of shape (1, 7) whose columns follow the order of
    `t`: the tweet text, then the noun, verb and pronoun counts, the tweet
    length, the word count and the hashtag count.
    """
    stats = extract_statfeatures(test_tweet)
    pronouns, nouns, verbs, tweet_length, token_count, numHashTag = stats

    # note the column order differs from the order extract_statfeatures returns
    row = [test_tweet, nouns, verbs, pronouns,
           tweet_length, token_count, numHashTag]

    return np.array([row])

In [18]:
def classify_tweet(test_tweet):
    """Classify one tweet with the fitted vectorizer, TF-IDF transformer and SVC.

    Returns a JSON string of the form {"class": "<label>"}, where <label> is
    the string form of the predicted label.
    """
    stat_columns = ['NumberOfNouns', 'NumberOfVerbs', 'NumberOfPronouns',
                    'TweetLength', 'NumberOfWords', 'NumberOfHashTag']

    frame_test = pd.DataFrame(get_test_tweet_features(test_tweet), columns=t)

    # text -> counts -> TF-IDF, using the objects fitted on the training data
    counts = vectorizer.transform(frame_test.Document)
    tfidf = tf_transform.transform(counts)

    # statistical columns are appended to the right of the TF-IDF features
    stat_values = frame_test[stat_columns].values.astype(np.float32)
    data_test_stack = sp.sparse.hstack((tfidf, stat_values), format='csr')

    predicted = svc.predict(data_test_stack)

    return json.dumps({'class': str(predicted[0])})

# End API call 

Returns a JSON response whose 'class' field is 1.0 (crisis related) or 0.0 (not related).

In [20]:
# parenthesised single-argument print works on both Python 2 and Python 3
print(classify_tweet('The car got drowned in the heavy flood.'))
print(classify_tweet('The building collapsed in the heavy rainfall.'))

{"class": "1.0"}
{"class": "0.0"}


In [21]:
#regexp_tokenize('Pygmy whales washed up in huge swells http://t.co/gt1wgSn9 @abcnews #bigwet', pattern=r"\s|[\.,:;'()?!]", gaps=True)

# Semantic Transformation of the Data

Create a BabelNet Key

In [22]:
import urllib2
import urllib
import json
import gzip

from StringIO import StringIO

# BabelNet Key : http://babelnet.org/ or Babelfy Key : http://babelfy.org/
# The key is read from the BABELNET_KEY environment variable so that it never
# has to be pasted into (and saved with) the notebook; it is '' when unset.
import os
key = os.environ.get('BABELNET_KEY', '')

# Define methods for calling annotation/disambiguation API, sense gathering, and semantic neighbours (hypernyms) 

In [23]:
# text annotation, returns a list of annotated SynsetIDs
def get_synsetID_text(text, lang='EN'):
    """Annotate `text` through the Babelfy disambiguation service.

    Parameters
    ----------
    text : str
        Text to annotate.
    lang : str, optional
        Value sent as the `lang` request parameter (default 'EN').

    Returns
    -------
    list
        One `[fragment, synsetId]` pair per annotation in the response.
        `fragment` is the slice of `text` covered by the annotation's
        character fragment and `synsetId` is its `babelSynsetID`.

    Performs a network request and authenticates with the notebook-level `key`.
    """
    service_url = 'https://babelfy.io/v1/disambiguate'

    params = {
        'text' : text,
        'lang' : lang,
        'key'  : key
    }

    url = service_url + '?' + urllib.urlencode(params)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)

    try:
        body = response.read()
        # gzip is only requested, not guaranteed: decompress when the server
        # says it compressed the body, otherwise parse the body as it is
        # (previously an uncompressed answer was silently discarded).
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # always release the connection, even if reading fails
        response.close()

    data = json.loads(body)

    synset_list = []
    for result in data:
        # character offsets of the annotated fragment within `text`
        charFragment = result.get('charFragment')
        cfStart = charFragment.get('start')
        cfEnd = charFragment.get('end')

        # the slice includes the character at index `end`, hence the +1
        synset_list.append([text[cfStart:cfEnd+1], result.get('babelSynsetID')])

    return synset_list

In [24]:
# retrieve the sense for each SynsetID, returns a dictionary for associated senses in English for each SynsetID
def get_babel_sense(id, lang='EN'):
    """Fetch the senses of one synset from the BabelNet `getSynset` endpoint.

    Parameters
    ----------
    id : str
        Synset ID to look up.
    lang : str, optional
        Value sent as the `targetLang` request parameter (default 'EN').

    Returns
    -------
    dict
        Maps each lower-cased `fullLemma` found in the response's `senses`
        list to 1, so the keys are the distinct lemmas.

    Performs a network request and authenticates with the notebook-level `key`.
    Raises KeyError when the response has no 'senses' entry.
    """
    service_url = 'https://babelnet.io/v5/getSynset'

    params = {
        'id' : id,
        'key'  : key,
        'targetLang' : lang
    }

    url = service_url + '?' + urllib.urlencode(params)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)

    try:
        body = response.read()
        # gzip is only requested, not guaranteed: decompress when the server
        # says it compressed the body, otherwise parse the body as it is
        # (previously an uncompressed answer was silently discarded).
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # always release the connection, even if reading fails
        response.close()

    data = json.loads(body)

    # a dict keyed by lemma removes duplicates; the value 1 is only a marker
    sense_dict = dict()
    for result in data['senses']:
        sense_dict[result['properties']['fullLemma'].lower()] = 1

    return sense_dict

In [25]:
# retrieve the neighbours of each SynsetID, in this case we extract Hypernyms to broaden the context
def get_babel_neighbours(id, lang='EN'):
    """Collect the hypernym edges of one synset from BabelNet `getOutgoingEdges`.

    Parameters
    ----------
    id : str
        Synset ID whose outgoing edges are requested.
    lang : str, optional
        Only edges whose `language` equals this value are kept (default 'EN').

    Returns
    -------
    list
        One `[id, target, relation]` entry per kept edge, where `target` and
        `relation` are the string forms of the edge's target and of its
        pointer name. An edge is kept when its pointer's `relationGroup`
        contains 'hypernym' (case-insensitive).

    Performs a network request and authenticates with the notebook-level `key`.
    """
    service_url = 'https://babelnet.io/v5/getOutgoingEdges'

    params = {
        'id' : id,
        'key'  : key
    }

    url = service_url + '?' + urllib.urlencode(params)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)

    try:
        body = response.read()
        # gzip is only requested, not guaranteed: decompress when the server
        # says it compressed the body, otherwise parse the body as it is
        # (previously an uncompressed answer was silently discarded).
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # always release the connection, even if reading fails
        response.close()

    data = json.loads(body)

    hyper_list = []
    for result in data:
        target = result['target']
        language = result['language']

        # retrieving BabelPointer data
        pointer = result['pointer']
        relation = pointer.get('name')
        # a pointer without a relationGroup is treated as "not a hypernym"
        # instead of crashing on None.lower()
        group = pointer.get('relationGroup') or ''

        # Types of relationGroup: HYPERNYM,  HYPONYM, MERONYM, HOLONYM, OTHER
        if 'hypernym' in group.lower() and str(language) == lang:
            hyper_list.append([id, str(target), str(relation)])

    return hyper_list

# Semantify a sample tweet

In [26]:
# test the use case

# `overall_concept` starts out as the plain tweet; later cells append to it
overall_concept = tweet = 'The building collapsed in the heavy rainfall.'

In [27]:
# Get SynsetIDs in the text
# Network call: annotates the tweet and uses the notebook-level `key`
synset_list_return = get_synsetID_text(tweet)

# Last expression of the cell: display the [fragment, synsetId] pairs
synset_list_return

[['building', u'bn:00084198v'],
 ['collapsed', u'bn:00085281v'],
 ['heavy', u'bn:00104050a'],
 ['rainfall', u'bn:00066032n']]

In [28]:
# Append the English senses of every annotated synset to the running text
for annotation in synset_list_return:

    # each annotation is a [fragment, synsetId] pair; the ID is the second item
    sense_dictonary = get_babel_sense(annotation[1])

    for sense in sense_dictonary.keys():

        overall_concept = overall_concept.strip() + ' ' + sense.strip()

In [29]:
overall_concept

u'The building collapsed in the heavy rainfall. make produce build construct collapse heavy raining intensity_frequency_and_duration wettest_spot_on_earth \U0001f326 \U0001f327 rained rainstorm pissing_it_down rainwater pluviophile heavy_rain_(meteorology) rainy rains hyetal rain_water rainfall \u26c6 wettest_places_on_earth rain rain_storm rain_measurement rainiest torrential_rain raindrops rainfall_intensity'

In [30]:
# Get neighbour SynsetIDs (hypernyms) for each SynsetID

for annotation in synset_list_return:

    # hypernym edges of this synset; each entry is [id, target, relation]
    neighbour_list = get_babel_neighbours(annotation[1])

    for neighbour in neighbour_list:

        # Get sense of each neighbouring Synset ID (the edge target)
        neighbour_sense_dictonary = get_babel_sense(neighbour[1])

        for sense in neighbour_sense_dictonary.keys():

            overall_concept = overall_concept.strip() + ' ' + sense.strip()

In [31]:
overall_concept

u'The building collapsed in the heavy rainfall. make produce build construct collapse heavy raining intensity_frequency_and_duration wettest_spot_on_earth \U0001f326 \U0001f327 rained rainstorm pissing_it_down rainwater pluviophile heavy_rain_(meteorology) rainy rains hyetal rain_water rainfall \u26c6 wettest_places_on_earth rain rain_storm rain_measurement rainiest torrential_rain raindrops rainfall_intensity make create fold_up fold turn_up precipitation_measurement cloud_condensation praecipitation atmospheric_precipitation downfall hydrometeor cumulonimbus_praecipitatio precipitaion precipitation_(meteorology) classification_of_clouds atmospheric_hydrometeor hydrometeors precipitaiton precipitation annual_precipitation convectional_precipitation'

# Semantic Enrichment Based Classification

In [32]:
# Loading already created model

# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
filename = "../data/SVC_classifier_model.pkl"
filename_vectorize = "../data/vectorizer.pkl"
filename_tf_transform = "../data/tf_transform.pkl"

# Each file is opened in a `with` block so its handle is closed as soon as
# unpickling is done (previously the handles were never closed).
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

with open(filename_vectorize, "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

with open(filename_tf_transform, "rb") as tf_transform_file:
    loaded_tf_transform = pickle.load(tf_transform_file)


In [33]:
def semantic_enrichment(tweet):
    """Extend a tweet with the senses of its synsets and of their hypernyms.

    The tweet is annotated once. The senses of every annotated synset are
    appended first, followed by the senses of every hypernym neighbour of
    those synsets. All terms are separated by single spaces.

    Returns the enriched text. Each lookup is a network request.
    """
    annotations = get_synsetID_text(tweet)
    enriched_tweet = tweet

    # First pass: senses of the synsets found in the tweet itself
    for annotation in annotations:
        for sense in get_babel_sense(annotation[1]).keys():
            enriched_tweet = enriched_tweet.strip() + ' ' + sense.strip()

    # Second pass: senses of each synset's hypernym neighbours
    for annotation in annotations:
        for neighbour in get_babel_neighbours(annotation[1]):
            # neighbour is [id, target, relation]; look up the target synset
            for sense in get_babel_sense(neighbour[1]).keys():
                enriched_tweet = enriched_tweet.strip() + ' ' + sense.strip()

    return enriched_tweet

In [34]:
# Sample tweet used to demonstrate classification after semantic enrichment
test_sem_tweet = 'The building collapsed in the heavy rainfall.'

In [35]:
# Enrich the sample tweet; semantic_enrichment issues network requests
enriched_tweet_response = semantic_enrichment(test_sem_tweet)

In [36]:
# The statistical features are computed on the original tweet; only the
# Document column carries the enriched text.
sem_stats = extract_statfeatures(test_sem_tweet)
pronouns, nouns, verbs, tweet_length, token_count, numHashTag = sem_stats

# one row, columns in the order of `t`
sem_row = [enriched_tweet_response, nouns, verbs, pronouns, tweet_length, token_count, numHashTag]
data_sem_test = np.array([sem_row])

In [37]:
data_sem_test

array([[u'The building collapsed in the heavy rainfall. make produce build construct collapse heavy raining intensity_frequency_and_duration wettest_spot_on_earth \U0001f326 \U0001f327 rained rainstorm pissing_it_down rainwater pluviophile heavy_rain_(meteorology) rainy rains hyetal rain_water rainfall \u26c6 wettest_places_on_earth rain rain_storm rain_measurement rainiest torrential_rain raindrops rainfall_intensity make create fold_up fold turn_up precipitation_measurement cloud_condensation praecipitation atmospheric_precipitation downfall hydrometeor cumulonimbus_praecipitatio precipitaion precipitation_(meteorology) classification_of_clouds atmospheric_hydrometeor hydrometeors precipitaiton precipitation annual_precipitation convectional_precipitation',
        u'2', u'1', u'0', u'45', u'7', u'0']], dtype='<U736')

In [38]:
frame_sem_test = pd.DataFrame(data_sem_test, columns=t)

# Vectorise the enriched text with the loaded vectorizer and TF-IDF transformer
test_sem_vectorize = loaded_vectorizer.transform(frame_sem_test.Document)
test_sem_tf_vectorize = loaded_tf_transform.transform(test_sem_vectorize)

# Statistical columns are appended to the right of the TF-IDF features
sem_stat_columns = ['NumberOfNouns', 'NumberOfVerbs', 'NumberOfPronouns',
                    'TweetLength', 'NumberOfWords', 'NumberOfHashTag']
sem_stat_values = frame_sem_test[sem_stat_columns].values.astype(np.float32)
data_sem_test_stack = sp.sparse.hstack((test_sem_tf_vectorize, sem_stat_values), format='csr')

label_sem_test_predict = loaded_model.predict(data_sem_test_stack)

# Last expression of the cell: the prediction as a JSON string
json.dumps({'class': str(label_sem_test_predict[0])})

'{"class": "1.0"}'