# Text Classification using hybrid features
> Features: Bag-of-words + N-gram + POS-tag + Entity Types

File Name: GKB_Text_Classification_Hybrid

Created Date: 1 March 2017

Author: Boya Chen

Purpose: Machine Learning Experiments and Results Generation (Hybrid features)

Environment: Python [2.7.10] ; IPython [5.3.0]

Dependencies: scikit-learn [0.18.1] ; numpy [1.12.1] ; scipy [0.19.0]


In [83]:
import json
import random
from time import time
from scipy.sparse import csr_matrix
import csv
import numpy as np
import sklearn
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, defaultdict
from time import time
from scipy.sparse import hstack

## 1 - Reading CSV and do pre-processing

[OUTPUT] labels, t_labels, t_pos, p_labels, p_pos,  

In [85]:
# GEO for 'Geolocation' related classes, (First Column Labels)
# APP for 'Appearance' related classes, (Second Column Labels)
GEO = 0
APP = 1
TOPIC = GEO # <- Select One Topic

# Dataset Files Paths 
DATASET_PATH = './dataset.txt'
CLEANED_SENTENCES_PATH = "./dataset_cleaned_sentences_93994.csv"
POS_TAGS_PATH = './dataset_pos_tags_93994.csv'
ENTITY_OSMINFO_PATH = "./entities_set.json"

##########################################
#  Prepare training and prediction data  #
##########################################
t1 = time(); print '[0 - Preparing Data]: ... ...'

content = []
with open(DATASET_PATH, 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        content.append(row)

# Reading the list of labels
labels = []
for item in content:
    labels.append(item[TOPIC])

# Reading raw sentences
sents = []
for item in content:
    sents.append(item[2])
    
# Reading Pre-processed and Cleaned Sents
cleaned_sents = []
with open(CLEANED_SENTENCES_PATH, "rb") as f:
    csvReader = csv.reader(f)
    for row in csvReader:
        cleaned_sents.append(row)

# Acquire ent for each sentences
with open(ENTITY_OSMINFO_PATH, "rb") as f:
    ent_with_info = json.load(f)    
ent_sents = []
for item in content:
    ent_sents.append(item[3])
type_sents = []
for item in ent_sents:
    try:
        type_feat = defaultdict()
        for t in ent_with_info[item]['ent_type']:
            type_feat[t] = 1
        type_sents.append(type_feat)
    except:
        type_sents.append({})

# Reading POS tags (Pre-processed - 93994 lines)
pos_sents = []
with open(POS_TAGS_PATH, 'rb') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        pos_sents.append(row)

# Splitting the data
cut_num = 17000 # | 0~cut_num --> Training; cut_num~end --> Prediction(unlabeled)
t_labels = np.array(labels[:cut_num])
t_sents = np.array(sents[:cut_num])
t_pos_sents = np.array(pos_sents[:cut_num])

# Rest of sentences are for prediction
p_sents = np.array(sents[cut_num:])
p_pos_sents = np.array(pos_sents[cut_num:])

print '[0 - Preparation Completed]:', time()-t1, "s"

[0 - Preparing Data]: ... ...
[0 - Preparation Completed]: 2.16776990891 s


### !------- Imbalance Classification Handling ---------!

In [87]:
# Resampling datset in order to handle the dataset imbalance problem

index_of_t = [l for l in range(len(t_labels)) if t_labels[l] == "t"]
index_of_f = [l for l in range(len(t_labels)) if t_labels[l] == "f"]
# Shuffle the negative documents in order to do random sampling
random.shuffle(index_of_f)
# Resample from the labeled data
index_of_f = index_of_f[:int(len(index_of_t) * 1.5)]
index_of_resample = list(sorted(index_of_t + index_of_f))
np.random.shuffle(index_of_resample) # randomly shuffle the order
t_labels_resample = [t_labels[i] for i in range(len(t_labels)) if i in index_of_resample]

print "size of positive documents:", len(index_of_t)
print "size of negative documents:", len(index_of_f)
print "size of training dataset:", len(t_labels_resample)

size of positive documents: 1016
size of negative documents: 1524
size of training dataset: 2540


## 2 - Feature Engineering

###  [1] BAG OF WORDS

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

def get_BOW(text):
    """Return a bag-of-words dict for one tokenized sentence.

    :param text: iterable of hashable tokens (e.g. a list of words).
    :return: plain ``dict`` mapping each distinct token to the number of
             times it occurs in ``text``.
    """
    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(text))

def prep_train_date(data):
    """Vectorize tokenized sentences into a bag-of-words feature matrix.

    :param data: iterable of tokenized sentences; each one is passed to
                 ``get_BOW``.
    :return: the result of ``DictVectorizer().fit_transform`` on the
             per-sentence bag-of-words dicts.
    """
    bow_per_line = []
    for line in data:
        bow_per_line.append(get_BOW(line))
    # A fresh DictVectorizer learns the vocabulary from these dicts and
    # encodes them in one step.
    return DictVectorizer().fit_transform(bow_per_line)

print "1 - Building BOW feature matrix ... ..."
t0 = time()
dataset_1 = prep_train_date(cleaned_sents)
t_dataset_1 = dataset_1[:len(t_labels)]
p_dataset_1 = dataset_1[len(t_labels):]

t_dataset_1_r = csr_matrix([dataset_1[i].toarray()[0] for i in index_of_resample])

print "1 - [DONE] ---------->", time()-t0, "s"

t_dataset_1, t_dataset_1_r, p_dataset_1

1 - Building BOW feature matrix ... ...
1 - [DONE] ----------> 6.68109893799 s


(<17000x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 194240 stored elements in Compressed Sparse Row format>,
 <2540x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 30759 stored elements in Compressed Sparse Row format>,
 <76994x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 901587 stored elements in Compressed Sparse Row format>)

### [2] N-gram + vectorization

In [82]:
################################################
#  Preparing Training Data with Vectorization  #
################################################
def get_bag_of_ngram(text, n):
    """Count the contiguous n-grams of one tokenized sentence.

    :param text: sequence of tokens (must support slicing).
    :param n: n-gram length.
    :return: ``Counter`` keyed by n-gram tuples; empty when ``text`` has
             fewer than ``n`` tokens.
    """
    last_start = len(text) - n + 1
    return Counter(tuple(text[start:start + n]) for start in range(last_start))

def prep_train_date_NGRAM(data, n):
    """Vectorize tokenized sentences into an n-gram count feature matrix.

    :param data: iterable of tokenized sentences; each one is passed to
                 ``get_bag_of_ngram``.
    :param n: n-gram length forwarded to ``get_bag_of_ngram``.
    :return: the result of ``DictVectorizer().fit_transform`` on the
             per-sentence n-gram counters.
    """
    ngram_counts = []
    for sent in data:
        ngram_counts.append(get_bag_of_ngram(sent, n))
    return DictVectorizer().fit_transform(ngram_counts)

t0 = time(); print "2 - Building Bigram Feature Matrix ... ..."
# -- Call the Function (1~2 Minutes)
dataset_2 = prep_train_date_NGRAM(cleaned_sents, 2)

# Split the dataset into TRAINING and PREDICTION datasets
t_dataset_2 = dataset_2[:len(t_labels)]
p_dataset_2 = dataset_2[len(t_labels):]
t_dataset_2_r = csr_matrix([dataset_2[i].toarray()[0] for i in index_of_resample])

print "2 - [DONE] ---------->", time()-t0, "s"
t_dataset_2, t_dataset_2_r, p_dataset_2, len(t_labels)

2 - Building Bigram Feature Matrix ... ...
2 - [DONE] ----------> 77.8475618362 s


(<17000x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 189809 stored elements in Compressed Sparse Row format>,
 <2540x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 30482 stored elements in Compressed Sparse Row format>,
 <76994x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 883730 stored elements in Compressed Sparse Row format>,
 17000)

### [3] BAG OF POS TAGS - bigram | vectorization 

In this phase, we build a feature matrix that contains POS-tag bigram information (the "n" of the n-gram can be set manually).

In [81]:
# Function of getting "bag of pos tags"
def get_ngram_POS(poses, n):
    """Count the contiguous n-grams in one sentence's POS-tag sequence.

    :param poses: sequence of POS tags (must support slicing).
    :param n: n-gram length.
    :return: ``Counter`` keyed by tag n-gram tuples; empty when ``poses``
             has fewer than ``n`` tags.
    """
    window_count = len(poses) - n + 1
    windows = [tuple(poses[k:k + n]) for k in range(window_count)]
    return Counter(windows)
    
def prep_train_date_POS(data, n):
    """Vectorize POS-tag sequences into an n-gram count feature matrix.

    :param data: iterable of POS-tag sequences; each one is passed to
                 ``get_ngram_POS``.
    :param n: n-gram length forwarded to ``get_ngram_POS``.
    :return: the result of ``DictVectorizer().fit_transform`` on the
             per-sentence tag n-gram counters.
    """
    tag_ngram_counts = []
    for poses in data:
        tag_ngram_counts.append(get_ngram_POS(poses, n))
    return DictVectorizer().fit_transform(tag_ngram_counts)

# -- set time break
t0 = time(); print '3 - Building POS TAG Dataset ... ...'

dataset_3 = prep_train_date_POS(pos_sents, 2)
t_dataset_3 = dataset_3[:cut_num]
p_dataset_3 = dataset_3[cut_num:]

t_dataset_3_r = csr_matrix([dataset_3[i].toarray()[0] for i in index_of_resample])

print '3 - [DONE] ---------->', time()-t0, "s"

dataset_3, t_dataset_3_r, t_dataset_3, p_dataset_3

3 - Building POS TAG Dataset ... ...
3 - [DONE] ----------> 5.07329511642 s


(<93994x906 sparse matrix of type '<type 'numpy.float64'>'
 	with 1390294 stored elements in Compressed Sparse Row format>,
 <2540x906 sparse matrix of type '<type 'numpy.float64'>'
 	with 37185 stored elements in Compressed Sparse Row format>,
 <17000x906 sparse matrix of type '<type 'numpy.float64'>'
 	with 246710 stored elements in Compressed Sparse Row format>,
 <76994x906 sparse matrix of type '<type 'numpy.float64'>'
 	with 1143584 stored elements in Compressed Sparse Row format>)

### [4] Entities Types

Using each entity's OSM type as a feature to see if it helps to improve the model.

In [77]:
def prep_train_date_ET(data):
    """Vectorize ready-made per-sentence feature dicts.

    :param data: iterable of feature dicts; passed to the vectorizer
                 unchanged.
    :return: the result of ``DictVectorizer().fit_transform(data)``.
    """
    # No feature extraction step here: the dicts are already features.
    return DictVectorizer().fit_transform(data)

t1 = time(); print '4 - Building OSM Types Feature Matrix ... ...'
dataset_4 = prep_train_date_ET(type_sents)
t_dataset_4 = dataset_4[:cut_num]
p_dataset_4 = dataset_4[cut_num:]

t_dataset_4_r = csr_matrix([dataset_4[i].toarray()[0] for i in index_of_resample])

print '4 - [DONE] ---------->', time()-t1
dataset_4, t_dataset_4_r, t_dataset_4, p_dataset_4

4 - Building OSM Types Feature Matrix ... ...
4 - [DONE] ----------> 0.617810964584


(<93994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 114223 stored elements in Compressed Sparse Row format>,
 <2540x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 2670 stored elements in Compressed Sparse Row format>,
 <17000x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 16995 stored elements in Compressed Sparse Row format>,
 <76994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 97228 stored elements in Compressed Sparse Row format>)

### [F] Merge Features

In [15]:
# Column-wise concatenation of the BOW (1), POS-bigram (3) and entity-type
# (4) feature blocks. The same block order is used for every split so that
# column positions agree between training and prediction matrices.
# NOTE(review): the word-bigram block (dataset_2) is not part of the merge;
# confirm that this is intended.
t_dataset_r = hstack([t_dataset_1_r, t_dataset_3_r, t_dataset_4_r])
t_dataset = hstack([t_dataset_1, t_dataset_3, t_dataset_4])
p_dataset = hstack([p_dataset_1, p_dataset_3, p_dataset_4])
dataset = hstack([dataset_1, dataset_3, dataset_4])

dataset, t_dataset, t_dataset_r, p_dataset

(<93994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 2032893 stored elements in COOrdinate format>,
 <17000x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 358092 stored elements in COOrdinate format>,
 <2540x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 55476 stored elements in COOrdinate format>,
 <76994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 1674801 stored elements in COOrdinate format>)

### Reading entities sections

In [16]:
# Input Entity name to locate the entity section in the whole dataset
name_to_node = defaultdict()
for key in ent_with_info:
    try:
        name_to_node[ent_with_info[key]['tag']['name']] = key
    except:
        continue

print len(name_to_node.keys()), len(ent_with_info.keys())

3591 3643


## 3 - Model Validation and Classification

In [18]:
def check_results(predictions, classifications):
    lab = ['t', 'f']
    print "accuracy"
#     print accuracy_score(classifications, predictions)
    print classification_report(classifications, predictions, labels=lab)

def error_checking(real_predictions, labels, verbose):
    t_t = len([i for i in range(len(real_predictions)) if real_predictions[i] == 't' and labels[i] == 't'])
    f_f = len([i for i in range(len(real_predictions)) if real_predictions[i] == 'f' and labels[i] == 'f'])
    t_f = len([i for i in range(len(real_predictions)) if real_predictions[i] == 'f' and labels[i] == 't'])
    f_t = len([i for i in range(len(real_predictions)) if real_predictions[i] == 't' and labels[i] == 'f'])

    print '--Correct--'
    print 'True --> True:', t_t
    print 'False --> False:', f_f
    print ''
    print '--Incorrect--'
    print 'True --> False:', t_f
    print 'False --> True:', f_t
    print ''
    print '--> Accuracy:', round((t_t+f_f) / float(t_t+t_f+f_t+f_f) , 2)
    print '--> Precision:', round(t_t / float(t_t+f_t) , 2 )
    print '--> Recall:', round(t_t / float(t_t+t_f), 2 )
    print ''
    
    if len(verbose) == 2:
        test_case = [l for l in verbose]
        print "Sentence: ",test_case[0],"-->",test_case[1]
        print "-"*80
        for i, sent in enumerate(sents[:len(real_predictions)]):
            if real_predictions[i] == test_case[1] and labels[i] == test_case[0]:
                print '>', i, '\t|  ', sent

### 3.1 - CROSS VALIDATION

In [93]:
from sklearn.metrics import accuracy_score, classification_report

############################
# Classifiers from Sklearn #
############################
# # Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_nb = MultinomialNB()

# Support Vector Machine
from sklearn import svm
clf_svm = svm.LinearSVC(C=0.1)

# # --- Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier()

# # --- Random Foreset Classifier
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier()

# # --- Bagging Classifier
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier()

# K Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
clf_knc = KNeighborsClassifier(n_neighbors=3)

# --- Cross Validation
from sklearn.model_selection import cross_val_predict

test_clf = [clf_svm, clf_rfc, clf_knc]
for clf in test_clf:
    print 'CLASSIFIER -->', str(clf)
#     print '------- CROSS VALIDATION --------'
    crossval_predicted = cross_val_predict(clf, t_dataset_r, t_labels_resample, cv=10)
    check_results(crossval_predicted, t_labels_resample)
    print '-'*100
    print '\n'

CLASSIFIER --> LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
accuracy
             precision    recall  f1-score   support

          t       0.40      0.31      0.35      1016
          f       0.60      0.69      0.64      1524

avg / total       0.52      0.54      0.53      2540

----------------------------------------------------------------------------------------------------


CLASSIFIER --> RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
accuracy
             precision    recall  f1

### 3.2 - Prediction

In [29]:
# Fit the random-forest classifier on the resampled, labelled training
# matrix, then label the real-world (unlabelled) prediction matrix with it.
clf_rfc.fit(t_dataset_r, t_labels_resample)
real_predictions = clf_rfc.predict(p_dataset)

In [92]:
# Randomly select a entity and check the results
# If sentences are labelled as positve, they will be printed with staring symbol "[v]"; 
# if not, sentences the start symbols will be "[x]"

e_names = name_to_node.keys()
random.shuffle(e_names)
print "Query:\t\t", e_names[0], "\nEntityId:\t", name_to_node[e_names[0]]
chosen_node = name_to_node[e_names[0]]
print ''

p_content = content[cut_num:]

for i in range(len(p_sents)):
    cnt = 0
    if p_content[i][3] == chosen_node:
        cnt += 1
        if real_predictions[i] == 'f':
            print '[x]', i+cut_num, "||",  p_sents[i]
        elif real_predictions[i] == 't':
            print '\n', '~' * 80
            print '[v]', i+cut_num, "||",  p_sents[i]
            print '~' * 80
            print ''

if cnt == 0:
    print "[error] the query is not the p rediction section"

Query:		Mount Martha 
EntityId:	node_2607419375

[error] the query is not the p rediction section
