# Text Classification and Machine Learning
*Research Project 2017 - Boya Chen*

This section covers dataset preparation, feature engineering, classifier building (with cross-validation) and prediction on the unlabelled data.

In [1]:
import json
import random
from time import time
from scipy.sparse import csr_matrix

## 1 - Reading CSV and prepare data

[OUTPUT] labels, t_labels, t_sents, t_pos_sents, p_sents, p_pos_sents

In [10]:
import csv
import numpy as np
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, defaultdict
from textblob import TextBlob as tb
from time import time
from scipy.sparse import hstack

# GEO for Geolocation related classes
# APP for Appearance related classes
GEO = 0
APP = 1

TOPIC = GEO

##########################################
#  Prepare training and prediction data  #
##########################################
t1 = time(); print '[Preparin Dataset]: ... ...'

content = []
with open('./dataset.txt', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        content.append(row)

# Acquire the list of labels
labels = []
for item in content:
    labels.append(item[TOPIC])

# Acquire raw sentences
sents = []
for item in content:
    sents.append(item[2])
    
# Read Pre-processed and Cleaned Sents
cleaned_sents = []
with open("./cleaned_sents.csv", "rb") as f:
    csvReader = csv.reader(f)
    for row in csvReader:
        cleaned_sents.append(row)

# Acquire ent for each sentences
with open("./entities_set.json", "rb") as f:
    ent_with_info = json.load(f)    
ent_sents = []
for item in content:
    ent_sents.append(item[3])
type_sents = []
for item in ent_sents:
    try:
        type_feat = defaultdict()
        for t in ent_with_info[item]['ent_type']:
            type_feat[t] = 1
        type_sents.append(type_feat)
    except:
        type_sents.append({})

# Acquire POS tags
pos_sents = []
with open('./training_pos_tag.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        pos_sents.append(row)

# Splitting the data
cut_num = 17000 # | 0~cut_num --> Training; cut_num~end --> Prediction(unlabeled)
t_labels = np.array(labels[:cut_num])
t_sents = np.array(sents[:cut_num])
t_pos_sents = np.array(pos_sents[:cut_num])

# Rest of sentences are for prediction
p_sents = np.array(sents[cut_num:])
p_pos_sents = np.array(pos_sents[cut_num:])

print '[Preparation Completed]:', time()-t1, "s"

[Preparin Dataset]: ... ...
[Preparation Completed]: 2.63643598557 s


### !------- Imbalance Classification Handling ---------!

In [11]:
index_of_t = [l for l in range(len(t_labels)) if t_labels[l] == "t"]
index_of_f = [l for l in range(len(t_labels)) if t_labels[l] == "f"]
random.shuffle(index_of_f)
# Resample from the labeled data
index_of_f = index_of_f[:len(index_of_t) * 2]
index_of_resample = list(sorted(index_of_t + index_of_f))
t_labels_resample = [t_labels[i] for i in range(len(t_labels)) if i in index_of_resample]

print "size of positive documents:", len(index_of_t)
print "size of negative documents:", len(index_of_f)
print "size of training dataset:", len(t_labels_resample)

size of positive documents: 1016
size of negative documents: 2032
size of training dataset: 3048


## 2 - Feature Engineering

###  [1] BAG OF WORDS - vectorization

In [6]:
# sents_with_t = [cleaned_sents[l] for l in range(len(cleaned_sents)) if labels[l] == 't']
# words_with_t = []
# for sent in sents_with_t:
#     words_with_t.extend(sent)
# words_freq = Counter()
# for word in words_with_t:
#     words_freq[word] += 1

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def get_BOW(text):
    """Count how often each token occurs in ``text``.

    Args:
        text: iterable of hashable tokens (e.g. a list of words).

    Returns:
        A plain dict mapping each distinct token to its occurrence count.
    """
    counts = {}
    for token in text:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def prep_train_date(data):
    """Vectorize tokenized sentences into a sparse bag-of-words count matrix.

    Args:
        data: iterable of token sequences, one per sentence.

    Returns:
        The matrix produced by ``DictVectorizer.fit_transform`` on the
        per-sentence word counts (one row per sentence).
    """
    word_counts = [get_BOW(tokens) for tokens in data]
    # TF-IDF re-weighting (TfidfTransformer) is currently not applied; the
    # returned values are raw counts.
    return DictVectorizer().fit_transform(word_counts)

print "[Building Dataset] ... ..."
t1 = time()
dataset_1 = prep_train_date(cleaned_sents)
t_dataset_1 = dataset_1[:len(t_labels)]
p_dataset_1 = dataset_1[len(t_labels):]

t_dataset_1_r = csr_matrix([dataset_1[i].toarray()[0] for i in index_of_resample])

print "[Building Accomplished] ---", time()-t1, 's'

t_dataset_1, t_dataset_1_r, p_dataset_1

[Building Dataset] ... ...
[Building Accomplished] --- 8.38937711716 s


(<17000x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 194240 stored elements in Compressed Sparse Row format>,
 <3048x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 36772 stored elements in Compressed Sparse Row format>,
 <76994x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 901587 stored elements in Compressed Sparse Row format>,
 <93994x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 1095827 stored elements in Compressed Sparse Row format>)

### [2]  Bag of Word - N-gram + vectorization

In [13]:
################################################
#  Preparing Training Data with Vectorization  #
################################################
def get_bag_of_ngram(text, n):
    """Count the contiguous n-grams of a token sequence.

    Args:
        text: sliceable sequence of tokens.
        n: n-gram length.

    Returns:
        A Counter keyed by n-gram tuples. It is empty when ``text`` is
        shorter than ``n``.
    """
    last_start = len(text) - (n - 1)
    return Counter(tuple(text[start:start + n]) for start in range(last_start))

def prep_train_date(data):
    """Vectorize tokenized sentences into a sparse matrix of bigram counts.

    Args:
        data: iterable of token sequences, one per sentence.

    Returns:
        The matrix produced by ``DictVectorizer.fit_transform`` on the
        per-sentence bigram counts (one row per sentence).
    """
    bigram_counts = [get_bag_of_ngram(tokens, 2) for tokens in data]
    return DictVectorizer().fit_transform(bigram_counts)

t0 = time()
# -- Call the Function
dataset_2 = prep_train_date(cleaned_sents)

# Split the dataset into TRAINING and PREDICTION datasets
t_dataset_2 = dataset_2[:len(t_labels)]
p_dataset_2 = dataset_2[len(t_labels):]

t_dataset_2_r = csr_matrix([dataset_2[i].toarray()[0] for i in index_of_resample])

print "-"*50
print "TIME:", time()-t0, "s"

t_dataset_2, t_dataset_2_r, p_dataset_2, len(t_labels)

--------------------------------------------------
TIME: 85.5692539215 s


(<17000x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 189809 stored elements in Compressed Sparse Row format>,
 <3048x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 36301 stored elements in Compressed Sparse Row format>,
 <76994x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 883730 stored elements in Compressed Sparse Row format>,
 17000)

### [3] BAG OF POS TAGS - unigram | vectorization 

In [14]:
# Acquire Bag of Pos-tags
def get_ngram_POS(poses, n):
    """Count the contiguous n-grams of a POS-tag sequence.

    Args:
        poses: sliceable sequence of POS tags for one sentence.
        n: n-gram length.

    Returns:
        A Counter keyed by tuples of ``n`` consecutive tags.
    """
    tag_ngrams = Counter()
    window_starts = range(len(poses) - n + 1)
    tag_ngrams.update(tuple(poses[start:start + n]) for start in window_starts)
    return tag_ngrams
    
def prep_train_date(data):
    """Vectorize POS-tag sequences into a sparse matrix of POS unigram counts.

    Args:
        data: iterable of POS-tag sequences, one per sentence.

    Returns:
        The matrix produced by ``DictVectorizer.fit_transform`` on the
        per-sentence tag counts (one row per sentence).
    """
    ngram_size = 1
    tag_counts = [get_ngram_POS(tag_seq, ngram_size) for tag_seq in data]
    # TF-IDF re-weighting (TfidfTransformer) is currently not applied; the
    # returned values are raw counts.
    return DictVectorizer().fit_transform(tag_counts)

# -- set time break
t1 = time()
print 'Building Dataset ... ...'

# Dataset
dataset_3 = prep_train_date(pos_sents)
# Splitting Dataset
t_dataset_3 = dataset_3[:cut_num]
p_dataset_3 = dataset_3[cut_num:]

t_dataset_3_r = csr_matrix([dataset_3[i].toarray()[0] for i in index_of_resample])

print 'done --', time()-t1, 's'

dataset_3, t_dataset_3_r, t_dataset_3, p_dataset_3

Building Dataset ... ...
done -- 4.88035583496 s


(<93994x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 822843 stored elements in Compressed Sparse Row format>,
 <3048x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 26895 stored elements in Compressed Sparse Row format>,
 <17000x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 146857 stored elements in Compressed Sparse Row format>,
 <76994x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 675986 stored elements in Compressed Sparse Row format>)

### [4] Entities Types

In [15]:
def prep_train_date(data):
    """Vectorize per-sentence feature dicts into a sparse matrix.

    Args:
        data: list of ``{feature_name: value}`` mappings, one per sentence
            (called below with the entity-type dicts in ``type_sents``).

    Returns:
        The matrix produced by ``DictVectorizer.fit_transform`` (one row per
        sentence, one column per distinct feature name).
    """
    # The dicts are already in the form DictVectorizer expects, so no
    # per-sentence feature extraction is needed here. TF-IDF re-weighting
    # (TfidfTransformer) is currently not applied.
    return DictVectorizer().fit_transform(data)

dataset_4 = prep_train_date(type_sents)
# Rows before cut_num are labelled, the rest are unlabelled.
t_dataset_4 = dataset_4[:cut_num]
p_dataset_4 = dataset_4[cut_num:]

# Select the resampled rows directly on the sparse matrix instead of
# densifying each row and rebuilding a CSR matrix from the dense arrays.
t_dataset_4_r = dataset_4.tocsr()[index_of_resample]

dataset_4, t_dataset_4_r, t_dataset_4, p_dataset_4

(<93994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 114223 stored elements in Compressed Sparse Row format>,
 <3048x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 3207 stored elements in Compressed Sparse Row format>,
 <17000x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 16995 stored elements in Compressed Sparse Row format>,
 <76994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 97228 stored elements in Compressed Sparse Row format>)

### [F] Merge Features

In [16]:
# Concatenate the feature groups column-wise: [1] bag of words, [3] POS tags
# and [4] entity types. The bigram features (dataset_2) are not merged in.
dataset = hstack([dataset_1, dataset_3, dataset_4])          # all sentences
t_dataset = hstack([t_dataset_1, t_dataset_3, t_dataset_4])  # labelled sentences
t_dataset_r = hstack([t_dataset_1_r, t_dataset_3_r, t_dataset_4_r])  # resampled training set
p_dataset = hstack([p_dataset_1, p_dataset_3, p_dataset_4])  # unlabelled sentences

dataset, t_dataset, t_dataset_r, p_dataset

(<93994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 2032893 stored elements in COOrdinate format>,
 <17000x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 358092 stored elements in COOrdinate format>,
 <3048x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 66874 stored elements in COOrdinate format>,
 <76994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 1674801 stored elements in COOrdinate format>)

### Reading entities sections

In [17]:
# Input Entity name to locate the entity section in the whole dataset
name_to_node = defaultdict()
for key in ent_with_info:
    try:
        name_to_node[ent_with_info[key]['tag']['name']] = key
    except:
        continue

print len(name_to_node.keys()), len(ent_with_info.keys())

3591 3643


In [214]:
# from collections import defaultdict
# from osmapi import OsmApi
# MyApi = OsmApi()

# ent_with_info = defaultdict()
# flag = 0
# t1 = time()
# print '[Reading OSM info] ... ...'
# for nodeName in node_list:
#     try:
#         if 'node' in nodeName:
#             nodeId = nodeName[5:]
#             ent_with_info[nodeName] = MyApi.NodeGet(nodeId)
#         else:
#             nodeId = nodeName[4:]
#             ent_with_info[nodeName] = MyApi.WayGet(nodeId)
#     except:
#         print '*Problem At:', nodeName
#         ent_with_info[nodeName] = {}
#     flag += 1
#     if flag%40 == 0:
#         print str(round(float(flag)/len(node_list)*100, 2))+'%', "- loaded"

# for key in ent_with_info:
#     try:
#         ent_with_info[key]['timestamp'] = str(ent_with_info[key]['timestamp'])
#     except:
#         continue
        
# with open("./entities_set.json", "wb") as output:
#     json.dump(ent_with_info, output)

# print '[Completed]', time()-t1, 's'

## 3 - Classification and machine learning

In [18]:
def check_results(predictions, classifications):
    lab = ['t', 'f']
    print "accuracy"
#     print accuracy_score(classifications, predictions)
    print classification_report(classifications, predictions, labels=lab)

def error_checking(real_predictions, labels, verbose):
    t_t = len([i for i in range(len(real_predictions)) if real_predictions[i] == 't' and labels[i] == 't'])
    f_f = len([i for i in range(len(real_predictions)) if real_predictions[i] == 'f' and labels[i] == 'f'])
    t_f = len([i for i in range(len(real_predictions)) if real_predictions[i] == 'f' and labels[i] == 't'])
    f_t = len([i for i in range(len(real_predictions)) if real_predictions[i] == 't' and labels[i] == 'f'])

    print '--Correct--'
    print 'True --> True:', t_t
    print 'False --> False:', f_f
    print ''
    print '--Incorrect--'
    print 'True --> False:', t_f
    print 'False --> True:', f_t
    print ''
    print '--> Accuracy:', round((t_t+f_f) / float(t_t+t_f+f_t+f_f) , 2)
    print '--> Precision:', round(t_t / float(t_t+f_t) , 2 )
    print '--> Recall:', round(t_t / float(t_t+t_f), 2 )
    print ''
    
    if len(verbose) == 2:
        test_case = [l for l in verbose]
        print "Sentence: ",test_case[0],"-->",test_case[1]
        print "-"*80
        for i, sent in enumerate(sents[:len(real_predictions)]):
            if real_predictions[i] == test_case[1] and labels[i] == test_case[0]:
                print '>', i, '\t|  ', sent

### 3.1 - CROSS VALIDATION

In [121]:
from sklearn.metrics import accuracy_score, classification_report

############################
# Classifiers from Sklearn #
############################
# # Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_nb = MultinomialNB()

# Support Vector Machine
from sklearn import svm
clf_svm = svm.LinearSVC(C=0.1)

# # --- Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier()

# # --- Random Foreset Classifier
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier()

# # --- Bagging Classifier
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier()

# K Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
clf_knc = KNeighborsClassifier(n_neighbors=3)

# --- Cross Validation
from sklearn.model_selection import cross_val_predict

test_clf = [clf_svm, clf_rfc, clf_knc]
for clf in test_clf:
    print 'CLASSIFIER -->', str(clf)
#     print '------- CROSS VALIDATION --------'
    crossval_predicted = cross_val_predict(clf, t_dataset_r, t_labels_resample, cv=10)
    check_results(crossval_predicted, t_labels_resample)
    print '-'*100
    print '\n'

CLASSIFIER --> LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
accuracy
             precision    recall  f1-score   support

          t       0.85      0.88      0.86      1016
          f       0.94      0.92      0.93      2032

avg / total       0.91      0.91      0.91      3048

----------------------------------------------------------------------------------------------------


CLASSIFIER --> RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
accuracy
             precision    recall  f1

### 3-2 Predict whole dataset

In [122]:
# Sanity check: (rows, columns) of the merged feature matrix for the unlabelled sentences.
p_dataset.shape

(76994, 60207)

In [171]:
# Training Classifier
clf_is_fit = 0
if clf_is_fit == 1:
    print "#"*10, "PREDICTION SUMMARY", "#"*10
    real_predictions = clf_rfc.fit(t_dataset_r, t_labels_resample).predict(p_dataset)

for i, l in enumerate(real_predictions):
    if l == "t":
        print ">", i+cut_num, "|", sents[i+cut_num]
# check_results(real_predictions, t_labels)    

# # print "Size of Dataset \t\t|", t_dataset_1.shape[0]
# # print ''

# # Print all the entity's sentences
# print len(real_predictions), len(t_labels)
# error_checking(real_predictions, t_labels, '')

## Real World Prediction

In [16]:
# This phase fits the random forest on the resampled labelled data and then
# predicts labels for the real-world (unlabelled) data.
clf_rfc.fit(t_dataset_r, t_labels_resample)
real_predictions = clf_rfc.predict(p_dataset)

In [41]:
# p_content = content[cut_num:]
# for i in range(len(p_sents)): 
#     if real_predictions[i] == 't':
#         print i,

In [172]:
e_names = name_to_node.keys()
random.shuffle(e_names)
print "Query:\t\t", e_names[0], "\nEntityId:\t", name_to_node[e_names[0]]
chosen_node = name_to_node[e_names[0]]
print ''

p_content = content[cut_num:]

for i in range(len(p_sents)):
    cnt = 0
    if p_content[i][3] == chosen_node:
        cnt += 1
        if real_predictions[i] == 'f':
            print '[x]', i+cut_num, "||",  p_sents[i]
        elif real_predictions[i] == 't':
            print '\n', '~' * 80
            print '[v]', i+cut_num, "||",  p_sents[i]
            print '~' * 80
            print ''

if cnt == 0:
    print "[error] the query is not the p rediction section"