# Text Classification and Machine Learning
*Research Project 2017 - Boya Chen*

This section covers dataset preparation, feature engineering, classifier training, and prediction on the unlabelled data.

In [1]:
import json
import random
from time import time
from scipy.sparse import csr_matrix

## 1 - Reading CSV and prepare data

[OUTPUT] labels, t_labels, t_sents, t_pos_sents, p_sents, p_pos_sents

In [10]:
import csv
import numpy as np
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, defaultdict
from textblob import TextBlob as tb
from time import time
from scipy.sparse import hstack

# GEO for Geolocation related classes
# APP for Appearance related classes
GEO = 0
APP = 1

TOPIC = GEO

##########################################
#  Prepare training and prediction data  #
##########################################
t1 = time(); print '[Preparin Dataset]: ... ...'

content = []
with open('./dataset.txt', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        content.append(row)

# Acquire the list of labels
labels = []
for item in content:
    labels.append(item[TOPIC])

# Acquire raw sentences
sents = []
for item in content:
    sents.append(item[2])
    
# Read Pre-processed and Cleaned Sents
cleaned_sents = []
with open("./cleaned_sents.csv", "rb") as f:
    csvReader = csv.reader(f)
    for row in csvReader:
        cleaned_sents.append(row)

# Acquire ent for each sentences
with open("./entities_set.json", "rb") as f:
    ent_with_info = json.load(f)    
ent_sents = []
for item in content:
    ent_sents.append(item[3])
type_sents = []
for item in ent_sents:
    try:
        type_feat = defaultdict()
        for t in ent_with_info[item]['ent_type']:
            type_feat[t] = 1
        type_sents.append(type_feat)
    except:
        type_sents.append({})

# Acquire POS tags
pos_sents = []
with open('./training_pos_tag.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        pos_sents.append(row)

# Splitting the data
cut_num = 17000 # | 0~cut_num --> Training; cut_num~end --> Prediction(unlabeled)
t_labels = np.array(labels[:cut_num])
t_sents = np.array(sents[:cut_num])
t_pos_sents = np.array(pos_sents[:cut_num])

# Rest of sentences are for prediction
p_sents = np.array(sents[cut_num:])
p_pos_sents = np.array(pos_sents[cut_num:])

print '[Preparation Completed]:', time()-t1, "s"

[Preparin Dataset]: ... ...
[Preparation Completed]: 2.63643598557 s


### !------- Imbalanced Classification Handling ---------!

In [11]:
index_of_t = [l for l in range(len(t_labels)) if t_labels[l] == "t"]
index_of_f = [l for l in range(len(t_labels)) if t_labels[l] == "f"]
random.shuffle(index_of_f)
# Resample from the labeled data
index_of_f = index_of_f[:len(index_of_t) * 2]
index_of_resample = list(sorted(index_of_t + index_of_f))
t_labels_resample = [t_labels[i] for i in range(len(t_labels)) if i in index_of_resample]

print "size of positive documents:", len(index_of_t)
print "size of negative documents:", len(index_of_f)
print "size of training dataset:", len(t_labels_resample)

size of positive documents: 1016
size of negative documents: 2032
size of training dataset: 3048


## 2 - Feature Engineering

###  [1] BAG OF WORDS - vectorization

In [6]:
# sents_with_t = [cleaned_sents[l] for l in range(len(cleaned_sents)) if labels[l] == 't']
# words_with_t = []
# for sent in sents_with_t:
#     words_with_t.extend(sent)
# words_freq = Counter()
# for word in words_with_t:
#     words_freq[word] += 1

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def get_BOW(text):
    """Count how often each item occurs in ``text``.

    :param text: iterable of hashable tokens (e.g. a list of words)
    :return: plain dict mapping each distinct token to its occurrence count
    """
    counts = {}
    for token in text:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def prep_train_date(data):
    """Vectorize tokenized sentences as bag-of-words counts.

    Every element of ``data`` is converted to a {word: count} dict with
    ``get_BOW``; the dicts are then encoded by a newly fitted
    ``DictVectorizer``.

    :param data: iterable of token sequences, one per sentence
    :return: the matrix produced by ``DictVectorizer.fit_transform``
    """
    bags = []
    for line in data:
        bags.append(get_BOW(line))

    # TF-IDF re-weighting (TfidfTransformer(smooth_idf=True)) is currently
    # disabled; the raw counts are returned.
    encoder = DictVectorizer()
    return encoder.fit_transform(bags)

print "[Building Dataset] ... ..."
t1 = time()
dataset_1 = prep_train_date(cleaned_sents)
t_dataset_1 = dataset_1[:len(t_labels)]
p_dataset_1 = dataset_1[len(t_labels):]

t_dataset_1_r = csr_matrix([dataset_1[i].toarray()[0] for i in index_of_resample])

print "[Building Accomplished] ---", time()-t1, 's'

t_dataset_1, t_dataset_1_r, p_dataset_1

[Building Dataset] ... ...
[Building Accomplished] --- 8.38937711716 s


(<17000x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 194240 stored elements in Compressed Sparse Row format>,
 <3048x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 36772 stored elements in Compressed Sparse Row format>,
 <76994x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 901587 stored elements in Compressed Sparse Row format>,
 <93994x60147 sparse matrix of type '<type 'numpy.float64'>'
 	with 1095827 stored elements in Compressed Sparse Row format>)

### [2]  Bag of Words - N-gram + vectorization

In [13]:
################################################
#  Preparing Training Data with Vectorization  #
################################################
def get_bag_of_ngram(text, n):
    """Count the contiguous n-grams of ``text``.

    :param text: sliceable sequence of tokens
    :param n: n-gram length
    :return: Counter keyed by n-gram tuples; empty when ``text`` is shorter
             than ``n``
    """
    window_count = len(text) - n + 1
    return Counter(tuple(text[start:start + n]) for start in range(window_count))

def prep_train_date(data):
    """Vectorize tokenized sentences as word-bigram counts.

    :param data: iterable of token sequences, one per sentence
    :return: the matrix produced by fitting a ``DictVectorizer`` on the
             per-sentence bigram counters from ``get_bag_of_ngram(sent, 2)``
    """
    bigram_bags = []
    for sent in data:
        bigram_bags.append(get_bag_of_ngram(sent, 2))
    return DictVectorizer().fit_transform(bigram_bags)

t0 = time()
# -- Call the Function
dataset_2 = prep_train_date(cleaned_sents)

# Split the dataset into TRAINING and PREDICTION datasets
t_dataset_2 = dataset_2[:len(t_labels)]
p_dataset_2 = dataset_2[len(t_labels):]

t_dataset_2_r = csr_matrix([dataset_2[i].toarray()[0] for i in index_of_resample])

print "-"*50
print "TIME:", time()-t0, "s"

t_dataset_2, t_dataset_2_r, p_dataset_2, len(t_labels)

--------------------------------------------------
TIME: 85.5692539215 s


(<17000x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 189809 stored elements in Compressed Sparse Row format>,
 <3048x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 36301 stored elements in Compressed Sparse Row format>,
 <76994x656284 sparse matrix of type '<type 'numpy.float64'>'
 	with 883730 stored elements in Compressed Sparse Row format>,
 17000)

### [3] BAG OF POS TAGS - n-gram (unigram as configured, n=1) | vectorization

In [14]:
# Acquire Bag of Pos-tags
def get_ngram_POS(poses, n):
    """Count the contiguous n-grams of a POS-tag sequence.

    :param poses: sliceable sequence of POS tags for one sentence
    :param n: n-gram length
    :return: Counter keyed by tag tuples of length ``n``
    """
    tag_counts = Counter()
    last_start = len(poses) - (n - 1)
    start = 0
    while start < last_start:
        window = tuple(poses[start:start + n])
        tag_counts[window] = tag_counts[window] + 1
        start += 1
    return tag_counts
    
def prep_train_date(data):
    """Vectorize per-sentence POS-tag sequences as tag counts.

    Each tag sequence is counted with ``get_ngram_POS(poses, 1)`` (single
    tags), and the counters are encoded by a newly fitted ``DictVectorizer``.

    :param data: iterable of POS-tag sequences, one per sentence
    :return: the matrix produced by ``DictVectorizer.fit_transform``
    """
    tag_bags = [get_ngram_POS(tags, 1) for tags in data]

    # TF-IDF re-weighting (TfidfTransformer(smooth_idf=False)) is currently
    # disabled; the raw counts are returned.
    encoder = DictVectorizer()
    encoded = encoder.fit_transform(tag_bags)
    return encoded

# -- set time break
t1 = time()
print 'Building Dataset ... ...'

# Dataset
dataset_3 = prep_train_date(pos_sents)
# Splitting Dataset
t_dataset_3 = dataset_3[:cut_num]
p_dataset_3 = dataset_3[cut_num:]

t_dataset_3_r = csr_matrix([dataset_3[i].toarray()[0] for i in index_of_resample])

print 'done --', time()-t1, 's'

dataset_3, t_dataset_3_r, t_dataset_3, p_dataset_3

Building Dataset ... ...
done -- 4.88035583496 s


(<93994x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 822843 stored elements in Compressed Sparse Row format>,
 <3048x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 26895 stored elements in Compressed Sparse Row format>,
 <17000x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 146857 stored elements in Compressed Sparse Row format>,
 <76994x35 sparse matrix of type '<type 'numpy.float64'>'
 	with 675986 stored elements in Compressed Sparse Row format>)

### [4] Entities Types

In [15]:
def prep_train_date(data):
    """Vectorize ready-made feature dicts.

    Unlike the other ``prep_train_date`` variants in this notebook, ``data``
    is passed to the vectorizer unchanged, so it must already be a sequence
    of {feature_name: value} mappings (here: one entity-type dict per
    sentence).

    :param data: sequence of feature dicts, one per sentence
    :return: the matrix produced by ``DictVectorizer.fit_transform``
    """
    # TF-IDF re-weighting (TfidfTransformer(smooth_idf=False)) is currently
    # disabled; the raw values are returned.
    return DictVectorizer().fit_transform(data)

dataset_4 = prep_train_date(type_sents)
# Rows before cut_num are labelled, the rest unlabelled
t_dataset_4 = dataset_4[:cut_num]
p_dataset_4 = dataset_4[cut_num:]

# Select the resampled rows by sparse row indexing instead of densifying
# every selected row and re-sparsifying the result.
t_dataset_4_r = csr_matrix(dataset_4[index_of_resample])

# Notebook display of the resulting matrices
dataset_4, t_dataset_4_r, t_dataset_4, p_dataset_4

(<93994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 114223 stored elements in Compressed Sparse Row format>,
 <3048x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 3207 stored elements in Compressed Sparse Row format>,
 <17000x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 16995 stored elements in Compressed Sparse Row format>,
 <76994x25 sparse matrix of type '<type 'numpy.float64'>'
 	with 97228 stored elements in Compressed Sparse Row format>)

### [F] Merge Features

In [16]:
# Concatenate the feature groups column-wise: bag of words (1), POS tags (3)
# and entity types (4). The word-bigram matrices (dataset_2) are not merged.
dataset = hstack([dataset_1, dataset_3, dataset_4])          # every sentence
t_dataset = hstack([t_dataset_1, t_dataset_3, t_dataset_4])  # labelled rows
t_dataset_r = hstack([t_dataset_1_r, t_dataset_3_r, t_dataset_4_r])  # resampled labelled rows
p_dataset = hstack([p_dataset_1, p_dataset_3, p_dataset_4])  # unlabelled rows

# Notebook display of the merged matrices
dataset, t_dataset, t_dataset_r, p_dataset

(<93994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 2032893 stored elements in COOrdinate format>,
 <17000x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 358092 stored elements in COOrdinate format>,
 <3048x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 66874 stored elements in COOrdinate format>,
 <76994x60207 sparse matrix of type '<type 'numpy.float64'>'
 	with 1674801 stored elements in COOrdinate format>)

### Reading entities sections

In [17]:
# Input Entity name to locate the entity section in the whole dataset
name_to_node = defaultdict()
for key in ent_with_info:
    try:
        name_to_node[ent_with_info[key]['tag']['name']] = key
    except:
        continue

print len(name_to_node.keys()), len(ent_with_info.keys())

3591 3643


In [214]:
# from collections import defaultdict
# from osmapi import OsmApi
# MyApi = OsmApi()

# ent_with_info = defaultdict()
# flag = 0
# t1 = time()
# print '[Reading OSM info] ... ...'
# for nodeName in node_list:
#     try:
#         if 'node' in nodeName:
#             nodeId = nodeName[5:]
#             ent_with_info[nodeName] = MyApi.NodeGet(nodeId)
#         else:
#             nodeId = nodeName[4:]
#             ent_with_info[nodeName] = MyApi.WayGet(nodeId)
#     except:
#         print '*Problem At:', nodeName
#         ent_with_info[nodeName] = {}
#     flag += 1
#     if flag%40 == 0:
#         print str(round(float(flag)/len(node_list)*100, 2))+'%', "- loaded"

# for key in ent_with_info:
#     try:
#         ent_with_info[key]['timestamp'] = str(ent_with_info[key]['timestamp'])
#     except:
#         continue
        
# with open("./entities_set.json", "wb") as output:
#     json.dump(ent_with_info, output)

# print '[Completed]', time()-t1, 's'

## 3 - Classification and machine learning

In [18]:
def check_results(predictions, classifications):
    lab = ['t', 'f']
    print "accuracy"
#     print accuracy_score(classifications, predictions)
    print classification_report(classifications, predictions, labels=lab)

def error_checking(real_predictions, labels, verbose):
    """Print confusion counts, accuracy, precision and recall for 't'/'f' labels.

    Pairs that are not ('t' or 'f') on both sides are ignored. A ratio whose
    denominator is zero (e.g. precision when nothing was predicted 't', or
    any metric on empty input) is reported as 0.0 instead of raising
    ZeroDivisionError.

    :param real_predictions: predicted labels
    :param labels: true labels, aligned with ``real_predictions``
    :param verbose: two-item sequence ``(true_label, predicted_label)``; when
        given, every sentence with that combination is listed. The sentences
        are read from the module-level ``sents`` list, so index ``i`` of the
        predictions must correspond to ``sents[i]``. Pass an empty string to
        skip the listing.
    :return: None (everything is printed)
    """
    # Single pass over the aligned pairs instead of one scan per counter.
    t_t = f_f = t_f = f_t = 0
    for predicted, actual in zip(real_predictions, labels):
        if predicted == 't' and actual == 't':
            t_t += 1
        elif predicted == 'f' and actual == 'f':
            f_f += 1
        elif predicted == 'f' and actual == 't':
            t_f += 1
        elif predicted == 't' and actual == 'f':
            f_t += 1

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing the report.
        if denominator == 0:
            return 0.0
        return round(numerator / float(denominator), 2)

    # Single-argument print(...) produces the same text under Python 2's
    # print statement and Python 3's print function.
    print('--Correct--')
    print('True --> True: %s' % t_t)
    print('False --> False: %s' % f_f)
    print('')
    print('--Incorrect--')
    print('True --> False: %s' % t_f)
    print('False --> True: %s' % f_t)
    print('')
    print('--> Accuracy: %s' % _ratio(t_t + f_f, t_t + t_f + f_t + f_f))
    print('--> Precision: %s' % _ratio(t_t, t_t + f_t))
    print('--> Recall: %s' % _ratio(t_t, t_t + t_f))
    print('')

    if len(verbose) == 2:
        test_case = list(verbose)
        print("Sentence:  %s --> %s" % (test_case[0], test_case[1]))
        print("-" * 80)
        for i, sent in enumerate(sents[:len(real_predictions)]):
            if real_predictions[i] == test_case[1] and labels[i] == test_case[0]:
                print('> %s \t|   %s' % (i, sent))

### 3.1 - CROSS VALIDATION

In [121]:
from sklearn.metrics import accuracy_score, classification_report

############################
# Classifiers from Sklearn #
############################
# # Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_nb = MultinomialNB()

# Support Vector Machine
from sklearn import svm
clf_svm = svm.LinearSVC(C=0.1)

# # --- Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier()

# # --- Random Foreset Classifier
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier()

# # --- Bagging Classifier
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier()

# K Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
clf_knc = KNeighborsClassifier(n_neighbors=3)

# --- Cross Validation
from sklearn.model_selection import cross_val_predict

test_clf = [clf_svm, clf_rfc, clf_knc]
for clf in test_clf:
    print 'CLASSIFIER -->', str(clf)
#     print '------- CROSS VALIDATION --------'
    crossval_predicted = cross_val_predict(clf, t_dataset_r, t_labels_resample, cv=10)
    check_results(crossval_predicted, t_labels_resample)
    print '-'*100
    print '\n'

CLASSIFIER --> LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
accuracy
             precision    recall  f1-score   support

          t       0.85      0.88      0.86      1016
          f       0.94      0.92      0.93      2032

avg / total       0.91      0.91      0.91      3048

----------------------------------------------------------------------------------------------------


CLASSIFIER --> RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
accuracy
             precision    recall  f1

### 3-2 Predict whole dataset

In [122]:
# Notebook display: shape of the merged feature matrix of the prediction rows.
p_dataset.shape

(76994, 60207)

In [25]:
# Training Classifier
clf_is_fit = 0
if clf_is_fit == 1:
    print "#"*10, "PREDICTION SUMMARY", "#"*10
    real_predictions = clf_rfc.fit(t_dataset_r, t_labels_resample).predict(p_dataset)

for i, l in enumerate(real_predictions):
    if l == "t":
        print ">", i+cut_num, "|", sents[i+cut_num]
# check_results(real_predictions, t_labels)    

# # print "Size of Dataset \t\t|", t_dataset_1.shape[0]
# # print ''

# # Print all the entity's sentences
# print len(real_predictions), len(t_labels)
# error_checking(real_predictions, t_labels, '')

> 17004 | Sea Life Melbourne Aquarium is home to the first-ever in-vitro fertilised (IVF) shark in existence.
> 17017 | Mont Albert North is a suburb of Melbourne, Victoria, Australia, 13 km east of Melbourne's Central Business District.
> 17018 | Its local government area is the City of Whitehorse.
> 17020 | The northern border or the suburb is the Eastern Freeway and the southern border is Kenmare Street, while Elgar Road constitutes the boundary in the east.
> 17031 | Altona Meadows is a suburb of Melbourne, Victoria, Australia, 17 km south-west from Melbourne's central business district.
> 17032 | Its local government area is the City of Hobsons Bay.
> 17041 | === Queen of Peace Parish Primary School === Queen of Peace Parish Primary School is a Catholic co-educational primary school.
> 17125 | === Parkville campus === The Parkville campus is situated in the Melbourne suburb of Parkville, around 2 km north of the Melbourne CBD on Royal Parade.
> 17136 | It is situated on a 55-hecta

> 20649 | The City of Mount Isa is a local government area in north west Queensland.
> 20652 | The largest industry in the City is the Mount Isa Mines, a source of lead, copper, silver and zinc.
> 20661 | Northbourne Avenue is a major road in Canberra, Australia.
> 20662 | It extends from City Hill in the south to the Federal Highway in the north.
> 20674 | Stockland The Pines is a sub regional shopping centre located in the north-eastern suburb of Doncaster East in the city of Melbourne, Australia.
> 20675 | The centre is approximately 19 km east of the Melbourne CBD and is situated on the corner of Blackburn and Reynolds Roads.
> 20702 | Deloraine High School is a co-educational public high school in Deloraine, Tasmania, Australia.
> 20714 | Frankston is an outer-suburb of Melbourne in Victoria, Australia, in the local government area of the City of Frankston.
> 20715 | It is located 41 km south-east of the Melbourne city centre, at the northernmost point of the Mornington Peninsula.

> 30419 | Albany Regional Prison is a maximum security prison located 8 km West of Albany, Western Australia, Australia.
> 30433 | In its upper reaches, the Wingecarribee River forms the Wingecarribee Swamp, the only substantial peat bog in New South Wales.
> 30444 | The Anzac Highway is an 8.7-kilometre-long (5.4 mi) main arterial road heading southwest from the city of Adelaide, the capital of South Australia, to the beachside suburb of Glenelg.
> 30514 | In 2015, "Trailblazers: Australia's 50 greatest explorers" opened, honouring the work of Bourke and Wills, Nancy Bird Walton, Dick Smith, Jessica Watson and Tim Jarvis, among others.
> 30586 | Under Section 5A of the New South Wales Constitution Act (1902), a bill appropriating revenue for the ordinary annual services of the Government can be presented to the Governor for assent even if the upper house has not agreed to it.
> 30600 | The Bundara River (formerly known as the Bundara Mungee River and the Bundarah River), a perennial r

> 39300 | It is a member of the Cemeteries and Cremetoria Association of Victoria, and is located at the south of the town.
> 39315 | God's Acre Cemetery is a heritage-listed cemetery in front of Archerfield Airport, along Beatty Road, between Kerry & Mortimer Roads, Archerfield, Brisbane, Queensland, Australia.
> 39356 | Southern River College formerly known as Gosnells Senior High School is a public co-educational high school in Western Australia.
> 39357 | The school is located on Southern River Road in the suburb of Gosnells.
> 39361 | The catchment area for the school is Gosnells, Huntingdale and Southern River.
> 39370 | Fitzroy Crossing Airport (IATA: FIZ, ICAO: YFTZ) is located 2 nautical miles (3.7 km; 2.3 mi) northwest of Fitzroy Crossing, Western Australia.
> 39415 | Noosa River Caravan Park is a heritage-listed caravan park at Russell Street, Noosaville, Shire of Noosa, Queensland, Australia.
> 39418 | == History == Noosa River Caravan Park is located between Russell Street

> 46744 | The street continues as Rundle Street (as before) to the east and Hindley Street to the west.
> 46767 | It lies at the western end of the Mall, on the corner of King William Street, and was originally owned by John Rundle.
> 46770 | Another noteworthy large lighting display is the Rundle Lantern which marks the eastern end of Rundle Mall and the beginning of Rundle Street.
> 46790 | Muirfield High School is a public, co-educational, secondary day school located in North Rocks, a north-western suburb of Sydney, New South Wales, Australia.
> 46804 | In 1989, the school was identified as one of twenty-six Technology High Schools in New South Wales, becoming one of the first Public schools in New South Wales to introduce computers into the curriculum.
> 46807 | It is bordered by two main roads, the M2 motorway and Barclay Road, a major thoroughfare for the North West part of Sydney.
> 46811 | Kingsgrove North High School (KNHS) is a co-educational public high school located in Ki

> 55037 | Lake Purrumbete is approximately 15 kilometres (9.3 mi) east of the town of Camperdown.
> 55046 | Cowies Creek is a creek in the northern suburbs of Geelong, Victoria, Australia.
> 55047 | It flows from wetlands in Moorabool to Corio Quay, Corio Bay in Port Phillip.
> 55050 | Cairn Curran reservoir is situated along the Loddon River near the townships of Baringhup, Newstead and Welshmans Reef in Victoria, Australia.
> 55058 | Warwick Senior High School is a co-educational government school in the northern suburbs of Perth, Western Australia.
> 55065 | Burpengary is a town and suburb in the Moreton Bay Region, Queensland, Australia.
> 55088 | == Transport == Burpengary railway station provides access to regular Queensland Rail City network services to Brisbane and Ipswich, as well as Caboolture and the Sunshine Coast.
> 55093 | Carmel Adventist College is a Seventh-day Adventist secondary school in Carmel, a suburb of Perth, Western Australia.
> 55117 | Forrestfield is a subur

> 62852 | Since 1996 the school has operated an International Students Program.
> 62858 | == Sport == Balwyn High School has a sports faculty, with many interschool sport teams representing the school in numerous sports, some at elite levels, like many other schools around Victoria.
> 62869 | Dame Phyllis Frost Centre (formerly the Deer Park Metropolitan Women's Correctional Centre) is a maximum security women's prison located at Deer Park, Victoria, Australia.
> 62887 | The Chevron Renaissance Shopping Centre is a retail complex located in Surfers Paradise, on the Gold Coast of Queensland, Australia.
> 62901 | Koonung Secondary College is a secondary state school in Mont Albert North, Victoria, in the eastern suburbs of Melbourne, Australia.
> 62965 | Porepunkah Airfield (ICAO: YPOK) is located in the Buckland Valley 1.5 nautical miles (2.8 km; 1.7 mi) southwest of the township of Porepunkah, Victoria, Australia.
> 63007 | == Recreation == Every October school holds the Japanese Schoo

> 70473 | Ravenswood School for Girls (often referred to as Ravenswood) is an independent, Uniting Church, day and boarding school for K-12 girls, situated in Gordon, an Upper North Shore suburb of Sydney, New South Wales, Australia.
> 70575 | The Gibson Desert Nature Reserve is an 18,900 km2 nature reserve located in the Gibson Desert in central Western Australia.
> 70581 | Killara is a suburb on the Upper North Shore of Sydney in the state of New South Wales, Australia 14 kilometres (8.7 mi) north of the Sydney Central Business District in the local government area of Ku-ring-gai Council.
> 70594 | == Infrastructure and development == Killara railway station is on the North Shore, Northern & Western Line of the Sydney Trains network.
> 70600 | The suburb is also home to Killara High School, the highest performing non-selective government secondary school in the state of New South Wales.
> 70636 | == Course == The river rises on the northern slopes of the Great Dividing Range, about 1

> 80127 | The Glennie School (formerly the Glennie Memorial School) is a girls' school in Toowoomba, Queensland, Australia.
> 80141 | Grand Central Shopping Centre is a retail shopping centre in Toowoomba, Queensland, Australia.
> 80150 | Pimpama is a suburb in the northern part of the City of Gold Coast.
> 80151 | It is located on the Pacific Motorway 30 kilometres (19 mi) north of Surfers Paradise.
> 80152 | The township of Pimpama is the last remaining rural town on the Pacific Motorway between Brisbane and the Gold Coast.
> 80204 | A railway station was located on the old South Coast railway line, which ran from Brisbane to Coolangatta.
> 80206 | == Heritage listings == Pimpama has a number of heritage-listed sites, including: Pacific Highway: Pimpama & Ormeau War Memorial Ruffles Road, Willow Vale: Laurel Hill Farmhouse (the farmhouse is now within the modern suburb of Willow Vale but was histori
> 80240 | The suburb is serviced by Queensland Rail City network through Ormeau railw

> 89307 | The Walter Taylor Bridge is a heritage-listed suspension bridge crossing the Brisbane River between Indooroopilly and Chelmer in Brisbane, Queensland, Australia.
> 89325 | The 50 Ha reserve is a remnant of bushland situated between the Lane Cove and Parramatta rivers within the suburb of East Ryde and near Gladesville, Hunters Hill and Ryde.
> 89333 | The reserve adjoins the Field of Mars Cemetery, bounded by Strangers Creek in the north and Wellington Road in the south.
> 89340 | Winton Motor Raceway is a motor racing track in Winton, near Benalla, Victoria, Australia.
> 89351 | The long circuit is called the Winton National Circuit.
> 89368 | == Australian Nations Cup Championship ==   == Australian Superbike Championship == Winton Raceway is one of the most prominent Superbike races on the Australian Superbike Championship racing calendar.
> 89376 | Moorabbin Oval, also known by its sponsorship name of Linen House Oval, is an Australian Rules Football ground in the city of

## Real World Prediction

In [16]:
# Fit the random forest on the resampled labelled data, then use it to label
# the real-world (unlabelled) sentences.
clf_rfc.fit(t_dataset_r, t_labels_resample)
real_predictions = clf_rfc.predict(p_dataset)

In [41]:
# p_content = content[cut_num:]
# for i in range(len(p_sents)): 
#     if real_predictions[i] == 't':
#         print i,

In [170]:
e_names = name_to_node.keys()
random.shuffle(e_names)
print "Query:\t\t", e_names[0], "\nEntityId:\t", name_to_node[e_names[0]]
chosen_node = name_to_node[e_names[0]]
print ''

p_content = content[cut_num:]

for i in range(len(p_sents)):
    cnt = 0
    if p_content[i][3] == chosen_node:
        cnt += 1
        if real_predictions[i] == 'f':
            print '[x]', i+cut_num, "||",  p_sents[i]
        elif real_predictions[i] == 't':
            print '\n', '~' * 80
            print '[v]', i+cut_num, "||",  p_sents[i]
            print '~' * 80
            print ''

if cnt == 0:
    print "[error] the query is not the p rediction section"

Query:		Smithfield State High School 
EntityId:	way_408125292


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[v] 78290 || Smithfield State High School is a secondary school located in Smithfield, Queensland, in Australia.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[x] 78291 || Smithfield state High School operates an independant public Secondary School and is renown for its unique partnership with Trinity Beach State School and James Cook University, through the Tropical North Learning Academy.
[x] 78292 || The school attracts students primarily from Cairns and Kuranda.
[x] 78293 || As of 2015, there are about 1000 students (ranging from years 7 to 12) enrolled into the school.
[x] 78294 || == History == The school opened in 1983.
[x] 78295 || On 28 October 1998 an arson was committed at Smithfield State High School destroying a computer block and the school's radio station.
[x] 78296 || The arson remains unsolv

[x] 78351 || === Events and Activities === Australian Mathematics Competition Year 8 QAMT Mathematics Quiz Year 9 Enrichment Day   == Instrumental Music ==   === Philosophy of the Instrumental Music Program === The Instrumental Music Program aims to provide tuition on
[x] 78352 || This program aims to create and enhance opportunities for positive achievement an promote excellence in performance.
[x] 78353 || In keeping with this, students will be evaluated by a criteria of assessment, monitoring of attendance, practice ethic and commitment.
[x] 78354 || The instrumental teacher will explain the process at the beginning of the year.
[x] 78355 || === Instruments offered === WOODWIND - Oboe, Bassoon, Flute, Bass Clarinet, Alto Saxophone, Tenor Saxophone, Baritone Saxophone.
[x] 78356 || BRASS - Trumpet, French Horn, Trombone, Baritone/Euphonium, Tuba.
[x] 78357 || STRINGS - Violin, Viola, Cello, Double Bass.
[x] 78358 || PERCUSSSION - Drums, Mallet (tuned) and Concert (tune/un-tuned) Perc