# Data Cleaning for Government Land Sale Notices

In [101]:
# Import modules and packages
import nltk
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [98]:
#nltk.download()
#print(nltk.__version__)
#dir(nltk)

Source: Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.

## Read in Text

In [102]:
# Load the raw notice text.
# A context manager guarantees the handle is closed even if read() raises,
# which the manual open()/close() pair did not.
filename = 'RBL1203.txt'
with open(filename, 'rt') as file:
    text = file.read()

# Translation table that deletes every character in string.punctuation.
# Built once instead of on every clean_text call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Tokenize ``text`` and return its cleaned words.

    The pipeline is: split with ``word_tokenize``, lower-case each token,
    delete ``string.punctuation`` characters from each token, keep only
    tokens that are purely alphabetic, and drop English stop words.

    The stop-word set and the punctuation table are cached because this
    function is passed to ``TfidfVectorizer(analyzer=...)``, which calls it
    once per document.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates are kept).
    """
    # split into words and convert to lower case
    tokens = [w.lower() for w in word_tokenize(text)]
    # remove punctuation from each word
    stripped = [w.translate(_PUNCT_TABLE) for w in tokens]
    # keep alphabetic tokens only, then filter out stop words
    stop_words = _english_stop_words()
    return [w for w in stripped if w.isalpha() and w not in stop_words]

# print(words[:100])
# Convert to df
#df_words = pd.DataFrame(words, columns = [filename])
#print(df_words)


## Apply TfidfVectorizer

In [103]:
# Fit a basic TFIDF Vectorizer and view the results
# (TfidfVectorizer is already imported in the imports cell at the top.)

# `words` exists only as a local variable inside clean_text, so it has to be
# built explicitly here; otherwise this cell raises NameError on a fresh
# kernel. Presumably the original session created it the same way - confirm.
words = clean_text(text)

# NOTE: fit_transform treats every element of `words` as its own document,
# so each row of X_tfidf corresponds to a single cleaned token.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(words)
print(X_tfidf.shape)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = list(tfidf_vect.get_feature_names_out())
else:
    feature_names = tfidf_vect.get_feature_names()
print(feature_names)

(13126, 1329)
['absence', 'absolute', 'absolutely', 'absorbing', 'accept', 'accepted', 'accepts', 'access', 'accmmnodating', 'accommodating', 'accommodation', 'accord', 'accordance', 'account', 'acknowledgement', 'acknowledges', 'acoustic', 'across', 'act', 'actions', 'activities', 'adapted', 'addition', 'additional', 'adequate', 'adjacent', 'adjoining', 'administrative', 'advance', 'advanced', 'advances', 'adverse', 'adversely', 'advertising', 'affect', 'affected', 'affecting', 'affixed', 'aforesaid', 'agent', 'agents', 'aggregate', 'agreed', 'agreement', 'agrees', 'aids', 'air', 'airconditioning', 'alienation', 'aligmnents', 'alignment', 'alignments', 'allocate', 'allocated', 'allow', 'along', 'already', 'also', 'alter', 'alteration', 'alterations', 'altered', 'alternatively', 'amending', 'amendment', 'amendments', 'amenities', 'amiexed', 'among', 'amongst', 'amount', 'amounts', 'amr', 'analyzing', 'anchor', 'anchors', 'ancillary', 'animal', 'annexed', 'antecedent', 'anybody', 'appli

In [104]:
# Densify the sparse TF-IDF matrix so it can be inspected as a DataFrame.
# Columns keep their default integer labels.
tfidf_dense = X_tfidf.toarray()
X_features = pd.DataFrame(data=tfidf_dense)
X_features.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1319,1320,1321,1322,1323,1324,1325,1326,1327,1328
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Break the raw text into sentences with NLTK's sentence tokenizer
from nltk import sent_tokenize
sentences = sent_tokenize(text)

# Hold the sentences in a single-column DataFrame; the column is named after
# the source file
df_sent = pd.DataFrame({filename: sentences})

# Define a function to remove punctuation
def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Characters outside ``string.punctuation`` (letters, digits, whitespace,
    non-ASCII symbols) are kept in their original order.
    """
    def is_kept(char):
        return char not in string.punctuation

    return "".join(filter(is_kept, text))

# Strip punctuation from every sentence. The column is looked up through
# `filename` (the name it was created with) rather than a hard-coded string,
# so this keeps working when a different source file is loaded.
df_sent[filename] = df_sent[filename].apply(remove_punct)

print(df_sent.head())

                                                                                           RBL1203.txt
0  Possession \n\n 7 \n\nSPECIAL CONDITIONS \n\n 1 Subject to payment of the balance of the premium...
1                                     2 hereof and subject to the provisions of General \nCondition No
2  1 hereof possession of the lot shall be deemed to be given to \nthe Purchaser on the date of thi...
3  Acknowledgement of 2 a i The Purchaser acknowledges that as at the date of this \nthe Encroachin...
4  177 hereinafter referred to as RBL 177 \nRP within the areas shown coloured pink hatched \ngreen...


In [49]:
# stemming of words
from nltk.stem.porter import PorterStemmer

# `tokens` exists only as a local variable inside clean_text, so it is
# rebuilt here; otherwise this cell raises NameError on a fresh kernel.
# Lower-cased word_tokenize output (punctuation still present) matches the
# shape of the recorded output - presumably how it was originally built.
tokens = [w.lower() for w in word_tokenize(text)]

porter = PorterStemmer()
stemmed = [porter.stem(word) for word in tokens]
print(stemmed[:100])

['possess', '-', '7', '-', 'special', 'condit', '(', '1', ')', 'subject', 'to', 'payment', 'of', 'the', 'balanc', 'of', 'the', 'premium', 'as', 'provid', 'in', 'gener', 'condit', 'no', '.', '2', 'hereof', 'and', 'subject', 'to', 'the', 'provis', 'of', 'gener', 'condit', 'no', '.', '1', 'hereof', ',', 'possess', 'of', 'the', 'lot', 'shall', 'be', 'deem', 'to', 'be', 'given', 'to', 'the', 'purchas', 'on', 'the', 'date', 'of', 'thi', 'agreement', '.', 'acknowledg', 'of', '(', '2', ')', '(', 'a', ')', '(', 'i', ')', 'the', 'purchas', 'acknowledg', 'that', 'as', 'at', 'the', 'date', 'of', 'thi', 'the', 'encroach', 'agreement', ',', 'there', 'are', 'encroach', 'upon', 'the', 'lot', ':', 'structur', '(', 'i', ')', 'a', 'part', 'of', 'a']
