# Text Preprocessing

    Date: 04/06/19

This Python notebook generates four text files for further processing and machine learning.
- Input files : 
  - data.txt that contains the text of 5000 different advertisements 
  - stopwords_en.txt that contains the English stopwords used to process the data file

- output files :
  - vocab.txt: 
  It contains the unigram vocabulary in the following format: word_string:integer_index. Words in the vocabulary are sorted in alphabetical order. This file is the key to interpreting the sparse encoding.
             
  - sparse.txt: 
  Each line of this file corresponds to one advertisement. So, they start with advertisement ID. The rest of each line is the sparse representation of the corresponding description in the form of word_index:word_freq separated by a comma. The order of the lines will match the order of the advertisements in the input file.
             
  - highFreq.txt & lowFreq.txt:
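  highFreq.txt contains the frequent words that appear in more than 100 advertisement descriptions, and lowFreq.txt contains the words that appear only once across all the job advertisement descriptions. Each line contains only one word (in highFreq.txt the word is followed by a colon and the number of descriptions it appears in).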

In [1]:
from nltk.tokenize import RegexpTokenizer 
from nltk.probability import *
import time
import re
from itertools import chain

In [2]:
# Start the wall-clock timer for the whole run
start = time.time()

# Read the data file, lower-cased, as a list with one element per line.
# The context manager guarantees the handle is closed, even if reading fails.
with open('./data.txt') as File:
    Raw_text = File.read().lower().splitlines()

# Filter the ID lines and the description lines into separate lists
filter_ID = re.compile(r"^id: (#\d{8})")
filter_Descr = re.compile(r"^description: (.+)")

ID = [line for line in Raw_text if filter_ID.match(line)]
Description = [line for line in Raw_text if filter_Descr.match(line)]

In [3]:
# Drop the leading 'id: ' (4 characters) and 'description: ' (13 characters)
# labels so that only the field values remain
ID_list = [id_line[len('id: '):] for id_line in ID]
Description_list = [descr_line[len('description: '):] for descr_line in Description]

In [4]:
# Build the tokenizer (runs of letters, optionally followed by one hyphen or
# apostrophe plus more letters) and tokenize every description
tokenizer = RegexpTokenizer("[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
tokens = list(map(tokenizer.tokenize, Description_list))
# Map each advertisement ID to the token list of its description
tokens_dict = {ad_id: ad_tokens for ad_id, ad_tokens in zip(ID_list, tokens)}

In [None]:
# Keep only the tokens that are at least 3 characters long
for ad_id in tokens_dict:
    tokens_dict[ad_id] = [token for token in tokens_dict[ad_id] if not len(token) < 3]

In [None]:
# Load the stopword list from disk, one stopword per line
stopwords = []
with open('./stopwords_en.txt') as stopword_file:
    stopwords.extend(stopword_file.read().splitlines())

In [None]:
# Helper that removes unwanted words from every token list
def exclude_words(target, wordset):
    """Remove the words in ``wordset`` from every word list in ``target``.

    ``target`` maps a key to a sequence of words. Each value is replaced,
    in place, by a new list that keeps the original order but omits every
    word contained in ``wordset``. Nothing is returned.
    """
    for key in target:
        kept = []
        for word in target[key]:
            if word in wordset:
                continue
            kept.append(word)
        target[key] = kept

In [None]:
# Strip the stopwords out of every token list.
# A set is used because membership tests on it are much faster than on a list.
stopwordsSet = set().union(stopwords)

exclude_words(tokens_dict, stopwordsSet)


In [None]:
# Flatten the token lists in two ways:
#   words_set_1 keeps every occurrence of a word (total frequency);
#   words_set_2 keeps each word at most once per advertisement, so counting it
#   gives the number of advertisements a word appears in.
words_set_1 = [word for ad_tokens in tokens_dict.values() for word in ad_tokens]
words_set_2 = [word for ad_tokens in tokens_dict.values() for word in set(ad_tokens)]

fd_1 = FreqDist(words_set_1)
fd_2 = FreqDist(words_set_2)

# Low-frequency words occur exactly once overall; high-frequency words occur
# in more than 100 advertisements
LessFreqWords = [word for word in fd_1.hapaxes()]
HighFreqWords = [word for word, doc_count in fd_2.items() if doc_count > 100]


In [None]:
# Drop the high-frequency words from every token list
HighFreqSet = set().union(HighFreqWords)

exclude_words(tokens_dict, HighFreqSet)

In [None]:
# Drop the low-frequency words (those occurring only once) from every token list
LessFreqSet = set().union(LessFreqWords)

exclude_words(tokens_dict, LessFreqSet)

In [None]:
# Helper that looks up the frequency of each selected unigram
def getfreq(target, wordset):
    """Return the values of ``target`` whose keys are members of ``wordset``.

    The values are returned as a list, in the iteration order of ``target``.
    """
    counts = []
    for word, count in target.items():
        if word in wordset:
            counts.append(count)
    return counts

In [None]:
# Pair every high-frequency word with the number of advertisements it appears in.
# HighFreqWords and HighFreqnum are both produced by iterating fd_2 in the same
# order, so zipping them lines the words up with their counts.
HighFreqnum = getfreq(fd_2, HighFreqSet)
HighFreqout = {word: doc_count for word, doc_count in zip(HighFreqWords, HighFreqnum)}

In [None]:
# Write highFreq.txt: one `word:count` line per high-frequency word,
# ordered from the highest count to the lowest
with open('highFreq.txt', 'w') as f:
    high = sorted(HighFreqout.items(), key=lambda pair: pair[1], reverse=True)
    for word, doc_count in high:
        print(word, doc_count, sep=':', file=f)

In [None]:
# Write lowFreq.txt: the low-frequency words in alphabetical order, one per line
with open('lowFreq.txt', 'w') as f:
    LowFreqword = [*LessFreqSet]
    for word in sorted(LowFreqword):
        f.write(str(word) + '\n')

In [None]:
# Collect the distinct words that survived all the filtering steps.
# (No sort is needed here: a set is unordered, the sort happens when writing.)
vocab = set(chain.from_iterable(tokens_dict.values()))

# Output the vocab file: the words in alphabetical order, one `word:index`
# line each, with indices starting at 0. The `with` block closes the file even
# if a write fails.
# NOTE: vocab_dict intentionally stores `index - 1` (so it starts at -1), exactly
# as before, because the code that writes sparse.txt adds 1 to these values.
vocab_dict = {}
with open("vocab.txt", 'w') as out_file:
    for i, d in enumerate(sorted(vocab)):
        vocab_dict[d] = i - 1
        out_file.write(d + ':' + str(i) + '\n')

In [None]:
# Output sparse file: one line per advertisement, in the form
#   <ID>, <word_index>:<word_freq>, <word_index>:<word_freq>,
# The `with` block closes the file even if a lookup or write fails.
# NOTE(review): advertisements are written sorted by ID, not in dict order —
# confirm this matches the order of the advertisements in the input file.
with open("sparse.txt", 'w') as out_file:
    for n, d in sorted(tokens_dict.items()):
        # Translate each word of this advertisement into its vocab_dict value
        d_idx = [vocab_dict[w] for w in d]
        # Count the occurrences of each index; 1 is added to every index
        # before it is written, and the entries are ordered by index
        entries = "".join(
            " {}:{},".format(k + 1, v) for k, v in sorted(FreqDist(d_idx).items())
        )
        # Build the whole line first so that each advertisement is one write
        out_file.write(n + ',' + entries + '\n')

In [None]:
# Stop the timer and report the elapsed wall-clock time in seconds
end = time.time()
print('This program takes:', str(end - start), 's', sep='')