# preprocessing.py
import os
import string

def strip_punctuation(s):
    """Return the text without (hopefully...) any punctuation.

    >>> strip_punctuation('this.is?as!funny@word') == 'thisisasfunnyword'
    True
    """
    return ''.join(ch for ch in s if ch not in string.punctuation)

def clean_text(text):
    "Lowercase and strip all punctuation from a text."
    return strip_punctuation(text.lower())

def read_file(filename):
    "Read the contents of FILENAME and return as a string."
    with open(filename) as infile:
        return infile.read()

def list_textfiles(directory):
    "Return a list of filenames ending in '.txt' in DIRECTORY."
    textfiles = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            textfiles.append(os.path.join(directory, filename))
    return textfiles
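
# The file calls end_of_sentence_marker() but never defines it, so
# split_sentences() below would raise a NameError as-is. The helper here is
# an assumed reconstruction: it simply treats '.', '!' and '?' as sentence-final.
def end_of_sentence_marker(character):
    "Return True if CHARACTER ends a sentence ('.', '!' or '?')."
    return character in ('.', '!', '?')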

def split_sentences(text):
    "Split a text string into a list of sentences."
    sentences = []
    start = 0
    for end, character in enumerate(text):
        if end_of_sentence_marker(character):
            sentence = text[start: end + 1]
            sentences.append(sentence)
            start = end + 1
    return sentences

def tokenize(text):
    "Tokenize a text into a list of sentences each represented as a list of words."
    return [clean_text(sentence).split() for sentence in split_sentences(text)]
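
# Illustrative example (added, not in the original file), showing the expected
# shape of the output for a small assumed input string:
#     tokenize("Hello world. How are you?")
#     -> [['hello', 'world'], ['how', 'are', 'you']]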

def read_corpus(directory):
    "Read and tokenize all files in DIRECTORY."
    corpus = []
    for filename in list_textfiles(directory):
        corpus.append(tokenize(read_file(filename)))
    return corpus
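
# A minimal usage sketch (added for illustration): the 'data' directory name is
# an assumption, not part of the original repository.
if __name__ == "__main__":
    corpus = read_corpus("data")
    print("documents read:", len(corpus))
    if corpus:
        print("sentences in first document:", len(corpus[0]))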