/
nlp.py
47 lines (34 loc) · 1.41 KB
/
nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
nlp.py
"""
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from keras.datasets import imdb
def clean(text):
'''
Uses NLP algorithms to clean raw text and returns an array
of most commonly used words
Input:
text: raw text as string
Output:
ret: array of words converted to most commmonly used indices (from imdb database)
'''
# split string into tokens
tokens = word_tokenize(text)
# remove stop words & punctuation
stop_words = stopwords.words('english')
words = [word for word in tokens if word.isalpha() and word not in stop_words]
# convert into stem words
# porter = PorterStemmer()
# stemmed = [porter.stem(word) for word in words]
# convert into imdb frequency dictionary indices
vocabulary_size = 5000
word2id = imdb.get_word_index()
imdb_coded = [word2id.get(word, 0) for word in words]
imdb_coded_sized = [code for code in imdb_coded if code < vocabulary_size]
return imdb_coded_sized
# if __name__ == "__main__":
# raw = "Tesla is now the leader in self-driving technology for electric vehicles, but there is no guarantee that it will continue. Because Tesla wanted to reduce manufacturing costs, it defeated XPEV in the price war and abandoned the use of laser radar. This is a wrong choice. Tesla will not be able to upgrade the L5 level of autonomous driving technology in the future."
# print(clean(raw))