-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
42 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,6 @@ | |
.DS_Store | ||
venv | ||
docs/build | ||
build | ||
dist | ||
data_stack.egg-info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import string | ||
from collections import defaultdict | ||
|
||
|
||
def make_char_to_ix():
    """Make a character to index dictionary.

    The alphabet is ``string.printable`` followed by a fixed set of extra
    characters (the degree sign and some accented lowercase letters).
    Indices are assigned in that order, starting at 0.

    Returns:
        dict: character to index
    """
    all_chars = string.printable + "°éèàëïüâêîôûç"
    char_to_ix = {c: i for i, c in enumerate(all_chars)}

    return char_to_ix
|
||
|
||
def make_word_to_ix(train_sentences, char_to_split_at=" ", unk_tag="<UNK>"):
    """Make a word to index dictionary.

    Every sentence is tokenized with ``str.split(char_to_split_at)`` and each
    distinct token receives the next free index, in order of first
    appearance. The unknown tag always gets index 0.

    Args:
        train_sentences (list): list of sentences (strings).
        char_to_split_at (str, optional): Character to use to split
            the sentence (for tokenization). Defaults to " ".
        unk_tag (str, optional): Unknown tag. Defaults to "<UNK>".

    Returns:
        defaultdict: word to index. Looking up a word that was not seen
        during construction yields 0, the index of ``unk_tag``. Note that,
        as with any ``defaultdict``, such a lookup via ``[]`` also stores
        the missing word (mapped to 0); use ``.get(word, 0)`` if the
        mapping must not grow.
    """
    # int() == 0, which is exactly the index reserved for unk_tag below, so
    # unseen words resolve to the unknown index instead of an empty string.
    word_to_ix = defaultdict(int)

    word_to_ix[unk_tag] = 0

    for sent in train_sentences:
        for word in sent.split(char_to_split_at):
            # `in` does not trigger the default factory, so this check
            # never inserts anything by accident.
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    return word_to_ix
|