# One-hot encoding of Bangla text

In [2]:
# Toy corpus: four short Bangla sentences (the last two end with the danda "।").
documents = [
    "কুকুর মানুষকে কামড়ায়",
    "মানুষ কুকুরকে কামড়ায়",
    "কুকুর মাংস খায়।",
    "মানুষ খাবার খায়।",
]

# Normalise each sentence: lower-case it and strip the danda punctuation mark.
processed_docs = []
for sentence in documents:
    processed_docs.append(sentence.lower().replace("।", ""))
print(processed_docs)

['কুকুর মানুষকে কামড়ায়', 'মানুষ কুকুরকে কামড়ায়', 'কুকুর মাংস খায়', 'মানুষ খাবার খায়']


In [3]:
# Build the vocabulary: every distinct word receives a 1-based id,
# assigned in order of first appearance across the processed documents.
vocab = {}
for sentence in processed_docs:
    for token in sentence.split():
        # setdefault leaves already-seen words untouched; a new word gets the next id.
        vocab.setdefault(token, len(vocab) + 1)
count = len(vocab)  # number of ids handed out, i.e. the vocabulary size
print(vocab)

{'কুকুর': 1, 'মানুষকে': 2, 'কামড়ায়': 3, 'মানুষ': 4, 'কুকুরকে': 5, 'মাংস': 6, 'খায়': 7, 'খাবার': 8}


In [4]:
def get_onehot_vector(somestring, vocabulary=None):
    """Return the one-hot representation of every word in ``somestring``.

    The string is split on whitespace and each word is encoded independently.

    Parameters
    ----------
    somestring : str
        Text to encode.
    vocabulary : dict, optional
        Mapping from word to its 1-based index. The indices are expected to
        cover ``1..len(vocabulary)``. When omitted, the notebook-level
        ``vocab`` built in the earlier cell is used.

    Returns
    -------
    list of list of int
        One vector per word, each of length ``len(vocabulary)``. A word found
        in the vocabulary has a single 1 at position ``index - 1``; a word not
        in the vocabulary is represented by a vector of all zeroes.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab
    size = len(vocabulary)
    onehot_encoded = []
    for word in somestring.split():
        temp = [0] * size
        if word in vocabulary:
            # -1 because vocabulary ids start at 1 while list indexing starts at 0
            temp[vocabulary[word] - 1] = 1
        onehot_encoded.append(temp)
    return onehot_encoded

In [5]:
# Show the second processed sentence, then its word-by-word one-hot encoding.
print(processed_docs[1])
get_onehot_vector(processed_docs[1]) # one vector per word, each with len(vocab) entries

মানুষ কুকুরকে কামড়ায়


[[0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0]]

# One-hot encoding with scikit-learn

In [6]:
# The four example sentences, already stripped of the danda punctuation mark.
S1, S2, S3, S4 = (
    'কুকুর মানুষকে কামড়ায়',
    'মানুষ কুকুরকে কামড়ায়',
    'কুকুর মাংস খায়',
    'মানুষ খাবার খায়',
)


In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Tokenise each sentence; `data` holds one list of words per sentence.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
# Flatten the per-sentence token lists into one sequence of words.
values = [token for tokens in data for token in tokens]
print("The data: ",values)

# Label encoding: map every word in the flat list to an integer id.
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print("Label Encoded:",integer_encoded)

# One-hot encoding: note the encoder is fitted on `data` (one row per sentence),
# not on `integer_encoded`, so the resulting matrix has one row per sentence.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(data)
onehot_encoded = onehot_encoder.transform(data).toarray()
print("Onehot Encoded Matrix:\n",onehot_encoded)

The data:  ['কুকুর', 'মানুষকে', 'কামড়ায়', 'মানুষ', 'কুকুরকে', 'কামড়ায়', 'কুকুর', 'মাংস', 'খায়', 'মানুষ', 'খাবার', 'খায়']
Label Encoded: [1 7 0 6 2 0 1 5 4 6 3 4]
Onehot Encoded Matrix:
 [[1. 0. 0. 0. 0. 1. 1. 0.]
 [0. 1. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1.]]
