<a href="https://colab.research.google.com/github/luojie1024/TextClassification/blob/main/One_Hot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 独热编码(One-Hot)

In [19]:
!pip install scikit-learn
import numpy as np

Looking in indexes: http://pypi.douban.com/simple/
Collecting scikit-learn
  Downloading https://mirrors.cloud.tencent.com/pypi/packages/40/c6/2e91eefb757822e70d351e02cc38d07c137212ae7c41ac12746415b4860a/scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached https://mirrors.cloud.tencent.com/pypi/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl (302 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached https://mirrors.cloud.tencent.com/pypi/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.2 threadpoolctl-3.2.0


## 0 语料准备

In [20]:
# 语料
corpus = ['这 是 第一个 文档',
        '这是 第二个 文档',
        '这是 最后 一个 文档',
        '现在 没有 文档 了']

## 1. 手动实现（One-Hot）

In [21]:
# 词袋
words=[]
for corpu in corpus:
  words.extend(corpu.split())

# 词的列表
word_list=list(set(words))
# 字典
word_dct= {word:index for index,word in enumerate(word_list)}
# 词典大小
vocab_size=len(word_dct)
print(word_dct)

{'是': 0, '这': 1, '第一个': 2, '文档': 3, '这是': 4, '最后': 5, '没有': 6, '了': 7, '第二个': 8, '现在': 9, '一个': 10}


In [22]:
def get_one_hot(index):
  """
  获得one hot编码
  """
  # 初始化全0列表
  one_hot=[0 for i in range(vocab_size)]
  # 标记对应位置为1
  one_hot[index]=1
  # 将列表转换成矩阵
  return np.array(one_hot)

In [23]:
get_one_hot(1)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### 原始句子

In [24]:
corpus[0]

'这 是 第一个 文档'

### 转换成索引

In [25]:
indexs=[word_dct[i] for i in corpus[0].split()]
indexs

[1, 0, 2, 3]

### 句子-> 索引 ->one hot

In [26]:
one_hot_list=np.array([get_one_hot(index) for index in indexs])
one_hot_list

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

## 2 . Sklearn实现

In [27]:
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer

### 初始化编码器

In [28]:
lb = LabelBinarizer()
lb.fit(word_list)
# lb.classes_=np.array(word_list)
lb.classes_

array(['一个', '了', '文档', '是', '最后', '没有', '现在', '第一个', '第二个', '这', '这是'],
      dtype='<U3')

### 原始句子

In [29]:
sentence=corpus[0].split()
sentence

['这', '是', '第一个', '文档']

### 编码（词列表-> one hot）

In [30]:
encode_sentence=lb.transform(sentence)
encode_sentence

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

### 解码（one hot->词列表）

In [31]:
lb.inverse_transform(encode_sentence)

array(['这', '是', '第一个', '文档'], dtype='<U3')

# 参考
[1] [Sklearn官方文档](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html#sklearn.preprocessing.LabelBinarizer)