### NLP的词向量表示

#### One-Hot Encoding

In [1]:
import numpy as np
import pandas as pd

# pandas:
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]})
print(pd.get_dummies(df, prefix={'A':'p1', 'B':'p2'}))

# sklearn
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]})
one_hot=OneHotEncoder()
data_temp=pd.DataFrame(one_hot.fit_transform(df[['A','B']]).toarray(),
             columns=one_hot.get_feature_names(['A','B']), dtype='int32')
data_onehot=pd.concat((df,data_temp),axis=1)

data_onehot.head()

   C  p1_a  p1_b  p2_a  p2_b  p2_c
0  1     1     0     0     1     0
1  2     0     1     1     0     0
2  3     1     0     0     0     1




Unnamed: 0,A,B,C,A_a,A_b,B_a,B_b,B_c
0,a,b,1,1,0,0,1,0
1,b,a,2,0,1,1,0,0
2,a,c,3,1,0,0,0,1


#### Bag of Words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus of text documents
corpus = [
    "This is the first document. this.",
    "This is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a BOW representation
bow = vectorizer.fit_transform(corpus)

# Print the vocabulary and BOW representation of the first document
print(vectorizer.vocabulary_)
print(bow[0])


{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
  (0, 8)	2
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1


In [3]:
# 如果是中文预料可以用jieba分词
import jieba
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus of Chinese text documents
corpus = [
    "这是第一篇文章。",
    "这是第二篇文章。",
    "这是第三篇文章。",
    "这是第四篇文章。",
]

# Tokenize the text data using jieba
corpus_tokenized = [' '.join(jieba.cut(doc)) for doc in corpus]

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a BOW representation
bow = vectorizer.fit_transform(corpus_tokenized)

# Print the vocabulary and BOW representation of the first document
print(vectorizer.vocabulary_)
print(bow[0])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\82438\AppData\Local\Temp\jieba.cache


Loading model cost 0.807 seconds.
Prefix dict has been built successfully.


{'这是': 5, '第一篇': 1, '文章': 0, '第二篇': 3, '第三篇': 2, '第四篇': 4}
  (0, 5)	1
  (0, 1)	1
  (0, 0)	1


#### TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus of text documents
corpus = [
    "This is the first document.",
    "This is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a TF-IDF representation
tfidf = vectorizer.fit_transform(corpus)

# Print the vocabulary and TF-IDF representation of the first document
print(vectorizer.vocabulary_)
print(tfidf[0])

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483


#### words embedding

##### Word2Vec

In [1]:
from gensim.models import fasttext
from gensim.models import word2vec
import pandas as pd
import logging
import jieba



In [2]:

sentance = [
    "This is the first document.",
    "This is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
## 对句子进行分词分词
def segment_sen(sen):
    sen_list = []
    try:
        sen_list = jieba.lcut(sen)
    except:
            pass
    return sen_list   
# 将数据变成gensim中 word2wec函数的数据格式
sens_list = [segment_sen(i) for i in sentance]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\82438\AppData\Local\Temp\jieba.cache
Loading model cost 0.838 seconds.
Prefix dict has been built successfully.


In [4]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# sg=0:CBOW, 1:Skip_gram
model = word2vec.Word2Vec(sens_list,min_count=1,iter=20, sg=0)
model.save("word2vec.model")

2023-08-08 14:56:06,884 : INFO : collecting all words and their counts
2023-08-08 14:56:06,885 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-08-08 14:56:06,886 : INFO : collected 14 word types from a corpus of 42 raw words and 4 sentences
2023-08-08 14:56:06,887 : INFO : Loading a fresh vocabulary
2023-08-08 14:56:06,888 : INFO : effective_min_count=1 retains 14 unique words (100% of original 14, drops 0)
2023-08-08 14:56:06,888 : INFO : effective_min_count=1 leaves 42 word corpus (100% of original 42, drops 0)
2023-08-08 14:56:06,890 : INFO : deleting the raw counts dictionary of 14 items
2023-08-08 14:56:06,891 : INFO : sample=0.001 downsamples 14 most-common words
2023-08-08 14:56:06,892 : INFO : downsampling leaves estimated 5 word corpus (11.9% of prior 42)
2023-08-08 14:56:06,893 : INFO : estimated required memory for 14 words and 100 dimensions: 18200 bytes
2023-08-08 14:56:06,894 : INFO : resetting layer weights
2023-08-08 14:56:06,900 : INFO :

##### TextCNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import numpy as np

# Define the CNN architecture
class TeCxCNN(nn.Module):
    def __init__(self, num_classes):
        super(TeCxCNN, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_sizes[0])
        self.conv2 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_sizes[1])
        self.conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_sizes[2])
        self.pool = nn.MaxPool1d(kernel_size=max_seq_len - max(filter_sizes) + 1)
        self.fc1 = nn.Linear(in_features=num_filters*len(filter_sizes), out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_classes)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x1 = nn.functional.relu(self.conv1(x))
        x2 = nn.functional.relu(self.conv2(x))
        x3 = nn.functional.relu(self.conv3(x))
        x1 = self.pool(x1)
        x2 = self.pool(x2)
        x3 = self.pool(x3)
        x = torch.cat((x1, x2, x3), dim=1)
        x = x.view(-1, self.num_filters*len(self.filter_sizes))
        x = self.dropout(x)
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Define the hyperparameters
vocab_size = 10000
embedding_dim = 100
num_filters = 100
filter_sizes = [3, 4, 5]
max_seq_len = 100
hidden_dim = 100
dropout_prob = 0.5
num_epochs = 10
batch_size = 32
learning_rate = 0.001

# Load the dataset and preprocess the text data
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the dataset
df = pd.read_csv('dataset.csv')

# Split the dataset into the text data and label data
texts = df['text'].values.astype(str)
labels = df['label'].values.astype(str)

# Convert the label data into numerical format
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

# Preprocess the text data
stop_words = set(stopwords.words('english'))
tokenizer = Tokenizer(num_words=10000, lower=True)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
max_seq_len = 100
data = pad_sequences(sequences, maxlen=max_seq_len)

# Split the dataset into training, validation, and test sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=42)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Convert the data and labels into PyTorch tensors
train_data = torch.from_numpy(train_data).long()
val_data = torch.from_numpy(val_data).long()
test_data = torch.from_numpy(test_data).long()
train_labels = torch.from_numpy(train_labels).long()
val_labels = torch.from_numpy(val_labels).long()
test_labels = torch.from_numpy(test_labels).long()

# Create PyTorch DataLoader objects for the training, validation, and test sets
train_dataset = torch.utils.data.TensorDataset(train_data, train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = torch.utils.data.TensorDataset(val_data, val_labels)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataset = torch.utils.data.TensorDataset(test_data, test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the CNN model and define the loss function and optimizer
model = TeCxCNN(num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the CNN on the training set and validate the model on the validation set
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch [%d/%d], Loss: %.4f' % (epoch+1, num_epochs, running_loss))

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Validation Accuracy: %.2f %%' % (100 * correct / total))

#### Bert

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenize the text
text = "This is a sample sentence."
tokens = tokenizer.tokenize(text)

# Convert the tokens to embeddings
input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs[0]

# Average the embeddings
embedding = torch.mean(embeddings, dim=1)

# Use the embedding for downstream tasks

In [None]:
# Fine-tune 
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load the task-specific dataset
data = pd.read_csv('dataset.csv')

# Preprocess the data
X = data['text'].values.astype(str)
y = data['label'].values.astype(int)
encoded_labels = torch.tensor(y)
tokenized_texts = [tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt') for text in X]
input_ids = [tokenized_text['input_ids'][0] for tokenized_text in tokenized_texts]
attention_masks = [tokenized_text['attention_mask'][0] for tokenized_text in tokenized_texts]

# Split the data into training, validation, and testing sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, encoded_labels, test_size=0.2, random_state=42)
train_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, test_size=0.2, random_state=42)

# Fine-tune the BERT model
train_dataset = torch.utils.data.TensorDataset(torch.stack(train_inputs), torch.stack(train_masks), train_labels)
val_dataset = torch.utils.data.TensorDataset(torch.stack(val_inputs), torch.stack(val_masks), val_labels)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()

# Evaluate the model
test_inputs = torch.stack(tokenizer(X_test, padding=True, truncation=True, max_length=128, return_tensors='pt')['input_ids'])
test_masks = torch.stack(tokenizer(X_test, padding=True, truncation=True, max_length=128, return_tensors='pt')['attention_mask'])
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_masks)
trainer.evaluate(test_dataset)

# Use the model
model.eval()
with torch.no_grad():
    inputs = tokenizer("This is a test sentence.", padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    output = model(input_ids, attention_mask=attention_mask)
    logits = output.logits
    predicted_labels = torch.argmax(logits, dim=1)
    print(predicted_labels)