In [1]:
import numpy as np
import pandas as pd
import keras
import jieba
import re
import csv
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split

In [2]:
# 我们使用tensorflow的keras接口来建模
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.utils import to_categorical

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


!bzip2 -d sgns.zhihu.bigram-char.bz2

In [6]:
embedding = KeyedVectors.load_word2vec_format('sgns.zhihu.bigram-char', binary=False, unicode_errors="ignore")

In [7]:
#embedding维度300
embedding_dim = embedding['中国'].shape[0]

In [8]:
embedding.similarity('中国','美国')

0.5562877

In [10]:
word_index = embedding.key_to_index['中国']
print("Word Index:", word_index)

Word Index: 51


In [13]:
embedding.most_similar(positive=['中国'], topn=10)

[('中国人', 0.5616261959075928),
 ('美国', 0.5562876462936401),
 ('我国', 0.531586766242981),
 ('全中国', 0.5306392908096313),
 ('中国茶', 0.5249154567718506),
 ('中国海', 0.5224687457084656),
 ('中国武协', 0.5200953483581543),
 ('外国', 0.5197730660438538),
 ('中国篮球', 0.5111113786697388),
 ('日本', 0.5098268389701843)]

In [15]:
import pandas as pd

# Load the CSV file with semicolon delimiter
file_path = 'zhihu_cm_data.csv'  # Replace with your file path
data = pd.read_csv(file_path, delimiter=';', encoding='utf-8', on_bad_lines='skip')

# Extract relevant columns
train_text_orig = data['content'].tolist()
train_target = data['author_gender'].astype(int).tolist()

# Print the number of examples
print(f"{len(train_text_orig)} text examples in trainset")

# Prepare the DataFrame for saving
prepared_data = pd.DataFrame({
    'text': train_text_orig,
    'label': train_target
})

# Save to a new CSV file
output_file_path = 'zhihu_prepared_data.csv'  # Specify the desired file path and name
prepared_data.to_csv(output_file_path, index=False, encoding='utf-8')

print(f"Processed data has been saved to {output_file_path}")


960 text examples in trainset
Processed data has been saved to zhihu_prepared_data.csv


In [17]:
train_text_orig = []
train_target = []

csv_orig = csv.reader(open('zhihu_prepared_data.csv'))
next(csv_orig, None)
for line in csv_orig:
    # Swap the columns
    train_text_orig.append(line[0])
    train_target.append(line[1])

train_target = np.array(train_target).astype('int')
print('%d text examples in trainset' % len(train_text_orig))
# ...existing code...

960 text examples in trainset


In [18]:
#convert to onehot
train_target = to_categorical(train_target)

In [19]:
#清晰数据
def clean_text(text):
    text = re.sub("<[^>]+>", "", text)
    text = text.replace("&nbsp;", "")
    text = text.replace("\n", "")
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）-]+", "", text)
    text = re.sub("[^0-9A-Za-z\u4e00-\u9fa5]", "", text)
    text = re.sub( "\\(.*?\\)|\\{.*?}|\\[.*?]", "", text)
    return text

def tokenize_text(text):
    words = [w for w in jieba.cut(text)]
    embedding_vectors = []
    for idx, word in enumerate(words):
        try:
            embedding_vectors.append(embedding.vocab[word].index)
        except KeyError:
            embedding_vectors.append(0)
    return embedding_vectors

In [34]:
def tokenize_text(text, model):
    tokens = []
    for word in text.split():
        if word in model.key_to_index:  # removed .wv
            tokens.append(word)
    return tokens
#tokenize
train_tokens = []
model = w2v_model = embedding
for text in  train_text_orig:
    pure_text = clean_text(text)
    tokens = tokenize_text(pure_text,model)
    train_tokens.append(tokens)

In [35]:
#平均token数

if train_tokens:
    num_tokens = [len(tokens) for tokens in train_tokens]
    print(np.mean(num_tokens))

0.00625


In [36]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

0

In [37]:
# 取tokens的长度为80时，大约 93%的样本被涵盖
# 我们对长度不足的进行padding，超长的进行修剪
max_tokens = 80
np.sum( np.array(num_tokens) < max_tokens ) / len(num_tokens)

1.0

In [38]:
# 用来将tokens转换为文本
def reverse_tokens(tokens):
    text = ''
    for i in tokens:
        if i != 0:
            text = text + embedding.index2word[i]
        else:
            text = text + ' '
    return text

reverse_tokens(train_tokens[10])

''

In [40]:
len(embedding.index_to_key)

259753

In [41]:
#取100000/259753个词
num_words = 100000

In [43]:
# 进行padding和truncating， 输入的train_tokens是一个list
# 返回的train_pad是一个numpy array
from keras.preprocessing.sequence import pad_sequences

# Ensure train_tokens contains only integer indices
train_tokens = [[model.key_to_index[word] for word in tokens if word in model.key_to_index] for tokens in train_tokens]

# Padding and truncating
train_pad = pad_sequences(train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')

# Replace out-of-vocabulary words with 0
train_pad[train_pad >= num_words] = 0

# Check the padded sequence
train_pad[20]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0, 532], dtype=int32)

In [45]:
# 使用259753个词
num_words = 259753
embedding_dim = 300
# 初始化embedding_matrix，之后在keras上进行应用
embedding_matrix = np.zeros((num_words, embedding_dim))
# embedding_matrix为一个 [num_words，embedding_dim] 的矩阵
for i in range(num_words):
  embedding_matrix[i, :] = embedding[embedding.index_to_key[i]]  # replaced index2word with index_to_key
embedding_matrix = embedding_matrix.astype('float32')
np.array(embedding_matrix).shape

(259753, 300)

In [46]:
# 90%的样本用来训练，剩余10%用来测试
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)

In [47]:
model = Sequential()
model.add(Embedding(num_words,
                   embedding_dim,
                   weights=[embedding_matrix],
                   input_length = max_tokens,
                   trainable = False))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))
model.add(Dense(4, activation='softmax'))
# 我们使用adam以0.001的learning rate进行优化
model.summary()



In [48]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [50]:
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_tokens))
model.add(LSTM(units=128, return_sequences=False))
model.add(Dense(units=2, activation='softmax'))  # Ensure this matches y_train shape

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
model.fit(X_train, y_train, validation_split=0.1, epochs=20, batch_size=256)


Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2s/step - accuracy: 0.8963 - loss: 0.5674 - val_accuracy: 0.9195 - val_loss: 0.2843
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9101 - loss: 0.3044 - val_accuracy: 0.9195 - val_loss: 0.2809
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - accuracy: 0.9111 - loss: 0.3013 - val_accuracy: 0.9195 - val_loss: 0.2825
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step - accuracy: 0.9136 - loss: 0.2956 - val_accuracy: 0.9195 - val_loss: 0.2956
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.9020 - loss: 0.3278 - val_accuracy: 0.9195 - val_loss: 0.2871
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2s/step - accuracy: 0.9077 - loss: 0.3095 - val_accuracy: 0.9195 - val_loss: 0.2800
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7b7a372aa800>

In [51]:
model.evaluate(X_test, y_test)
#95%

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.9414 - loss: 0.2256


[0.23534631729125977, 0.9375]

In [57]:
model.save('if_needed.h5')



In [52]:
def predict_sentiment(text):
    pure_text = clean_text(text)
    tokens = tokenize_text(pure_text)
    tokens_pad = pad_sequences([tokens], maxlen=max_tokens,
                           padding='pre', truncating='pre')
    # 预测
    result = model.predict(tokens_pad)
    result_text = ['喜悦','愤怒', '厌恶','低落']
    print(result)
    print(result_text[np.argmax(result)])
    return np.argmax(result)

In [60]:
from keras.models import load_model

# Load the model using a supported file format
model = load_model('if_needed.h5')  # Update with your model path

def predict_sentiment(text, model):
    pure_text = clean_text(text)
    tokens = tokenize_text(pure_text, w2v_model)  # pass the model here
    tokens_pad = pad_sequences([tokens], maxlen=max_tokens, padding='pre', truncating='pre')
    prediction = model.predict(tokens_pad)
    return prediction

# Example usage
print(predict_sentiment("品控不好，还没到一个月就坏了", model))
print(predict_sentiment("品控不错，挺好的", model))
print(predict_sentiment("太开心了", model))
print(predict_sentiment("难受啊", model))
print(predict_sentiment("谢天牛逼啊", model))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step
[[0.0068893  0.99311066]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[[0.0068893  0.99311066]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[[0.0068893  0.99311066]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[[0.0068893  0.99311066]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[[0.0068893  0.99311066]]


In [61]:
y_pred = model.predict(X_test)
y_pred = [np.argmax(arr) for arr in y_pred]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


In [62]:
ss = 0
for i in range(len(y_pred)):
    if(y_pred[i]==np.argmax(y_test[i])):
        ss+=1
print(ss)
print(len(y_pred))
print(ss/len(y_pred))

90
96
0.9375


In [64]:
predict_sentiment('小米业界良心',model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step


array([[0.0068893 , 0.99311066]], dtype=float32)

In [65]:
#from keras.models import load_model
from keras.models import load_model
model_loaded = load_model('if_needed.h5')
model_loaded.summary()

