### 采用航空公司的数据集

In [16]:
import tensorflow as tf
import pandas as  pd
import numpy  as np
import re

In [17]:
# Load the airline tweets dataset from the local CSV file.
tweets_csv_path = './dataset/Tweets.csv'
data = pd.read_csv(tweets_csv_path)

### 数据处理 

In [18]:
# Keep only the sentiment label and the raw tweet text.
wanted_columns = ['airline_sentiment', 'text']
data = data[wanted_columns]

In [19]:
# Show how many rows each sentiment class has.
data['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [20]:
# Balance the two classes: keep every positive row, and only as many
# negative rows (taken from the top) as there are positive rows.
sentiment = data['airline_sentiment']
data_p = data[sentiment == 'positive']
data_n = data[sentiment == 'negative'].iloc[:len(data_p)]

In [21]:
# Stack the balanced subsets: negative rows first, positive rows after.
balanced_parts = [data_n, data_p]
data = pd.concat(balanced_parts)

In [22]:
# Binary label: 1 for a positive tweet, 0 otherwise.
is_positive = data['airline_sentiment'] == 'positive'
data['review'] = is_positive.astype('int')

In [23]:
# The string label is no longer needed once `review` holds the 0/1 target.
data.drop(columns=['airline_sentiment'], inplace=True)

In [24]:
# Renumber the rows 0..n-1 and discard the index inherited from the CSV.
data = data.reset_index(drop=True)

#### 把文本向量化
* 去掉特殊字符
* 把文本数字化
* 通过tf.keras.layers.Embedding训练向量化

In [25]:
# A token is either a run of ASCII letters or one of the punctuation
# characters ! ? , . ( ) — everything else is skipped by findall().
token = re.compile(r'[A-Za-z]+|[!?,.()]')

In [26]:
def reg_text(text):
    """Split *text* into a list of lowercase tokens.

    Every match of the module-level ``token`` pattern is collected in order
    of appearance and lowercased; characters that the pattern does not match
    are left out of the result.
    """
    return [piece.lower() for piece in token.findall(text)]

In [27]:
# Replace each raw tweet string by its list of lowercase tokens.
data['text'] = data['text'].apply(reg_text)

In [14]:
# Collect every distinct token that occurs in the tokenized tweets.
word_set = set()
for tokens in data.text:
    word_set.update(tokens)

In [19]:
# Sort the vocabulary so that every run assigns the same position to the
# same word. Iterating a set of strings directly gives an order that can
# change between interpreter runs (string hash randomization), which would
# make the word ids -- and any model trained on them -- non-reproducible.
word_list = sorted(word_set)

# Vocabulary size plus one extra slot, so that ids 1..len(word_list) are all
# valid and id 0 stays free.
max_word = len(word_list) + 1

In [20]:
# Map each word to its 1-based position in word_list.
word_index = {word: position for position, word in enumerate(word_list, start=1)}

In [21]:
def _encode_tokens(tokens):
    """Return the integer ids of *tokens*, looked up in ``word_index``."""
    return [word_index.get(tok) for tok in tokens]


# Turn every token list into the matching list of integer ids.
text_ok = data.text.apply(_encode_tokens)

In [22]:
# All sequences must end up with the same length, so first find the length
# of the longest encoded review.
maxlen = max(map(len, text_ok))

In [23]:
# Pad the encoded reviews with Keras so that each one has length maxlen.
text_ok = tf.keras.preprocessing.sequence.pad_sequences(text_ok, maxlen=maxlen)

### 创建模型

In [24]:
# Binary sentiment classifier: word ids -> 50-dimensional embeddings ->
# LSTM with 64 units -> one sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_word, 50, input_length=maxlen),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

In [26]:
# Adam optimizer with binary cross-entropy loss; accuracy is reported
# under the metric name 'acc'.
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Keras takes the validation split from the LAST rows of the arrays, before
# any shuffling. The rows are ordered negatives first and positives last, so
# an unshuffled 20% split would validate on positive samples only and train
# on a skewed class mix. Shuffle features and labels together, with a fixed
# seed, so that both splits contain both classes and runs are repeatable.
shuffled_rows = np.random.RandomState(42).permutation(len(text_ok))
model.fit(
    text_ok[shuffled_rows],
    data.review.values[shuffled_rows],
    epochs=10,
    batch_size=120,
    validation_split=0.2,
)