In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
print("Tensorflow version: {}".format(tf.__version__))

Tensorflow version: 2.10.0


In [3]:
data = pd.read_csv("./dataset/Tweets.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
# pandas去除列表数据
data = data[["airline_sentiment","text"]]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [6]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [7]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [8]:
positive_data = data[data.airline_sentiment == "positive"]
negative_data = data[data.airline_sentiment == "negative"]
negative_data = negative_data.iloc[:len(positive_data)]

In [9]:
len(positive_data),len(negative_data)

(2363, 2363)

In [10]:
data = pd.concat([positive_data,negative_data])

In [11]:
data = data.sample(len(data))

In [12]:
data["review"] = (data.airline_sentiment == "positive").astype("int")

In [13]:
data

Unnamed: 0,airline_sentiment,text,review
3363,negative,@united I work in customer support- extremely ...,0
14529,positive,@AmericanAir Mad love http://t.co/4ojrSDWPkK ...,1
3241,negative,"@united worst airline ever! Staff is nasty, wi...",0
1476,negative,@united Thanks for the vague canned response t...,0
7738,positive,@JetBlue on the plane now! Hopefully no longe...,1
...,...,...,...
3058,negative,@united don't know if you are aware that ALL o...,0
2106,negative,@united I'm still trying to get things worked ...,0
2712,positive,@united give her the recognition she deserves!,1
3012,negative,@united is the worst http://t.co/27aitZl6nd,0


In [14]:
del data["airline_sentiment"]

In [15]:
data

Unnamed: 0,text,review
3363,@united I work in customer support- extremely ...,0
14529,@AmericanAir Mad love http://t.co/4ojrSDWPkK ...,1
3241,"@united worst airline ever! Staff is nasty, wi...",0
1476,@united Thanks for the vague canned response t...,0
7738,@JetBlue on the plane now! Hopefully no longe...,1
...,...,...
3058,@united don't know if you are aware that ALL o...,0
2106,@united I'm still trying to get things worked ...,0
2712,@united give her the recognition she deserves!,1
3012,@united is the worst http://t.co/27aitZl6nd,0


__tf.keras.layers.Embedding把文本向量化__

In [16]:
import re

In [17]:
token = re.compile("[A-Za-z]+|[!?,.()]")

In [18]:
def reg_text(text):
    new_text = token.findall(text)
    new_text = [word.lower() for word in new_text]
    return new_text

In [19]:
# 文本规范化
data["text"] = data.text.apply(reg_text)

In [20]:
# 英文单词每一个对应成一个整数序列
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word)

In [21]:
maxword = len(word_set) + 1

In [22]:
word_list = list(word_set)

In [23]:
word_list.index("spending")

3403

In [24]:
word_index_dict = dict((word, word_list.index(word)+1) for word in word_list)

In [25]:
data_train_data = data.text.apply(lambda x: [word_index_dict.get(word, 0) for word in x])

In [26]:
data_train_data.values

array([list([3605, 2792, 5691, 1718, 2349, 2774, 1511, 5934, 3605, 6202, 5240, 3917, 6237, 722, 790, 862, 2792, 3163, 3983, 288, 3943, 621, 3983, 4109, 4063, 5110, 4692]),
       list([5266, 3434, 4059, 2583, 5240, 4692, 5366, 1484, 1446]),
       list([3605, 4603, 4467, 757, 4829, 2570, 3570, 5495, 5873, 22, 2968, 1212, 1605, 579, 621, 3983, 6946, 2100, 2100, 4952, 593, 1107, 2108, 3570, 1962, 4692, 2219, 6989, 4580, 1718, 6820]),
       ..., list([3605, 3220, 1360, 1107, 5605, 2935, 3148, 4829]),
       list([3605, 3570, 1107, 4603, 2583, 5240, 4692, 5366, 843, 4332]),
       list([3605, 3207, 877, 3818, 1423, 2835, 5826, 4952, 2792, 3903, 6687, 5240, 61, 5249, 6565, 1267, 5249, 3943, 3983, 199, 4692, 740, 4059, 3983, 1496, 4629, 576, 4996, 4692])],
      dtype=object)

In [27]:
maxlen = max(len(x) for x in data_train_data)

In [28]:
data_train_data = keras.preprocessing.sequence.pad_sequences(data_train_data.values, maxlen=maxlen)

In [29]:
data_train_data.shape

(4726, 40)

In [30]:
data.review.values

array([0, 1, 0, ..., 1, 0, 0])

In [31]:
model = keras.Sequential()

Embeding : 把文本映射为一个密集向量

In [32]:
model.add(layers.Embedding(maxword, 50, input_length=maxlen))

In [33]:
model.add(layers.LSTM(64))

In [34]:
model.add(layers.Dense(1, activation="sigmoid"))

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 50)            355050    
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________


In [36]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["acc"]
)

In [37]:
history = model.fit(
    data_train_data,
    data.review.values,
    epochs=10,
    batch_size=128,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
