In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('./dataset/Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
data = data[['airline_sentiment', 'text']]

In [5]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [6]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [7]:
data_p = data[data.airline_sentiment == 'positive']

In [8]:
data_n = data[data.airline_sentiment == 'negative']

In [9]:
data_n = data_n.iloc[:len(data_p)]

In [10]:
len(data_n), len(data_p)

(2363, 2363)

In [11]:
data = pd.concat([data_n, data_p])

In [12]:
data = data.sample(len(data))

In [13]:
data['review'] = (data.airline_sentiment == 'positive').astype('int')

In [14]:
del data['airline_sentiment']

tf.keras.layers.Embedding  把文本向量化

In [15]:
import re

In [16]:
token = re.compile('[A-Za-z]+|[!?,.()]')

In [17]:
def reg_text(text):
    new_text = token.findall(text)
    new_text = [word.lower() for word in new_text]
    return new_text

In [18]:
data['text'] = data.text.apply(reg_text)

In [19]:
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word) 

In [20]:
max_word = len(word_set) + 1
max_word

7101

In [21]:
word_list = list(word_set)

In [22]:
word_list.index('spending')

4272

In [23]:
word_index =  dict((word, word_list.index(word) + 1) for word in word_list)

In [24]:
word_index

{'travelzoo': 1,
 'hearts': 2,
 'junction': 3,
 'cking': 4,
 'lrfo': 5,
 'fair': 6,
 'hdn': 7,
 'dai': 8,
 'intended': 9,
 'squished': 10,
 'speaking': 11,
 'diego': 12,
 'carseat': 13,
 'jedediah': 14,
 'firstclass': 15,
 'from': 16,
 'enoughisenough': 17,
 'aopdtsq': 18,
 'lil': 19,
 'experienced': 20,
 'restrm': 21,
 'virginamerica': 22,
 'thisiscoach': 23,
 'qualified': 24,
 'dqjl': 25,
 'branding': 26,
 'program': 27,
 'makingthingseasy': 28,
 'destinationdragons': 29,
 'winters': 30,
 'embody': 31,
 'repeatedly': 32,
 'frankly': 33,
 'waitingforbags': 34,
 'shampoo': 35,
 'disrespected': 36,
 'dontmakemebeg': 37,
 'celebrate': 38,
 'lsn': 39,
 'liable': 40,
 'beanie': 41,
 'trash': 42,
 'normal': 43,
 'cutest': 44,
 'lea': 45,
 'ellahenderson': 46,
 'cnn': 47,
 'virgin': 48,
 'hbsj': 49,
 'performance': 50,
 'acknowledgement': 51,
 'nogate': 52,
 'delivery': 53,
 'providing': 54,
 'hiccups': 55,
 'intuitlife': 56,
 'americanisbetter': 57,
 'overcharge': 58,
 'makeup': 59,
 'ptfo'

In [25]:
data_ok = data.text.apply(lambda x: [word_index.get(word, 0) for word in x])

In [26]:
len(data_ok.iloc[2])

29

In [27]:
maxlen = max(len(x) for x in data_ok)

In [28]:
maxlen

40

In [29]:
data_ok = keras.preprocessing.sequence.pad_sequences(data_ok.values, maxlen=maxlen)

In [30]:
data_ok.shape

(4726, 40)

In [31]:
data.review.values

array([1, 0, 0, ..., 0, 0, 0])

In [32]:
model = keras.Sequential()
model.add(layers.Embedding(max_word, 50, input_length=maxlen))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.MaxPooling1D(3))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(1, activation='sigmoid'))

In [33]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 50)            355050    
_________________________________________________________________
conv1d (Conv1D)              (None, 40, 32)            11232     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 13, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 13, 32)            7200      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 373,515
Trainable params: 373,515
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.compile(optimizer=keras.optimizers.RMSprop(),
              loss='binary_crossentropy',
              metrics=['acc']
)

In [35]:
history = model.fit(data_ok, data.review.values, epochs=10, batch_size=128, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3780 samples, validate on 946 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
model = keras.Sequential()
model.add(layers.Embedding(max_word, 50, input_length=maxlen))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.MaxPooling1D(3))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(1, activation='sigmoid'))

In [37]:
model.compile(optimizer=keras.optimizers.RMSprop(),
              loss='binary_crossentropy',
              metrics=['acc']
)