In [13]:
import numpy as np
import pandas as pd

In [14]:
df = pd.read_csv("../data/new_york_times_comments/ArticlesMarch2018.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1385 entries, 0 to 1384
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   articleID         1385 non-null   object
 1   byline            1385 non-null   object
 2   documentType      1385 non-null   object
 3   headline          1385 non-null   object
 4   keywords          1385 non-null   object
 5   multimedia        1385 non-null   int64 
 6   newDesk           1385 non-null   object
 7   printPage         1385 non-null   int64 
 8   pubDate           1385 non-null   object
 9   sectionName       1385 non-null   object
 10  snippet           1385 non-null   object
 11  source            1385 non-null   object
 12  typeOfMaterial    1385 non-null   object
 13  webURL            1385 non-null   object
 14  articleWordCount  1385 non-null   int64 
dtypes: int64(3), object(12)
memory usage: 162.4+ KB


In [15]:
df.head()

Unnamed: 0,articleID,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL,articleWordCount
0,5a974697410cf7000162e8a4,By BINYAMIN APPELBAUM,article,"Virtual Coins, Real Resources","['Bitcoin (Currency)', 'Electric Light and Pow...",1,Business,1,2018-03-01 00:17:22,Economy,America has a productivity problem. One explan...,The New York Times,News,https://www.nytimes.com/2018/02/28/business/ec...,1207
1,5a974be7410cf7000162e8af,By HELENE COOPER and ERIC SCHMITT,article,U.S. Advances Military Plans for North Korea,"['United States Defense and Military Forces', ...",1,Washington,11,2018-03-01 00:40:01,Asia Pacific,The American military is looking at everything...,The New York Times,News,https://www.nytimes.com/2018/02/28/world/asia/...,1215
2,5a9752a2410cf7000162e8ba,By THE EDITORIAL BOARD,article,Mr. Trump and the ‘Very Bad Judge’,"['Trump, Donald J', 'Curiel, Gonzalo P', 'Unit...",1,Editorial,26,2018-03-01 01:08:46,Unknown,Can you guess which man is the model public se...,The New York Times,Editorial,https://www.nytimes.com/2018/02/28/opinion/tru...,1043
3,5a975310410cf7000162e8bd,By JAVIER C. HERNÁNDEZ,article,"To Erase Dissent, China Bans Pooh Bear and ‘N’","['China', 'Xi Jinping', 'Term Limits (Politica...",1,Foreign,1,2018-03-01 01:10:35,Asia Pacific,Censors swung into action after Mr. Xi’s bid t...,The New York Times,News,https://www.nytimes.com/2018/02/28/world/asia/...,1315
4,5a975406410cf7000162e8c3,"By JESSE DRUCKER, KATE KELLY and BEN PROTESS",article,Loans Flowed to Kushner Cos. After Visits to t...,"['Kushner, Jared', 'Kushner Cos', 'United Stat...",1,Business,1,2018-03-01 01:14:41,Unknown,"Apollo, the private equity firm, and Citigroup...",The New York Times,News,https://www.nytimes.com/2018/02/28/business/ja...,1566


In [16]:
formatted_headline = []

for headline in df['snippet']:
    formatted_headline.append(headline)
    
formated_text = '\n'.join(formatted_headline)
formated_text



In [17]:
import string
# 去除文本中的标点符号
def remove_puntuations(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [18]:
formated_text = remove_puntuations(formated_text)

In [21]:
formated_text = formated_text.lower()

In [22]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding,LSTM,Dense,Bidirectional,Dropout, MultiHeadAttention,LayerNormalization,Input,Layer,Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence  import pad_sequences

In [23]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([formated_text])
len(tokenizer.word_index)

7010

In [24]:
voc = len(tokenizer.word_index) + 1

In [25]:
input_sequence = []
for sentence in formated_text.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        n_gram = tokenized_sentence[:i+1]
        input_sequence.append(n_gram)

In [27]:
max_len = max([len(x) for x in input_sequence])
print(max_len)

41


In [28]:
padded_input_seq = pad_sequences(input_sequence,maxlen=max_len,padding='pre')

In [29]:
X = padded_input_seq[:,:-1]
y = padded_input_seq[:,-1]

In [30]:
y = to_categorical(y)
y.shape

(26481, 7011)

In [31]:
class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [32]:
inputs = Input(shape = (max_len-1,))

embedding_layer = Embedding(voc,200)(inputs)

transformer_block = TransformerBlock(embed_dim=200,num_heads=4,ff_dim=128)

x = transformer_block(embedding_layer,training=True)

x = LayerNormalization(epsilon=1e-6)(x)

x = Dropout(0.2)(x)

x = Dense(512)(x)

x = Dropout(0.2)(x)

x = Flatten()(x)

outputs = Dense(voc,activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

In [33]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [34]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding (Embedding)       (None, 40, 200)           1402200   
                                                                 
 transformer_block (Transfo  (None, 40, 200)           694928    
 rmerBlock)                                                      
                                                                 
 layer_normalization_2 (Lay  (None, 40, 200)           400       
 erNormalization)                                                
                                                                 
 dropout_2 (Dropout)         (None, 40, 200)           0         
                                                                 
 dense_2 (Dense)             (None, 40, 512)           102912

In [35]:
rlrong = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    mode='min',
    min_lr = 1e-5,
    patience = 2,
    verbose=1
)
estop = EarlyStopping(
    monitor = 'val_loss',
    mode= 'min',
    patience = 30,
    verbose = 1,
    restore_best_weights = True
)

In [36]:
history = model.fit(X,y,epochs=100,validation_split=0.2,callbacks = [rlrong,estop])

Epoch 1/100

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'],label='loss',color='red')
plt.plot(history.history['val_loss'],label='val Loss',color='blue')
plt.grid()
plt.legend()
plt.show()

In [None]:

text = "the museum says the demise of its payasyouwish"

#tokenize
for i in range(5):
    token_text = tokenizer.texts_to_sequences([text])[0]

    padded_token_text = pad_sequences([token_text],maxlen=40,padding='pre')

    
    pos = np.argmax(model.predict(padded_token_text))
    
    for word,index in tokenizer.word_index.items():
        if index == pos:
            text = text + " " + word
            print(text)