# <center>Model Training (v2)</center>

<br>
<br>
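<p>As a first step, we need to re-run the previously developed code: downloading the data, preparing the training and test sets, and tokenizing and padding the tweets.</p>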
<br>
<br>

In [None]:
# Download the training/test data archive; -O saves it under a fixed local
# file name, replacing any previous download of the same name.
!wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
print('unziping ...')
# Extract into the current directory: -o overwrites existing files without
# prompting, -j drops the directory structure stored inside the archive.
!unzip -o -j trainingandtestdata.zip

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Read the raw training and test CSVs. Both are read without a header row,
# so the column names are assigned explicitly below.
data = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None, encoding='ISO-8859-1')
test = pd.read_csv("testdata.manual.2009.06.14.csv", header=None, encoding='ISO-8859-1')

column_names = ["target", "ids", "date", "flag", "user", "text"]

for frame in (data, test):
    frame.columns = column_names
    # Relabel 4 as 1 so the labels are 0/1 for the sigmoid output used later
    # (presumably 4 marks positive tweets; confirm against the dataset docs).
    frame["target"] = frame["target"].replace(4, 1)

# Only the label and the tweet text are needed from here on.
df = data[["target", "text"]]
ts = test[["target", "text"]]

# Label 2 marks the rows kept apart as "neutral"; the remaining test rows
# form the binary test set.
is_neutral = ts["target"] == 2
ts_bin = ts[~is_neutral]
ts_neut = ts[is_neutral]

# Persist the three subsets. to_csv is called with its defaults, so the
# DataFrame index is written as an extra first column.
outputs = (
    (df, 'training_data.csv'),
    (ts_bin, 'test_data.csv'),
    (ts_neut, 'neutral_data.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split




# Reload the training set from the CSV written earlier in the notebook and
# separate the labels from the tweet text.
df_m = pd.read_csv("training_data.csv")
labels, tweets = df_m["target"], df_m["text"]

# Number of non-null labels, displayed as the cell's output.
labels.count()

1600000

In [4]:
# Build the word index from every tweet; num_words caps how many of the most
# frequent words are kept when texts are later converted to sequences.
tok = Tokenizer(num_words=10000)
tok.fit_on_texts(texts=tweets)

In [5]:
# Every tweet is represented by exactly this many token ids.
max_length = 30

# Replace each tweet by the list of integer ids of its known words.
tweets_seq = tok.texts_to_sequences(tweets)

# Bring all sequences to max_length: shorter ones are padded with trailing
# zeros ('post'); longer ones are cut using Keras' default truncation, which
# removes tokens from the start of the sequence.
padded_tweets = pad_sequences(
    tweets_seq,
    maxlen=max_length,
    padding='post',
)

# Show the first five padded rows as a sanity check.
print(padded_tweets[:5])

[[  39  147   56  473  144    4 1221    7 3659   48  828   12 1955   30
     2   41    9  385    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   8  818   17  111   69  565  193  536  126 2097    9    6  299  551
    85    4 2399  149   40  273 1170    0    0    0    0    0    0    0
     0    0]
 [   1  321  363   11    3 1298 1751    2  935 1164    3  493   37   31
    12    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   5  450  851  504 3036    6   34   71   13 1169    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [  36   42   24   23   32   19  617  113   62    1   91  217    1   69
    68    7   32  135   86    0    0    0    0    0    0    0    0    0
     0    0]]


In [7]:
# Hold out 20% of the padded tweets for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_tweets,
    labels,
    test_size=0.2,
    random_state=2,
)

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Input
from keras.optimizers import Adam




# Embedding baseline: learn a 32-dimensional vector per token id, flatten the
# (max_length, 32) embeddings of a tweet into one vector, and classify it
# with a single sigmoid unit.
model = Sequential()
# The vocabulary size is read from the tokenizer instead of repeating the
# literal, so the embedding table always matches the ids the tokenizer emits.
model.add(Embedding(tok.num_words, 32, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the 0/1 labels; 'acc' reports accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 32)            320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 960)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 961       
Total params: 320,961
Trainable params: 320,961
Non-trainable params: 0
_________________________________________________________________
None


<br>
<br>
<p>Now we'll train our second model (the embedding network defined above) and test it on the held-out split.</p>
<br>
<br>

In [10]:
# Train for 10 epochs on the training split with per-epoch progress output;
# the returned History object is displayed as the cell's output.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0a7f7ae550>

In [14]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the metric configured in compile().
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

# Report the accuracy as a percentage.
accuracy_pct = accuracy * 100
print("\nAccuracy for Embedding Neural Net Model: " + str(accuracy_pct) + "%")

Accuracy for Embedding Neural Net Model: 76.8090625%
