In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words("english")
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniellagrimberg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Load Data**

In [4]:
df = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1', header = None, names=['sentiment', 'id', 'date', 'flag', 'user', 'tweet'])

In [5]:
from sklearn.utils import shuffle
df = shuffle(df)
#using a subset of 100k tweets to lower training time
df = df.head(100000)
df

Unnamed: 0,sentiment,id,date,flag,user,tweet
1442399,4,2061968114,Sat Jun 06 22:01:25 PDT 2009,NO_QUERY,AlexCox,Chicago with @maxdie and @derekbishe was quite...
1594417,4,2192143943,Tue Jun 16 06:34:48 PDT 2009,NO_QUERY,AurelieDaure,has the answer for all your IT needs
402842,0,2058034367,Sat Jun 06 14:08:12 PDT 2009,NO_QUERY,MrsNickJonas680,i almost forgot! it's d day makes me so sadd.
351269,0,2018333293,Wed Jun 03 09:48:15 PDT 2009,NO_QUERY,cheekyrzchick,Feel so sick.. but it aint the flu. just feel ...
741238,0,2266210274,Sun Jun 21 08:06:07 PDT 2009,NO_QUERY,pindowngirl,@anna8687 awh... too bad about the no wet t s...
...,...,...,...,...,...,...
1002699,4,1880237005,Fri May 22 00:32:58 PDT 2009,NO_QUERY,marcieaball,"Finally in the park, no one lost or in trouble..."
1532008,4,2178165299,Mon Jun 15 07:17:33 PDT 2009,NO_QUERY,Lena_DISTRACTIA,@jweaving lil wayne?
524161,0,2193392702,Tue Jun 16 08:23:48 PDT 2009,NO_QUERY,steffmd25,@pinkiecharm You're so right. I only got a hi...
256225,0,1984676693,Sun May 31 15:34:57 PDT 2009,NO_QUERY,AlyYvonneG,Home now the worst part of the day is finally ...


In [6]:
from sklearn.model_selection import StratifiedKFold
y = df['sentiment']
feats = [col for col in df.columns if col!= "sentiment"]
X = df[feats]
skf = StratifiedKFold(n_splits=5, random_state=12345, shuffle=True)
train_index, test_index = list(skf.split(X, y))[0]
df_test, df_train = df.iloc[test_index], df.iloc[train_index]

**Clean Data**
1. Remove Duplicate Rows
2. Remove Columns we wont use
3. Format Target column (sentiment) into 0/1

In [7]:
df_train = df_train.drop_duplicates(subset=['id'], keep='first')
df_train = df_train.drop_duplicates(subset=['id'], keep='first')

df_train.drop(columns=['id', 'flag', 'user'], inplace=True)
df_test.drop(columns=['id', 'flag', 'user'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
df_train["sentiment"] = df_train["sentiment"].apply(lambda s: 1 if s!=0 else s)
df_test["sentiment"] = df_test["sentiment"].apply(lambda s: 1 if s!=0 else s)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["sentiment"] = df_test["sentiment"].apply(lambda s: 1 if s!=0 else s)


In [9]:
df_test

Unnamed: 0,sentiment,date,tweet
1594417,1,Tue Jun 16 06:34:48 PDT 2009,has the answer for all your IT needs
741238,0,Sun Jun 21 08:06:07 PDT 2009,@anna8687 awh... too bad about the no wet t s...
6246,0,Tue Apr 07 06:04:55 PDT 2009,@bob_lee92 great! I got my first tattoo yester...
571618,0,Wed Jun 17 09:46:55 PDT 2009,@verythat lollapalooza's has as a great a line...
1209292,1,Sun May 31 23:45:50 PDT 2009,@appleseedinc Dive Shop Caroline here followin...
...,...,...,...
293469,0,Mon Jun 01 14:54:24 PDT 2009,At the bus stop alone Missing dreamy guy. Oh ...
689010,0,Sat Jun 20 04:11:48 PDT 2009,is listening to the Solitary Snape recordings....
997561,1,Mon May 18 07:22:00 PDT 2009,"@Time4CoffeeTime I am having coffee right now,..."
1002699,1,Fri May 22 00:32:58 PDT 2009,"Finally in the park, no one lost or in trouble..."


**Feature Engineering**

- Note: These new columns were not used in baseline model but might be useful later

In [10]:
import re
df_train["hashtags"] = df_train["tweet"].apply(lambda x: ",".join(tag for tag in list(re.findall(r"#(\w+)", x))))
df_test["hashtags"] = df_test["tweet"].apply(lambda x: ",".join(tag for tag in list(re.findall(r"#(\w+)", x))))
df_train["hashtag_count"] = df_train["hashtags"].apply(lambda h: len(h.split(',')) if len(h)>0 else 0)
df_test["hashtag_count"] = df_test["hashtags"].apply(lambda h: len(h.split(',')) if len(h) >0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["hashtags"] = df_test["tweet"].apply(lambda x: ",".join(tag for tag in list(re.findall(r"#(\w+)", x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["hashtag_count"] = df_test["hashtags"].apply(lambda h: len(h.split(',')) if len(h) >0 else 0)


**Preprocessing**
1. Lower-case letters
2. Remove stop words 
3. Tokenize using twitter tokenizer and lemmatize
4. Remove punctuation

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub('@[A-Za-z0–9]+', ' ', text)
    text = re.sub('#', ' ', text)
    text = re.sub('https?:\/\/\S+', ' ', text)
    return text
 

In [12]:
df_train["tweet"] = df_train["tweet"].apply(lambda x: clean_text(x))
df_test["tweet"] = df_test["tweet"].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tweet"] = df_test["tweet"].apply(lambda x: clean_text(x))


In [13]:
def remove_stop_words(s):
    new_sent = ""
    for word in s.split():
        if word not in stop_words:
            new_sent += " " + word
    return new_sent

df_train["tweet"] = df_train["tweet"].apply(lambda s: remove_stop_words(s))

In [14]:
df_test["tweet"] = df_test["tweet"].apply(lambda s: remove_stop_words(s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tweet"] = df_test["tweet"].apply(lambda s: remove_stop_words(s))


In [15]:
def lemmatize_tokenize(tweet):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = TweetTokenizer()
    new_t = []
    for word in tokenizer.tokenize(tweet):
        new_word = re.sub(r'[^\w\s]', '', (word))
        if new_word != '':
            new_t.append(lemmatizer.lemmatize(new_word)) 
    return " ".join(new_t)

In [16]:
df_train["tweet"] = df_train["tweet"].apply(lambda t: lemmatize_tokenize(t))
df_test["tweet"] = df_test["tweet"].apply(lambda t: lemmatize_tokenize(t))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["tweet"] = df_test["tweet"].apply(lambda t: lemmatize_tokenize(t))


**Modelling with Word2Vec & RNN**

In [17]:
from gensim.models import Word2Vec

documents = [text.split() for text in df_train.tweet]
size = 200
model = Word2Vec(size=size, window=7, min_count=10, workers=4)
model.build_vocab(documents)

In [18]:
model.train(documents, total_examples=len(documents), epochs=20)

(9805366, 12311660)

**Example of Word2Vec model working. Notice the interesting twitter-specific voaccabulary such as h8**

In [19]:
model.wv.most_similar("hate")

[('dislike', 0.5086536407470703),
 ('fml', 0.4550246000289917),
 ('blah', 0.4027697443962097),
 ('swear', 0.40239256620407104),
 ('boring', 0.3992295265197754),
 ('urgh', 0.398406982421875),
 ('suck', 0.39819324016571045),
 ('killing', 0.396727591753006),
 ('hating', 0.3921506106853485),
 ('h8', 0.3761572241783142)]

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence 

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.tweet)

**Saving Tokenizer to use in Flask App**

In [35]:
import pickle

filename = 'tokenizer.pkl'
pickle.dump(tokenizer, open(filename, 'wb'))

**Tokenizing Process**

In [21]:
sequences_train = tokenizer.texts_to_sequences(df_train.tweet)
sequences_test = tokenizer.texts_to_sequences(df_test.tweet)

X_train_seq = sequence.pad_sequences(sequences_train, maxlen=100, value=0)

X_test_seq = sequence.pad_sequences(sequences_test, maxlen=100, value=0)

In [22]:
y_train = df_train["sentiment"]
y_test = df_test["sentiment"]

In [23]:
w_index = tokenizer.word_index

vocab_size = len(w_index) + 1

In [24]:
embedding_m = np.zeros((vocab_size, size))


In [25]:
for word, idx in list(w_index.items()):
    if word in list(model.wv.vocab.keys()):
        embedding_m[idx] = model.wv[word]

In [26]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GRU,Bidirectional, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.initializers import Constant

In [27]:
nn_model = Sequential()

**Building NN architecture**

In [28]:
emb_layer = Embedding(vocab_size, 200, weights=[embedding_m], input_length=100, trainable=False)
nn_model.add(emb_layer)
nn_model.add(Dropout(rate=0.4))
nn_model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
nn_model.add(Dropout(rate=0.4))
nn_model.add(Bidirectional(LSTM(units=128, return_sequences=False)))
nn_model.add(Dense(units=1, activation='sigmoid'))
nn_model.add(Dense(1, activation='sigmoid'))

In [29]:
nn_model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])

In [30]:
from tensorflow.keras.callbacks import EarlyStopping

nn_model.summary()
callbacks = [EarlyStopping(monitor='val_accuracy', patience=0)]
nn_model.fit(X_train_seq, y_train, batch_size=128, epochs=12, validation_split=0.2, callbacks=callbacks)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          9616800   
_________________________________________________________________
dropout (Dropout)            (None, 100, 200)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          336896    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2

<tensorflow.python.keras.callbacks.History at 0x7f8667f61d90>

**Model Final Accuracy: 76%**

- Saving model and weights to use in Flask App

In [33]:
model_json = nn_model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
    
nn_model.save_weights("weights.h5")
print("Saved model to disk")


Saved model to disk


In [32]:
nn_model.evaluate(X_test_seq, y_test, batch_size=16)



[0.5149518847465515, 0.7591999769210815]

**Testing Out Model**

**Prediction is a value from 0-1. 0 is negative, 1 is positive. We could set a threshold at for example 0.5 to determine the sentiment, but for now we are evaluating the sentiment as a continuous variable (how positive/negative)**

In [None]:
#Example of a tweet with positive and negative

In [75]:
test_t = sequence.pad_sequences(tokenizer.texts_to_sequences(["I love you so much but I dont like this"]), maxlen=100, value=0)

In [76]:
nn_model.predict(test_t)

array([[0.5572156]], dtype=float32)

**Notice that the positive was more exagerated than negative, and in fact the tweet would be classified as positive**

In [72]:
#Example of a tweet that would be neutral

In [73]:
test_2 = sequence.pad_sequences(tokenizer.texts_to_sequences(["I think that I will go to California next week"]), maxlen=100, value=0)

In [74]:
nn_model.predict(test_2)

array([[0.5061608]], dtype=float32)

In [None]:
#Example of a very negative tweet

In [68]:
test_3 = sequence.pad_sequences(tokenizer.texts_to_sequences(["His speech was disgusting. I really don't agree with this horrible behaviour"]), maxlen=100, value=0)

In [69]:
nn_model.predict(test_3)

array([[0.30823016]], dtype=float32)

In [None]:
#Example of a tweet we expact to be very positive

In [66]:
test_4 = sequence.pad_sequences(tokenizer.texts_to_sequences(["The president in Colombia is the best, I would vote for him again"]), maxlen=100, value=0)

In [67]:
nn_model.predict(test_4)

array([[0.81856334]], dtype=float32)

In [None]:
#Example of a tweet we would expect to be neutral

In [64]:
test_5 = sequence.pad_sequences(tokenizer.texts_to_sequences(["I read an article today"]), maxlen=100, value=0)

In [65]:
nn_model.predict(test_5)

array([[0.6176232]], dtype=float32)