In [5]:
# Colab-only: mount Google Drive at /content/drive so that the CSV path used
# when loading the dataset resolves.
from google.colab import drive
drive.mount('/content/drive')



In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the CSV on the mounted Drive.
path = "/content/drive/My Drive/training.1600000.processed.noemoticon.csv"

# Column names are supplied explicitly through `names`; the file is decoded
# as ISO-8859-1.
raw_data = pd.read_csv(
    path,
    encoding='ISO-8859-1',
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

In [3]:
# Preview the first rows to sanity-check the parsed columns.
raw_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
import spacy
import tensorflow as tf

In [5]:
#!python -m spacy download en_core_web_lg

In [6]:
# Load spaCy's large English pipeline. The model package must already be
# installed (see the commented-out download command in the previous cell).
nlp = spacy.load('en_core_web_lg')

In [7]:
# Class balance of the raw numeric target codes.
raw_data.target.value_counts()

4    800000
0    800000
Name: target, dtype: int64

In [8]:
# Translate the numeric sentiment codes into readable labels.
# `.replace` leaves values that are not keys of the mapping untouched, so
# re-running this cell when the column already holds the string labels is a
# no-op. A dict-based `.map` would instead turn every already-mapped label
# into NaN on a second run.
raw_data['target'] = raw_data['target'].replace({0: 'NEGATIVE', 2: 'NEUTRAL', 4: 'POSITIVE'})

In [9]:
import matplotlib.pyplot as plt

In [10]:
# Pattern whose matches are replaced by a space when text is cleaned.
# Alternatives, in match order:
#   @\S+           an "@" followed by non-whitespace (mentions)
#   https?:\S+     "http:" / "https:" followed by non-whitespace (URLs)
#   http?:\S       "htt:" / "http:" plus exactly one character
#                  NOTE(review): looks redundant with the previous
#                  alternative; kept unchanged to preserve behaviour.
#   [^A-Za-z0-9]+  any run of non-alphanumeric characters
# Written as a raw string: "\S" inside a normal string literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# Default stop-word collection of the loaded spaCy pipeline; preprocess()
# drops every token found in it.
stop_words = nlp.Defaults.stop_words

In [11]:
# Number of stop words that will be filtered out.
len(stop_words)

326

In [12]:
import re

In [13]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer; preprocess() only uses it when stem=True.
stemmer = SnowballStemmer('english')
def preprocess(text, stem=False):
  """Clean a piece of text and return its kept tokens as one string.

  The input is converted with ``str()``, lower-cased, and every match of
  ``TEXT_CLEANING_RE`` is replaced by a space. The result is split on
  whitespace and tokens found in ``stop_words`` are discarded. Because
  non-alphanumeric characters (including apostrophes) become spaces,
  contractions are split into fragments, e.g. "can't" -> "can", "t".

  Args:
    text: value to clean; non-string values go through ``str()`` first.
    stem: when truthy, each kept token is passed through ``stemmer.stem``.

  Returns:
    The kept tokens joined by single spaces ("" when nothing is left).
  """
  cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
  kept = [word for word in cleaned.split() if word not in stop_words]
  if stem:
    kept = [stemmer.stem(word) for word in kept]
  return ' '.join(kept)




In [14]:
# Work on a copy so the text cleaning that follows does not modify raw_data.
df = raw_data.copy()

In [15]:
# Clean every row's text with preprocess() using its defaults (no stemming).
df['text'] = df['text'].apply(preprocess)

In [16]:
# Spot-check the first ten cleaned texts.
df.text.head(10)

0           awww s bummer shoulda got david carr day d
1    upset t update facebook texting cry result sch...
2         dived times ball managed save 50 rest bounds
3                           body feels itchy like fire
4                                   s behaving m mad t
5                                                 crew
6                                             need hug
7    hey long time yes rains bit bit lol m fine tha...
8                                          nope didn t
9                                            que muera
Name: text, dtype: object

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable.
df_train,df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [19]:
# Size of the held-out split as (rows, columns).
df_test.shape

(320000, 6)

In [20]:
import gensim

In [21]:
# Tokenised training corpus: one list of whitespace-separated tokens per
# cleaned training text. This is what the Word2Vec model is built from.
document = [tweet.split() for tweet in df_train.text]

In [22]:
# Word2Vec model created without a corpus (vocabulary and training happen in
# the following cells): 300-dimensional vectors, context window of 7,
# min_count=10 as the word-frequency cut-off, 8 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm against the installed gensim version.
w2v_model = gensim.models.word2vec.Word2Vec(size=300,
                                            window=7,
                                            min_count=10,
                                            workers=8)

In [23]:
# Scan the tokenised training corpus to build the model's vocabulary.
w2v_model.build_vocab(document)

In [24]:
# Train the embeddings on the training corpus for 32 epochs.
w2v_model.train(document,total_examples=len(document),epochs=32)

(244063780, 282369536)

In [25]:
# Nearest neighbours of 'depress' in the learned embedding space.
# Queried through `.wv` (the model's KeyedVectors): calling most_similar on
# the model object itself is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('depress')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('hate', 0.37013694643974304),
 ('anymore', 0.2999071478843689),
 ('understand', 0.29864436388015747),
 ('depressed', 0.28786057233810425),
 ('ekk', 0.2873992323875427),
 ('confuse', 0.28392714262008667),
 ('know', 0.2736774981021881),
 ('hopeless', 0.2640218734741211),
 ('sad', 0.25586459040641785),
 ('think', 0.25119641423225403)]

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [27]:
# Fit the Keras tokenizer on the training text only, so the word index is
# built without looking at the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

In [28]:
# +1 so that the largest index in word_index is still a valid row of the
# embedding matrix (Keras word indices start at 1; 0 is left for padding).
vocab_size = len(tokenizer.word_index) + 1

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Convert each text to a sequence of word indices and pad/truncate it to 300
# positions. The same length is used as the Embedding layer's input_length.
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.text), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.text), maxlen=300)

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
# Label names that occur in the data, plus 'NEUTRAL', which the target
# mapping defines but which does not occur in the loaded rows.
labels = df.target.unique().tolist() + ['NEUTRAL']

In [33]:
# Show the resulting label list.
labels

['NEGATIVE', 'POSITIVE', 'NEUTRAL']

In [34]:
# Fit the encoder on the distinct label strings present in the training split.
le = LabelEncoder()
le.fit(df_train.target.unique().tolist())

LabelEncoder()

In [35]:
# Encode the string labels of both splits as integer class ids.
y_train = le.transform(df_train.target.tolist())
y_test = le.transform(df_test.target.tolist())

In [36]:
# Reshape the label vectors to column form, (n_samples, 1), in line with the
# single-unit output layer of the model.
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

In [37]:
# One 300-dimensional row per tokenizer index, initialised to zeros.
embedding_matrix = np.zeros(shape=(vocab_size,300))

In [38]:
# Copy the Word2Vec vector of every tokenizer word into its row of the
# embedding matrix. Words the Word2Vec model does not know keep their
# all-zero row.
for word, i in tokenizer.word_index.items():
  if word not in w2v_model.wv:
    continue
  embedding_matrix[i] = w2v_model.wv[word]

In [39]:
# Sanity check: expected to be (vocab_size, 300).
embedding_matrix.shape

(290272, 300)

In [40]:
from tensorflow.keras.layers import Embedding,LSTM,Conv1D,MaxPool1D,Dropout
from tensorflow.keras.models import Model

In [41]:
# Embedding layer initialised from the Word2Vec matrix and frozen
# (trainable=False). input_length equals the maxlen used when padding the
# input sequences.
embedding_layer = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    trainable=False,
    input_length=300,
)

In [42]:
from tensorflow.keras.layers import Dense,Input

In [43]:
from tensorflow.keras.models import Sequential

In [44]:
# Binary sentiment classifier: frozen embeddings -> dropout -> LSTM -> one
# sigmoid unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [45]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          87081600  
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 87,242,101
Trainable params: 160,501
Non-trainable params: 87,081,600
_________________________________________________________________


In [46]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked so that 'val_accuracy' is available to the callbacks.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping

In [48]:
# Training callbacks:
#  - reduce the learning rate when val_loss has not improved for 5 epochs;
#  - stop when val_accuracy has not improved by at least 1e-4 for 5 epochs.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
]

In [49]:
# Train for up to 8 epochs in batches of 1024, holding out 10% of the
# training data for validation; `r` keeps the returned training history.
r = model.fit(
    x_train,
    y_train,
    batch_size=1024,
    epochs=8,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
