In [2]:
import pandas as pd
import tensorflow as tf
import csv
import random
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers


In [3]:
#reading in the training dataset
df_train=pd.read_csv('/Users/chidam/Desktop/kaggle_tweet sentiment extraction/tweet_sentiment_train.csv')

In [4]:
#check the total number of records in the dataset

len(df_train)

27481

In [5]:
#checking the first few records
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
max_len=141
vocab_size = 25000
embedding_dim = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [None]:
#What I am going to do in this notebook
I am going to transform the sentiment column to numerics and then concatenate with the rest of the training dataset.
Also going to assign selected_text as the target column. In this notebook, I am just going to transform the words to their
vectors and use. My goal is to incorporate a lot of meaningful features in the future submissions.


In [10]:
y=df_train[['textID','selected_text']]

In [20]:
y

Unnamed: 0,textID,selected_text
0,cb774db0d1,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD
2,088c60f138,bullying me
3,9642c003ef,leave me alone
4,358bd9e861,"Sons of ****,"
...,...,...
27476,4eac33d1c0,d lost
27477,4f4c4fc327,", don`t force"
27478,f67aae2310,Yay good for both of you.
27479,ed167662a5,But it was worth it ****.


In [12]:
df_train.drop('selected_text', axis=1, inplace=True)

In [13]:
#after the above transformations, the training dataset becomes
df_train.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [16]:
# df_train.loc[df_train['sentiment']=='neutral', 0]
# df_train.loc[df_train['sentiment']=='negative', -1]
# df_train.loc[df_train['sentiment']=='positive', 1]

Why not consider this dictionary the same as word_index dictionary? Therefore use the tokenizer to tranform the values in the
sentiment column?

d_sentiment={'neutral':0, 'negative':-1, 'positive':1}
            

In [21]:
#Let's find the maximum length of a sentence in the text column

def find_maxlength(l):

    max_len=0
    for i in l:
        max_len=max(max_len, len(str(i)))        
    return max_len
    


In [18]:
find_maxlength(df_train['text'])

141

In [22]:
find_maxlength(y['selected_text'])

141

In [23]:
find_maxlength(y['textID'])

10

In [24]:
#Let's transform the sentiment column in the dataset by assigning to s
s=df_train[['textID','sentiment']]

In [25]:
#dropping the sentiment column from the dataset
df_train.drop('sentiment', axis=1, inplace=True)

In [27]:
#re-mapping the values in the sentiment column by using the values in the dictionary: d_sentiment
s['sentiment'].replace(d_sentiment, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [33]:
s.isna()*100/len(s)

Unnamed: 0,textID,sentiment
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
27476,0.0,0.0
27477,0.0,0.0
27478,0.0,0.0
27479,0.0,0.0


In [34]:
percent_missing = s.isnull().sum() * 100 / len(s)
missing_value_s = pd.DataFrame({'column_name': s.columns,
                                 'percent_missing': percent_missing})

In [35]:
#there are no null values in s after the above transformation using the dictionary
missing_value_s

Unnamed: 0,column_name,percent_missing
textID,textID,0.0
sentiment,sentiment,0.0


In [38]:
training_portion=0.98
train_size = int(len(df_train['text'])*training_portion)



In [39]:
train_size

26931

In [40]:
y.head()

Unnamed: 0,textID,selected_text
0,cb774db0d1,"I`d have responded, if I were going"
1,549e992a42,Sooo SAD
2,088c60f138,bullying me
3,9642c003ef,leave me alone
4,358bd9e861,"Sons of ****,"


In [41]:
s.head()

Unnamed: 0,textID,sentiment
0,cb774db0d1,0
1,549e992a42,-1
2,088c60f138,-1
3,9642c003ef,-1
4,358bd9e861,-1


In [42]:
#split train, validation sequences and also the train and validation labels


train_sentences = df_train['text'][:train_size]
train_labels = y['selected_text'][:train_size]

validation_sentences = df_train['text'][train_size:]
validation_labels = y['selected_text'][train_size:]


train2_sentences=s['sentiment'][:train_size]#to concatenate with the train set

validation2_sentences = s['sentiment'][train_size:]#to concatenate with the validation set 


In [62]:
#to convert every entry in the train and validation sentences and target sentences into string type
def change_type_sentence(x, y, c, z):
    train=[]
    train_lab=[]
    validation=[]
    validation_lab=[]
    
    for i in x:
        train.append(str(i).lstrip().rstrip().replace("  ", " "))
        
    for j in y:
        train_lab.append(str(j).lstrip().rstrip().replace("  ", " "))
        
    for v in c:
        validation.append(str(v).lstrip().rstrip().replace("  ", " "))
        
    for k in z:
        validation_lab.append(str(k).lstrip().rstrip().replace("  ", " "))
        
    return train, train_lab, validation, validation_lab
        
        

In [63]:
train_sentence, train_label, validation_sentence, validation_label=change_type_sentence(train_sentences, train_labels, validation_sentences, validation_labels)



In [60]:
#trial version with the train_sentences


def change_type_sentence(x):
    train=[]
    for i in x:
#         i = i.replace("  ", " ")
        
        train.append(str(i).lstrip().rstrip().replace("  ", " "))
        
    return train
        

In [64]:
# change_type_sentence(train_sentences)

In [66]:
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentence)

train_sequences = tokenizer.texts_to_sequences(train_sentence)
train_padded = pad_sequences(train_sequences, maxlen=max_len)

validation_sequences = tokenizer.texts_to_sequences(validation_sentence)
validation_padded = pad_sequences(validation_sequences,maxlen=max_len)

#while running this received the following error:
#ValueError: invalid literal for int() with base 10: ' Why so expensive' 
#I think removing spaces after quotations would solve the problem



In [67]:
#In this case in point, we have to transform the target texts and pad them appropriately 

train_labels = tokenizer.texts_to_sequences(train_label)
train_labels_padded = pad_sequences(train_labels, maxlen=max_len)

validation_labels = tokenizer.texts_to_sequences(validation_label)
validation_labels_padded = pad_sequences(validation_labels,maxlen=max_len)

In [75]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 141, 16)           400000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                9600      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 409,997
Trainable params: 409,997
Non-trainable params: 0
_________________________________________________________________


In [74]:
train_padded.shape

(26931, 141)

In [73]:
np.array(train2_sentences).shape

(26931,)

In [71]:
#turns out it's not possible to concatenate the embedding array and the numerical column as their shape differs
train_padded+np.array(train2_sentences)

ValueError: operands could not be broadcast together with shapes (26931,141) (26931,) 

In [None]:
#now is the time to concatenate the transformed sentiment column with the train dataset

#well turns out that we add numerical vector columns as metadata to the padded train dataset. Maybe this is just one of the
# ways. Will research more to understand. it's really interesting!



In [None]:
#
# nlp_input = Input(shape=train_padded.shape, name='nlp_input')
# meta_input = Input(shape=(26931,), name='meta_input')
# emb = Embedding(output_dim=embedding_size, input_dim=vocab_size, input_length=max_len)(nlp_input)
# nlp_out = Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer=regularizers.l2(0.01)))(emb)
# x = concatenate([nlp_out, meta_input])
# x = Dense(classifier_neurons, activation='relu')(x)
# x = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[nlp_input , meta_input], outputs=[x])




In [85]:
from keras.layers.merge import concatenate

sentiment_data=np.array(train2_sentences)

nlp_input = tf.keras.Input(shape=train_padded.shape, name='nlp_input')
meta_input = tf.keras.Input(shape=sentiment_data.shape, name='meta_input')
x = concatenate([nlp_input, meta_input])
num_epochs = 5


history = model.fit(x, np.array(train_labels_padded), epochs=num_epochs, validation_data=(np.array(validation_padded), np.array(validation_labels_padded)), verbose=2)



ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 26931, 141), (None, 26931)]

In [86]:
from keras.layers import Input, Dense, Dropout, concatenate
from keras.models import Model

x1 = Input(shape=train_padded.shape)
x2 = Input(shape=sentiment_data.shape)


input_layer = concatenate([x1,x2])

hidden_layer = Dense(units=6, activation='relu')(input_layer)
prediction = Dense(6, activation='sigmoid')(hidden_layer)

model = Model(inputs=input_layer,outputs=prediction)

model.summary()

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 26931, 141), (None, 26931)]

In [91]:
np.array(train_padded).shape

(26931, 141)

In [92]:
np.array(validation_padded).shape

(550, 141)

In [93]:
train_labels_padded.shape

(26931, 141)

In [94]:
validation_labels_padded.shape

(550, 141)

In [88]:
num_epochs = 5

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(6, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


history = model.fit(np.array(train_padded),, epochs=num_epochs, validation_data=(np.array(validation_padded), np.array(validation_labels_padded)), verbose=2)



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 141, 16)           400000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                9600      
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 390       
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 42        
Total params: 410,032
Trainable params: 410,032
Non-trainable params: 0
_________________________________________________________________


ValueError: A target array with shape (26931, 141) was passed for an output of shape (None, 6) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.