# TWEET SENTIMENT EXTRACTION

**Kaggle competition**: [Tweet Sentiment Extraction](https://www.kaggle.com/c/tweet-sentiment-extraction)

Each tweet is labelled as neutral, positive or negative. The task consists of extracting the part of the tweet that makes it neutral, positive or negative.

**Example**

INPUT:

tweet sentence:"  SWEEEEET - San Fran is awesome!!!!  Love it there "

*it is known to be defined as positive*

OUTPUT:

it is positive because it contains: " Love it there "

# Import libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split,StratifiedKFold,KFold

# Data

**train.csv** contains 4 columns:
1.  Tweet ID
2.  text: tweet sentence
3.  selected_text: part of the tweet which defines the sentiment
4.  sentiment: positive, neutral or negative

**test.csv** contains 3 columns:
1.  Tweet ID
2.  text: tweet sentence
3.  sentiment: positive, neutral or negative

In [2]:
PATH_DATA="Data/"

In [3]:
def corpus_extraction(file):
    """Read a CSV file and return its data rows, skipping the header line.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list with one entry per data row; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted tweets are kept intact; the explicit encoding
    # makes the result independent of the platform's default locale.
    with open(file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # drop the header row (no-op on an empty file)
        return list(reader)

def corpus_selection(corpus,selection,selection_column):
    """Return the rows of ``corpus`` whose sentiment field equals ``selection``.

    Args:
        corpus: Sequence of rows (each row is an indexable sequence of fields).
        selection: Sentiment value to keep, e.g. "positive", "neutral" or
            "negative".
        selection_column: Position, inside each row, of the sentiment field.

    Returns:
        A new list holding the matching rows in their original order.
    """
    return [entry for entry in corpus if entry[selection_column] == selection]

def separate(corpus):
    """Split training rows into parallel lists.

    Args:
        corpus: Iterable of rows whose field 0 is the tweet ID, field 1 the
            tweet text and field 2 the selected part of the tweet.

    Returns:
        A tuple ``(sentence, index, sentence_label)`` of three lists holding,
        row by row, field 1, field 0 and field 2.
    """
    rows = list(corpus)
    sentence = [entry[1] for entry in rows]
    index = [entry[0] for entry in rows]
    sentence_label = [entry[2] for entry in rows]
    return sentence, index, sentence_label

def separate_test(corpus):
    """Split test rows into parallel lists.

    Args:
        corpus: Iterable of rows whose field 0 is the tweet ID and field 1
            the tweet text.

    Returns:
        A tuple ``(sentence, index)`` of two lists holding, row by row,
        field 1 and field 0.
    """
    rows = list(corpus)
    sentence = [entry[1] for entry in rows]
    index = [entry[0] for entry in rows]
    return sentence, index


In [4]:
## Corpus creation: every data row of each CSV file as a list of string fields
corpus=corpus_extraction(PATH_DATA+"train.csv")
corpus_test=corpus_extraction(PATH_DATA+"test.csv")

## TRAINING DATA
## Corpus separation according to sentiment (read from column 3 of each training row)
corpus_neutral=corpus_selection(corpus,selection='neutral',selection_column=3)
corpus_positive=corpus_selection(corpus,selection='positive',selection_column=3)
corpus_negative=corpus_selection(corpus,selection='negative',selection_column=3)

## Extraction of tweet text, tweet ID and selected text for each sentiment
sentence_neutral,index_neutral,sentence_label_neutral=separate(corpus_neutral)
sentence_positive,index_positive,sentence_label_positive=separate(corpus_positive)
sentence_negative,index_negative,sentence_label_negative=separate(corpus_negative)

# These counts are kept because the concatenated lists below no longer
# record where the positive tweets end and the negative ones begin.
number_positive=len(sentence_positive)
number_negative=len(sentence_negative)

# Positive and negative tweets are handled together: positives first, then negatives
sentence=sentence_positive+sentence_negative
index=index_positive+index_negative
sentence_label=sentence_label_positive+sentence_label_negative


## TEST DATA
## Same separation for the test rows (sentiment is read from column 2 there)
corpus_test_neutral=corpus_selection(corpus_test,selection='neutral',selection_column=2)
corpus_test_positive=corpus_selection(corpus_test,selection='positive',selection_column=2)
corpus_test_negative=corpus_selection(corpus_test,selection='negative',selection_column=2)

sentence_test_neutral,index_test_neutral=separate_test(corpus_test_neutral)
sentence_test_positive,index_test_positive=separate_test(corpus_test_positive)
sentence_test_negative,index_test_negative=separate_test(corpus_test_negative)

number_test_positive=len(sentence_test_positive)
number_test_negative=len(sentence_test_negative)

# Same ordering as for the training data: positives first, then negatives
sentence_test=sentence_test_positive+sentence_test_negative
index_test=index_test_positive+index_test_negative

# The per-sentiment positive/negative lists are not needed once concatenated
del sentence_positive,sentence_test_positive
del sentence_negative,sentence_test_negative
del index_positive,index_test_positive
del index_negative,index_test_negative
del sentence_label_positive
del sentence_label_negative

# SCORE: word-level Jaccard score

In [5]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    in either string. When neither string contains a word the score is 0.0.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    shared = len(words_a & words_b)
    # |A| + |B| - |A and B| is the size of the union, hence never negative.
    union_size = len(words_a) + len(words_b) - shared
    if union_size == 0:
        return 0.0
    return float(shared) / union_size
    
def all_jaccard(sentence_label,sentence_label_pred):
    """Return the mean Jaccard score between true and predicted selections.

    Args:
        sentence_label: Sequence of reference strings.
        sentence_label_pred: Sequence of predicted strings, indexed in
            parallel with ``sentence_label``.

    Returns:
        The sum of the pairwise ``jaccard`` scores divided by
        ``len(sentence_label)``.
    """
    total = 0
    for position, truth in enumerate(sentence_label):
        total += jaccard(truth, sentence_label_pred[position])
    return total / len(sentence_label)

# Baseline: score obtained when the whole tweet is used as the predicted
# selected text, computed separately for positive+negative and for neutral tweets.
n_pn=len(sentence)
score_pn=all_jaccard(sentence_label,sentence)
n_n=len(sentence_neutral)
score_n=all_jaccard(sentence_label_neutral,sentence_neutral)

print("Number of sentences positive and negative: ",n_pn)
print("initial prediction score for positive and negative: ",score_pn)
print("\n")
print("Number of sentences neutral: ",n_n)
print("initial prediction score for neutral: ",score_n)
print("\n")
# Average of the two group scores, weighted by the number of tweets in each group
print("Expected initial score: ",((n_pn*score_pn+n_n*score_n)/(n_pn+n_n)))

Number of sentences positive and negative:  16363
initial prediction score for positive and negative:  0.32589890326672905


Number of sentences neutral:  11118
initial prediction score for neutral:  0.9763568439593936


Expected initial score:  0.5890549523414005


Note that simply predicting the whole tweet as the selected text (sentence_label equal to sentence) already gives an initial score of 0.589. The best public score obtained so far in the Kaggle competition is 0.731, so there is little margin for improvement (about 0.14).

# Tokenizer

In [114]:
def separate_filter(sentence):
    """Surround every listed punctuation sign with "/" characters.

    Args:
        sentence: A single string, or a list of strings.

    Returns:
        For a string, the transformed string. For a list, a copy of the list
        in which every element has been transformed; the input list itself
        is left untouched.
    """
    sign_included='!"#$%&()+,-;<=>?@^_`{|}~'
    # "/" is not one of the signs, so a single translation pass gives the
    # same text as replacing the signs one after another.
    table = str.maketrans({sign: "/" + sign + "/" for sign in sign_included})
    if isinstance(sentence, str):
        return sentence.translate(table)
    wrapped = sentence.copy()
    for position, text in enumerate(wrapped):
        wrapped[position] = text.translate(table)
    return wrapped

def to_sequence(sentence,tokenizer):
    """Tokenize one sentence or a list of sentences.

    The text first goes through ``separate_filter``; a single string is
    wrapped into a one-element list before being handed to the tokenizer.

    Returns:
        Whatever ``tokenizer.texts_to_sequences`` returns for the list of
        filtered texts (one sequence per sentence).
    """
    filtered = separate_filter(sentence)
    texts = [filtered] if isinstance(filtered, str) else filtered
    return tokenizer.texts_to_sequences(texts)

def to_sentence(sequence):
    """Turn a sequence of token indices back into a space-separated string.

    Indices missing from the module-level ``reverse_word_index`` mapping are
    rendered as '?'.
    """
    words = []
    for token in sequence:
        words.append(reverse_word_index.get(token, '?'))
    return ' '.join(words)

def create_model_input_pn(sentence,tokenizer,number_positive,number_negative,length=-1):
    """Build the model input for positive and negative sentences.

    Args:
        sentence: List of sentences, the positive ones first and the negative
            ones after them.
        tokenizer: Tokenizer used to convert the sentences to index sequences.
        number_positive: Number of positive sentences at the start of the list.
        number_negative: Number of negative sentences that follow.
        length: Padded sequence length; -1 pads to the longest sequence.

    Returns:
        An array of shape (n_sentences, length, 2): channel 0 holds the
        post-padded token indices, channel 1 repeats the vocabulary index of
        the word 'positive' or 'negative' along the whole sequence.
    """
    tokens = to_sequence(sentence, tokenizer)
    if length == -1:
        padded = pad_sequences(tokens, padding='post', truncating='post')
        length = len(padded[0])
    else:
        padded = pad_sequences(tokens, maxlen=length, padding='post', truncating='post')
    # One constant row per sentence, filled with the index of its sentiment word.
    positive_rows = np.zeros((number_positive, length)) + word_index['positive']
    negative_rows = np.zeros((number_negative, length)) + word_index['negative']
    sentiment = np.vstack((positive_rows, negative_rows))
    token_channel = np.array(padded).reshape(-1, length, 1)
    sentiment_channel = sentiment.reshape(-1, length, 1)
    return np.concatenate([token_channel, sentiment_channel], axis=2)

def create_model_input_n(sentence,tokenizer,length=-1):
    """Build the model input for neutral sentences.

    Args:
        sentence: List of neutral sentences.
        tokenizer: Tokenizer used to convert the sentences to index sequences.
        length: Padded sequence length; -1 pads to the longest sequence.

    Returns:
        An array of shape (n_sentences, length, 2): channel 0 holds the
        post-padded token indices, channel 1 repeats the vocabulary index of
        the word 'neutral' along the whole sequence.
    """
    tokens = to_sequence(sentence, tokenizer)
    if length == -1:
        padded = pad_sequences(tokens, padding='post', truncating='post')
        length = len(padded[0])
    else:
        padded = pad_sequences(tokens, maxlen=length, padding='post', truncating='post')
    # One constant row per sentence, filled with the index of the word 'neutral'.
    sentiment = np.zeros((len(sentence), length)) + word_index['neutral']
    token_channel = np.array(padded).reshape(-1, length, 1)
    sentiment_channel = sentiment.reshape(-1, length, 1)
    return np.concatenate([token_channel, sentiment_channel], axis=2)

def print_example(sentence,sequence,sentence_label,example_number):
    """Print the sentence, token sequence and label of one example.

    Args:
        sentence: Indexable collection of sentences.
        sequence: Indexable collection of token sequences.
        sentence_label: Indexable collection of selected texts.
        example_number: Position of the example to display.
    """
    k = example_number
    print('sentence:"', sentence[k], '"')
    print("sequence: ", sequence[k])
    print('sentence label:"', sentence_label[k], '"')
    
    

In [115]:
## Tokenizer creation
# FILTERS is passed as the tokenizer's `filters` argument. It contains "/",
# the character that separate_filter wraps around punctuation signs.
FILTERS='./:[\\]\t\n'
tokenizer = Tokenizer(oov_token="<OOV>",filters=FILTERS)
# The vocabulary is fitted on training AND test tweets; the extra text
# "neutral" ensures that word_index['neutral'] exists.
# NOTE(review): word_index['positive'] and word_index['negative'] are also
# looked up later; presumably those words occur in the tweets — confirm.
tokenizer.fit_on_texts(separate_filter(sentence+sentence_test+sentence_neutral+["neutral"]))

# word_index contains a dictionary with the index of each token
word_index = tokenizer.word_index
# reverse_word_index contains a dictionary with the token of each index
reverse_word_index=dict([(value,key) for (key,value) in word_index.items()])

# Number of tokens
vocab_size=len(word_index)

# Fixed sequence length: every tokenized tweet is padded or truncated to it
length=103
#initial_length=63
# Inputs to model (positive and negative tweets, training and test)
x=create_model_input_pn(sentence,tokenizer,number_positive,number_negative,length=length)
x_test=create_model_input_pn(sentence_test,tokenizer,number_test_positive,number_test_negative,length=length)

# Inputs to model (neutral tweets, training and test)
x_neutral=create_model_input_n(sentence_neutral,tokenizer,length=length)
x_test_neutral=create_model_input_n(sentence_test_neutral,tokenizer,length=length)


# Labels for the model

In [116]:
def start_label(sentence,sentence_label,tokenizer):
    """Return the token position at which ``sentence_label`` starts.

    Both texts go through ``separate_filter``; the result is the number of
    tokens the tokenizer produces for the part of the filtered sentence that
    precedes the first occurrence of the filtered label.
    """
    filtered_label = separate_filter(sentence_label)
    filtered_sentence = separate_filter(sentence)
    label_offset = filtered_sentence.find(filtered_label)
    prefix_tokens = tokenizer.texts_to_sequences([filtered_sentence[:label_offset]])[0]
    return len(prefix_tokens)

def end_label(sentence,sentence_label,tokenizer):
    """Return the token position at which ``sentence_label`` ends.

    Both texts go through ``separate_filter``; the filtered sentence is cut
    one character past the end of the first occurrence of the filtered label,
    and the result is the number of tokens in that prefix minus one.
    """
    filtered_label = separate_filter(sentence_label)
    filtered_sentence = separate_filter(sentence)
    label_offset = filtered_sentence.find(filtered_label)
    cut = label_offset + len(filtered_label) + 1
    prefix_tokens = tokenizer.texts_to_sequences([filtered_sentence[:cut]])[0]
    return len(prefix_tokens) - 1

def y_label(sentence,sentence_label,tokenizer,length):
    """Compute the start/end token labels of every sentence.

    Args:
        sentence: List of sentences.
        sentence_label: List of selected texts, parallel to ``sentence``.
        tokenizer: Tokenizer forwarded to ``start_label`` / ``end_label``.
        length: Number of columns of the returned mask.

    Returns:
        A tuple ``(y_start, y_end, y)``: two lists with the values returned
        by ``start_label`` and ``end_label`` for each sentence, and an array
        of shape (len(sentence), length) holding 1 in columns
        ``start:end`` of each row and 0 elsewhere.
    """
    y_start=[]
    y_end=[]
    y=np.zeros((len(sentence),length))
    for i, text in enumerate(sentence):
        label = sentence_label[i]
        # Each position requires a tokenizer pass, so compute it only once.
        first = start_label(text, label, tokenizer)
        last = end_label(text, label, tokenizer)
        y_start.append(first)
        y_end.append(last)
        # NOTE(review): the slice stops before column `last`, while the end
        # label appears to be the index of the last selected token — confirm
        # whether the mask is meant to include that column.
        y[i, first:last] += 1
    return y_start,y_end,y


# Start/end token labels and masks for the positive+negative tweets and,
# separately, for the neutral tweets.
y_start,y_end,y=y_label(sentence,sentence_label,tokenizer,length)
y_start_neutral,y_end_neutral,y_neutral=y_label(sentence_neutral,sentence_label_neutral,tokenizer,length)

In [117]:
def y2text(sentence,y,tokenizer):
    """Extract from ``sentence`` the text covered by a token span.

    Args:
        sentence: The tweet text (a single string).
        y: Mutable two-element sequence ``[start, end]`` of token positions.
            It is clamped IN PLACE to the tokens actually present in the
            sentence.
        tokenizer: Tokenizer forwarded to ``to_sequence``.

    Returns:
        The substring of ``sentence`` that tokenizes to the tokens
        ``start..end`` (inclusive), with leading spaces and tokenizer filter
        characters removed. The result may be the empty string.
    """
    n1=len(sentence)
    nq1=len(to_sequence(sentence,tokenizer)[0])
    # Clamp the span to the nq1 tokens of the sentence and force end >= start.
    if y[0]>=nq1:
        y[0]=nq1-1
    if y[1]<=y[0]:
        y[1]=y[0]
    if y[1]>=nq1:
        y[1]=y[0]
    if y[0]>0:
        # Drop leading characters until the remainder tokenizes to exactly
        # the tokens from `start` onwards.
        cont=0
        while cont<n1:
            seq_aux=to_sequence(sentence[cont:],tokenizer)
            if len(seq_aux[0])==nq1-y[0]:
                sentence=sentence[cont:]
                break
            cont=cont+1
        n1=len(sentence)
    # Drop trailing characters until the remainder tokenizes to exactly the
    # number of tokens in the span.
    cont=n1
    while cont>0:
        seq_aux=to_sequence(sentence[:cont],tokenizer)
        if len(seq_aux[0])==y[1]-y[0]+1:
            sentence=sentence[:cont]
            break
        cont=cont-1
    # lstrip removes the same leading characters as a character-by-character
    # loop, but does not fail when the text is empty or made only of them.
    return sentence.lstrip(FILTERS+" ")



# Embedding matrix

In [118]:
# embeddings_index maps each word of the pre-trained embedding file to its vector
embeddings_index = {};
embedding_dim=100
with open('glove.6B.100d.txt',encoding='UTF-8') as f:
#with open('glove.42B.300d.txt',encoding='UTF-8') as f:
    for line in f:
        # Each line is read as: the word, followed by its coefficients
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

# Row i of embeddings_matrix receives the vector of the token whose index is i;
# tokens without a pre-trained vector keep an all-zero row.
# The matrix has one more row than there are tokens — presumably because
# tokenizer indices start at 1 and 0 is the padding value; confirm.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

## Model

In [119]:
def convbatchrelu(model,f,k,drop_prob=0):
    """Append a Conv1D -> BatchNormalization -> ReLU (-> Dropout) stack.

    Args:
        model: Tensor to which the layers are applied.
        f: Number of convolution filters.
        k: Convolution kernel size.
        drop_prob: Dropout rate; no dropout layer is added unless it is
            greater than 0.

    Returns:
        The output tensor of the last layer added.
    """
    layers = tf.keras.layers
    features = layers.Conv1D(f, k, padding='same')(model)
    features = layers.BatchNormalization()(features)
    features = layers.ReLU()(features)
    if drop_prob > 0:
        return layers.Dropout(drop_prob)(features)
    return features

In [155]:
def create_model(seed):
    """Build and compile the span-prediction model.

    The network reads the module-level ``length``, ``vocab_size``,
    ``embedding_dim`` and ``embeddings_matrix``. It takes an input of shape
    (length, 2) and returns two outputs named 'start' and 'end', each with
    one value per sequence position.

    Args:
        seed: Value passed to ``tf.random.set_seed`` before the layers are
            created.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    ## CREATES A MODEL
    ## seed: random seed for the model
    
    tf.random.set_seed(seed)
    drop_prob=0.5  # Dropout rate
    lambda_reg=0   # L2 regularization lambda (0: the regularizers below have no effect)

    ## INPUT layer: channel 0 is embedded as the tweet tokens, channel 1 as the sentiment word
    inputs = tf.keras.layers.Input(shape = (int(length),2))

    #mask = tf.keras.layers.Reshape((length,1))(inputs[:,:,2])

    
    ## Embedding of the tweet tokens, initialised with the pre-trained matrix
    model = tf.keras.layers.Embedding(vocab_size+1, embedding_dim, weights=[embeddings_matrix], trainable=True)(inputs[:,:,0])
    
    ## model multiplied by the embedding of the special word: "positive", "negative" or "neutral"
    ## (a second, separately trainable embedding layer applied to channel 1)
    positive=tf.keras.layers.Embedding(vocab_size+1, embedding_dim, weights=[embeddings_matrix], trainable=True)(inputs[:,:,1])
    model = tf.keras.layers.Multiply()([model, positive])

    model = tf.keras.layers.Dropout(drop_prob)(model)
    # out1 is kept for the residual additions below
    out1=model
    ## First block: BiLSTM, then Dense back to embedding_dim so it can be added to out1
    model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim,return_sequences=True,kernel_regularizer=regularizers.l2(lambda_reg)))(model)
    model = tf.keras.layers.Dense(embedding_dim,kernel_regularizer=regularizers.l2(lambda_reg))(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.ReLU()(model)
    model = tf.keras.layers.Dropout(drop_prob)(model)
    model = tf.keras.layers.Add()([model,out1])

    # out2 is the output of the first block, also reused in the final addition
    out2=model
    ## Second block: same layer sequence as the first
    model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim,return_sequences=True,kernel_regularizer=regularizers.l2(lambda_reg)))(model)
    model = tf.keras.layers.Dense(embedding_dim,kernel_regularizer=regularizers.l2(lambda_reg))(model)
    model = tf.keras.layers.BatchNormalization()(model)
    model = tf.keras.layers.ReLU()(model)
    model = tf.keras.layers.Dropout(drop_prob)(model)

    model = tf.keras.layers.Add()([model,out1,out2])
    

    ## Output heads: one value per position, flattened to a vector per head
    # NOTE(review): the heads use a per-position sigmoid while the loss is
    # categorical_crossentropy over the positions — confirm that a softmax
    # over the sequence was not intended.
    output_start = tf.keras.layers.Dense(1,activation='sigmoid',kernel_regularizer=regularizers.l2(lambda_reg))(model)
    output_end = tf.keras.layers.Dense(1,activation='sigmoid',kernel_regularizer=regularizers.l2(lambda_reg))(model)
    output_start = tf.keras.layers.Flatten(name='start')(output_start)
    output_end = tf.keras.layers.Flatten(name='end')(output_end)

    model = tf.keras.Model(inputs=inputs, outputs=[output_start,output_end])
    #model.summary()

    ## Compilation
    model.compile(optimizer = tf.optimizers.Adam(),
                  loss = 'categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

#create_model(1)

In [156]:
def create_callbacks(checkpoint_filepath):
    """Create the list of Keras callbacks used during training.

    Args:
        checkpoint_filepath: Path to which the checkpoint callback saves the
            model weights.

    Returns:
        A list ``[ReduceLROnPlateau, EarlyStopping, ModelCheckpoint]``; all
        three monitor the 'val_start_accuracy' metric.
    """
    monitored = 'val_start_accuracy'
    callbacks = tf.keras.callbacks

    # Halve the learning rate after 4 epochs without improvement.
    reduce_lr = callbacks.ReduceLROnPlateau(
        monitor=monitored,
        patience=4,
        verbose=1,
        factor=0.5,
        min_lr=0.00000001,
    )

    # Stop training after 7 epochs without improvement.
    early_stop = callbacks.EarlyStopping(monitor=monitored, patience=7)

    # Save the weights only when the monitored metric reaches a new maximum.
    checkpoint = callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor=monitored,
        mode='max',
        save_best_only=True,
    )

    return [reduce_lr, early_stop, checkpoint]


In [157]:
def train_model(model,x_train,y_start_train,y_end_train,x_CV,y_start_CV,y_end_CV,
                checkpoint_filepath='/tmp/checkpoint',batchsize=128,epochs=50):
    """Fit ``model`` and reload the weights saved by the checkpoint callback.

    Args:
        model: Compiled Keras model with two outputs (start and end).
        x_train: Training inputs.
        y_start_train: Training targets for the first output.
        y_end_train: Training targets for the second output.
        x_CV: Validation inputs.
        y_start_CV: Validation targets for the first output.
        y_end_CV: Validation targets for the second output.
        checkpoint_filepath: Where the checkpoint callback stores the weights.
        batchsize: Batch size used for fitting.
        epochs: Maximum number of epochs.

    Returns:
        The same ``model`` object, with the checkpointed weights loaded.
    """
    CALLBACKS=create_callbacks(checkpoint_filepath)

    model.fit(x_train,[y_start_train,y_end_train],
              validation_data = (x_CV, [y_start_CV,y_end_CV]),
              batch_size=batchsize,
              epochs=epochs,
              verbose=1,
              callbacks=CALLBACKS)

    # Replace the last-epoch weights with those written by the checkpoint callback.
    model.load_weights(checkpoint_filepath)

    return model

In [158]:
def CV_model(model,x,sentence,sentence_label,tokenizer):
    """Print and return the mean Jaccard score of ``model`` on a data set.

    Args:
        model: Model whose ``predict(x)`` output is converted with
            ``np.array`` and arg-maxed over its last axis.
        x: Model inputs for the sentences.
        sentence: List of tweets.
        sentence_label: List of reference selected texts.
        tokenizer: Tokenizer forwarded to ``y2text``.

    Returns:
        The average, over all sentences, of the Jaccard score between the
        reference and the text rebuilt from the predicted span.
    """
    raw_scores = np.array(model.predict(x))
    # Best position per output, transposed so that each row is one sentence.
    spans = raw_scores.argmax(axis=2).transpose()
    total = 0
    for position in range(len(sentence)):
        predicted_text = y2text(sentence[position], spans[position], tokenizer)
        total = total + jaccard(sentence_label[position], predicted_text)
    score = total / len(sentence)
    print("Score:", score)
    return score
    
def test_model(listmodel,x_test,sentence_test,tokenizer):
    """Predict the selected text of every test sentence with a model ensemble.

    The scores of all models are summed element-wise before the best start
    and end positions are chosen.

    Args:
        listmodel: Non-empty iterable of trained models.
        x_test: Model inputs for the test sentences.
        sentence_test: List of test tweets, in the same order as ``x_test``.
        tokenizer: Tokenizer forwarded to ``y2text``.

    Returns:
        A list with the predicted selected text of each sentence.

    Raises:
        IndexError: If ``listmodel`` contains no model.
    """
    # Each prediction is converted to an array BEFORE summing: adding the raw
    # values with "+" would concatenate them when predict returns a Python
    # list of output arrays, and only the first model would then be used.
    # (assumes predict returns the start and end score arrays as a pair)
    predictions = [np.asarray(model.predict(x_test)) for model in listmodel]
    if not predictions:
        raise IndexError("listmodel must contain at least one model")
    summed = np.sum(predictions, axis=0)

    # Best position per output, transposed so that each row is one sentence.
    y_pred = summed.argmax(axis=2).transpose()

    sentence_label_test = []
    for i, text in enumerate(sentence_test):
        # An end position before the start position is moved onto the start.
        if y_pred[i][1] < y_pred[i][0]:
            y_pred[i][1] = y_pred[i][0]
        sentence_label_test.append(y2text(text, y_pred[i], tokenizer))
    return sentence_label_test

# RUN

In [159]:
# Stack positive+negative and neutral data; every list below is concatenated
# in that same order so rows, sentences and labels stay aligned.
X=np.vstack([x,x_neutral])
X_test=np.vstack([x_test,x_test_neutral])

SENTENCE=sentence+sentence_neutral
SENTENCE_LABEL=sentence_label+sentence_label_neutral
SENTENCE_TEST=sentence_test+sentence_test_neutral

Y_start=y_start+y_start_neutral
Y_end=y_end+y_end_neutral

# Channel 1 at the first position holds the index of the sentiment word, so it
# identifies the sentiment class of each row.
classes=X[:,0,1]




In [160]:
# 5-fold cross-validation, stratified on the sentiment class of each tweet
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
score=[]
# Created once, before the loop, so that the model of EVERY fold is kept for
# the test-time ensemble (resetting it per fold would keep only the last one).
model_list=[]
# The one-hot targets do not depend on the fold: build them a single time.
Y_start_onehot=tf.one_hot(Y_start,length).numpy()
Y_end_onehot=tf.one_hot(Y_end,length).numpy()
for kfold,(train,CV) in enumerate(skf.split(X,classes)):
    x_train=X[train,:,:]
    x_CV=X[CV,:,:]
    y_start_train=Y_start_onehot[train,:]
    y_end_train=Y_end_onehot[train,:]
    y_start_CV=Y_start_onehot[CV,:]
    y_end_CV=Y_end_onehot[CV,:]
    # Sentences and reference labels of the validation fold
    sen_CV=[]
    sen_label_CV=[]
    for i in CV:
        sen_CV.append(SENTENCE[i])
        sen_label_CV.append(SENTENCE_LABEL[i])

    # The fold number is used as the random seed of the model
    model=create_model(kfold)
    model=train_model(model,x_train,y_start_train,y_end_train,x_CV,y_start_CV,y_end_CV)
    model_list.append(model)

    score.append(CV_model(model,x_CV,sen_CV,sen_label_CV,tokenizer))

Train on 21984 samples, validate on 5497 samples
Epoch 1/50
Epoch 2/50


KeyboardInterrupt: 

In [None]:
# Debug: show the raw output of the most recently trained model on its validation fold
print(np.array(model.predict(x_CV)))

# Test and save submission file

In [None]:
# Predicted selected text for every test tweet (positive, negative, then neutral)
sentence_label_test=test_model(model_list,X_test,SENTENCE_TEST,tokenizer)  

# IDs concatenated in the same order as SENTENCE_TEST so they match the predictions
index_test_REC=index_test+index_test_neutral
sentence_label_test_REC=sentence_label_test#+sentence_test_neutral

# Submission file with the two columns textID and selected_text
REC= pd.DataFrame()
REC['textID']=index_test_REC
REC['selected_text']=sentence_label_test_REC
REC[['textID','selected_text']].to_csv('submission.csv',index=False)