# [Tweet Sentiment Extraction](https://www.kaggle.com/c/tweet-sentiment-extraction)

## Mount Google Drive

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

## Temporary

In [0]:
!pip install tokenizers >> /dev/null
!pip install transformers >> /dev/null
!pip install bert-for-tf2 >> /dev/null
# !pip install sentencepiece >> /dev/null
# !python -m spacy download en_core_web_lg >> /dev/null

%matplotlib inline

DATADIR = '/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data'
import sys
sys.path.insert(0, f"{DATADIR}/models/sentencepiece/")

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import os
import re
import string
from tqdm import tqdm
import gc
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from __future__ import unicode_literals, print_function
import spacy
import random
import plac
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import en_core_web_sm
# import en_core_web_lg

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer, AlbertTokenizer, TFBertModel, TFRobertaModel, RobertaConfig, XLMRobertaConfig, TFXLMRobertaModel
# import sentencepiece as spm
# import sentencepiece_pb2
from tensorflow.keras.preprocessing.sequence import pad_sequences

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
# from bert.tokenization.bert_tokenization import FullTokenizer

import warnings
warnings.filterwarnings("ignore")

# Load the competition CSVs from DATADIR; rows containing any missing value are
# dropped and the index is renumbered from 0.
df_train = pd.read_csv(f'{DATADIR}/train.csv').dropna().reset_index(drop=True)
df_test = pd.read_csv(f'{DATADIR}/test.csv').dropna().reset_index(drop=True)
# Selects rows that still contain NaN in either frame. The result is not stored,
# and it is not the last expression of the cell, so nothing is displayed.
df_train[df_train.isna().any(axis=1)], df_test[df_test.isna().any(axis=1)]
# (rows, columns) of both frames; also not displayed here because more code follows.
df_train.shape, df_test.shape

# How many tweets where there is no overlap between text & selected_text - basically bad labels

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string.

    Returns 0.0 when neither string contains a word, instead of raising
    ZeroDivisionError on the empty union.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Two empty / whitespace-only strings: nothing to compare.
        return 0.0
    return float(len(c)) / union_size

# Word-level Jaccard overlap between each tweet and its labelled span.
# A score of 0 means the two share no word, which is treated as a bad label.
jaccard_text_sel_text = [
    jaccard(text, selected_text)
    for text, selected_text in zip(df_train.text, df_train.selected_text)
]
df_train['jaccard_text_sel_text'] = jaccard_text_sel_text

is_bad_label = df_train.jaccard_text_sel_text == 0
bad_labels_total = int(is_bad_label.sum())
bad_labels_positive = int((is_bad_label & (df_train.sentiment == 'positive')).sum())
bad_labels_negative = int((is_bad_label & (df_train.sentiment == 'negative')).sum())
bad_labels_neutral = int((is_bad_label & (df_train.sentiment == 'neutral')).sum())

print (f'Total # of training samples = {df_train.shape[0]:,}')

print (f'Total number of bad labels = {bad_labels_total}')
print (f'Total number of positive bad labels = {bad_labels_positive:,}')
print (f'Total number of negative bad labels = {bad_labels_negative:,}')
# Label corrected: this line reports the neutral count (it used to read "negative neutral labels").
print (f'Total number of neutral bad labels = {bad_labels_neutral:,}')

# Remove rows with bad labels, then drop the helper column (no in-place mutation).
df_train = (
    df_train[df_train.jaccard_text_sel_text != 0]
    .reset_index(drop=True)
    .drop(columns=['jaccard_text_sel_text'])
)

print (f'Remaining # of training samples post removal of bad label rows = {df_train.shape[0]:,}')


# Basic text cleanup

def clean(s):
    """Lower-case a string and trim leading/trailing whitespace."""
    return s.lower().strip()

def clean_text(s):
    """Normalise a tweet: lower-case it, trim both ends, squeeze repeated spaces.

    URL, digit, punctuation and non-ASCII stripping were tried in earlier
    revisions of this cell and are deliberately not applied.
    """
    trimmed = s.lower().strip()
    # Runs of two or more literal spaces become one space; tabs and newlines are untouched.
    return re.sub('[ ]{2,}', ' ', trimmed)


def process_df(df, train):
    """Clean the text columns of a tweet DataFrame.

    Non-neutral rows go through ``clean_text``; neutral rows go through the
    lighter ``clean``.

    Args:
        df: frame with ``sentiment`` and ``text`` columns, plus
            ``selected_text`` when ``train`` is True.
        train: when equal to True, ``selected_text`` is cleaned as well.

    Returns:
        ``(text_clean, sel_text_clean)`` lists when ``train`` is True,
        otherwise just the ``text_clean`` list.
    """
    is_train = (train == True)
    text_clean, sel_text_clean = [], []
    for _, row in df.iterrows():
        # Pick the cleaner once per row; both columns of a row use the same one.
        cleaner = clean if row.sentiment == 'neutral' else clean_text
        text_clean.append(cleaner(row.text))
        if is_train:
            sel_text_clean.append(cleaner(row.selected_text))
    if is_train:
        return (text_clean, sel_text_clean)
    return (text_clean)
            
# Training frame: process_df returns two parallel lists (cleaned text, cleaned selected_text),
# stored as new columns next to the raw ones.
(text_clean, sel_text_clean) = process_df(df_train, True)
df_train['text_clean'] = text_clean
df_train['sel_text_clean'] = sel_text_clean

# Test frame: only the tweet text is cleaned. Note this rebinds text_clean to the test list.
(text_clean) = process_df(df_test, False)
df_test['text_clean'] = text_clean

In [0]:
print (tf.__version__)

## Import dataset from kaggle

In [0]:
#to access kaggle datasets
!pip install kaggle

# Colab's file access feature
from google.colab import files

# Retrieve the uploaded file(s) through Colab's upload dialog.
# The loop below treats the result as a mapping from file name to content.
uploaded = files.upload()

# Print the name and size of each uploaded file.
for fn in uploaded.keys():
  print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

# !unzip train.csv.zip -d '/content/gdrive/My Drive/Colab Notebooks/data1/Tweet_Sentiment/'
#!mv *.csv '/content/gdrive/My Drive/Colab Notebooks/data1/Tweet_Sentiment/'

## Install dependencies

In [0]:
!pip install tokenizers >> /dev/null
!pip install transformers >> /dev/null
!pip install bert-for-tf2 >> /dev/null
# !pip install sentencepiece >> /dev/null
# !python -m spacy download en_core_web_lg >> /dev/null

## Import dependencies

In [0]:
%matplotlib inline

DATADIR = '/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data'
import sys
sys.path.insert(0, f"{DATADIR}/models/sentencepiece/")

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import os
import re
import string
from tqdm import tqdm
import gc
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from __future__ import unicode_literals, print_function
import spacy
import random
import plac
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import en_core_web_sm
# import en_core_web_lg

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer, AlbertTokenizer, TFBertModel, TFRobertaModel, RobertaConfig, XLMRobertaConfig, TFXLMRobertaModel
# import sentencepiece as spm
# import sentencepiece_pb2
from tensorflow.keras.preprocessing.sequence import pad_sequences

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
# from bert.tokenization.bert_tokenization import FullTokenizer

import warnings
warnings.filterwarnings("ignore")

## Load data

In [0]:
DATADIR = '/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data'

# Load the competition CSVs from DATADIR; rows containing any missing value are
# dropped and the index is renumbered from 0.
df_train = pd.read_csv(f'{DATADIR}/train.csv').dropna().reset_index(drop=True)
df_test = pd.read_csv(f'{DATADIR}/test.csv').dropna().reset_index(drop=True)
# Selects rows that still contain NaN in either frame. The result is not stored,
# and it is not the last expression of the cell, so nothing is displayed.
df_train[df_train.isna().any(axis=1)], df_test[df_test.isna().any(axis=1)]
# Cell output: (rows, columns) of both frames.
df_train.shape, df_test.shape

## EDA

In [0]:
# How many tweets where there is no overlap between text & selected_text - basically bad labels

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string.

    Returns 0.0 when neither string contains a word, instead of raising
    ZeroDivisionError on the empty union.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Two empty / whitespace-only strings: nothing to compare.
        return 0.0
    return float(len(c)) / union_size

# Word-level Jaccard overlap between each tweet and its labelled span.
# A score of 0 means the two share no word, which is treated as a bad label.
jaccard_text_sel_text = [
    jaccard(text, selected_text)
    for text, selected_text in zip(df_train.text, df_train.selected_text)
]
df_train['jaccard_text_sel_text'] = jaccard_text_sel_text

is_bad_label = df_train.jaccard_text_sel_text == 0
bad_labels_total = int(is_bad_label.sum())
bad_labels_positive = int((is_bad_label & (df_train.sentiment == 'positive')).sum())
bad_labels_negative = int((is_bad_label & (df_train.sentiment == 'negative')).sum())
bad_labels_neutral = int((is_bad_label & (df_train.sentiment == 'neutral')).sum())

print (f'Total # of training samples = {df_train.shape[0]:,}')

print (f'Total number of bad labels = {bad_labels_total}')
print (f'Total number of positive bad labels = {bad_labels_positive:,}')
print (f'Total number of negative bad labels = {bad_labels_negative:,}')
# Label corrected: this line reports the neutral count (it used to read "negative neutral labels").
print (f'Total number of neutral bad labels = {bad_labels_neutral:,}')

# Remove rows with bad labels, then drop the helper column (no in-place mutation).
df_train = (
    df_train[df_train.jaccard_text_sel_text != 0]
    .reset_index(drop=True)
    .drop(columns=['jaccard_text_sel_text'])
)

print (f'Remaining # of training samples post removal of bad label rows = {df_train.shape[0]:,}')

In [0]:
# Check how many training samples are provided for different sentiments
# Looks to be a fairly balanced training dataset

# One figure, two side-by-side panels: number of tweets per sentiment in each split.
plt.figure(figsize=(10,5))
for panel, (panel_title, frame) in enumerate([('Train', df_train), ('Test', df_test)], start=1):
    plt.subplot(1, 2, panel)
    plt.title(panel_title)
    sns.countplot(x='sentiment', data=frame)
plt.show()

In [0]:
# Check how the "text" vs. "selected_text" varies for different sentiments
# It appears that for a majority of "neutral" tweets the text & selected_text are the same.

# Character length of a string after trimming surrounding whitespace.
char_count = lambda s: len(s.strip())
df_train['text_len'] = df_train.text.apply(char_count)
df_train['sel_text_len'] = df_train.selected_text.apply(char_count)
df_train['delta_text_sel_text_len'] = df_train.text_len - df_train.sel_text_len

# "Same" is approximated by equal trimmed length (delta == 0), not by string equality.
same_length = df_train.delta_text_sel_text_len == 0

# One report line per sentiment, in the original order. a = tweets of that
# sentiment, b = those whose text and selected_text have the same length.
for sentiment in ('neutral', 'positive', 'negative'):
    of_sentiment = df_train.sentiment == sentiment
    a = int(of_sentiment.sum())
    b = int((same_length & of_sentiment).sum())
    # Guard against a sentiment that is absent from the frame (a == 0).
    share = b / a * 100 if a else 0.0
    print (f'{b:,} out of {a:,} ({share:.2f}%) {sentiment} sentiment tweets are the same')

In [0]:
# Count words in text, selected text & compute difference

# Number of whitespace-delimited words in a string (after trimming both ends).
c = lambda s: len(s.strip().split())

df_train['text_wc'] = df_train['text'].apply(c)
df_train['seltext_wc'] = df_train['selected_text'].apply(c)
# Difference in word counts: tweet text minus selected text.
df_train['delta_text_seltext_wc'] = df_train['text_wc'] - df_train['seltext_wc']

In [0]:
# Check how the "text" vs. "selected_text" varies for non-neutral tweets with 2 to 5 words

# NOTE(review): y0 and y1 are assigned but not used anywhere in this cell.
y0 = 0
y1 = 0
plt.figure(figsize=(15,10))
for i in range (2,6):
  # Non-neutral tweets with at most i words. The filter is <=, so each panel
  # also contains the tweets counted in the previous panels.
  df_train_shortTweet = df_train[(df_train.text_wc <= i) & (df_train.sentiment != "neutral")]
  # i = 2..5 maps to panels 1..4 of the 2x2 grid.
  plt.subplot(2,2,i-1)
  plt.title(f'delta_text_seltext_wc for {i} words')
  sns.countplot(x='delta_text_seltext_wc', data=df_train_shortTweet)
plt.show()

In [0]:
# Check how the "text" vs. "selected_text" varies for non-neutral sentiments containing URLs
# There appear to be very few non-neutral selected text entries with URLs

# Heuristic URL detector: any string containing '//' counts as containing a URL.
f = lambda s: r'//' in s

df_train['URLs_text'] = df_train.text.apply(f)
df_train['URLs_sel_text'] = df_train.selected_text.apply(f)

# Counts are restricted to non-neutral tweets.
print (f'Number of non neutral sentiment text with URLs = {df_train[(df_train.URLs_text == True) & (df_train.sentiment != "neutral")].shape[0]}')
print (f'Number of non neutral sentiment selected_text with URLs = {df_train[(df_train.URLs_sel_text == True) & (df_train.sentiment != "neutral")].shape[0]}')

plt.figure(figsize=(10,5))

# Unlike the printed counts above, both plots use the whole frame (neutral tweets included).
plt.subplot(1,2,1)
sns.countplot(x = 'URLs_text', data = df_train)
plt.subplot(1,2,2)
sns.countplot(x = 'URLs_sel_text', data = df_train)
plt.show()

In [0]:
# Check for non-ascii
# There appear to be very few non-neutral selected text entries with non-ascii characters

# True when every character of x is 7-bit ASCII.
f = lambda x: all(ord(char) < 128 for char in x)

non_neutral = df_train.sentiment != 'neutral'
# The first count looks at the full tweet text; it previously re-used
# selected_text, so both lines always printed the same number.
text_has_non_ascii = df_train.text.apply(f) == False
sel_text_has_non_ascii = df_train.selected_text.apply(f) == False

print (f'Number of non neutral sentiment tweets with non-ascii chars = {df_train[text_has_non_ascii & non_neutral].shape[0]}')
print (f'Number of non neutral sentiment selected_text with non-ascii chars = {df_train[sel_text_has_non_ascii & non_neutral].shape[0]}')



In [0]:
# Check for numeric characters in tweets

# Cell output: shape of the non-neutral rows whose selected_text contains at least one digit.
df_train[(df_train.selected_text.str.contains(r'\d+', regex=True) == True) & (df_train.sentiment != 'neutral')].shape

## Basic data cleanup


In [0]:
# Basic text cleanup

def clean(s):
    """Lower-case a string and trim leading/trailing whitespace."""
    return s.lower().strip()

def clean_text(s):
    """Normalise a tweet: lower-case it, trim both ends, squeeze repeated spaces.

    URL, digit, punctuation and non-ASCII stripping were tried in earlier
    revisions of this cell and are deliberately not applied.

    Args:
        s: raw tweet text.

    Returns:
        The cleaned string.
    """
    s = s.lower()     # Convert to lowercase
    s = s.strip()     # Remove leading & trailing whitespaces
    # Replace every run of 2+ spaces with a single space. The previous call
    # passed a stray positional 10, which re.sub reads as `count`, so only the
    # first ten runs in a tweet were collapsed.
    s = re.sub('[ ]{2,}', ' ', s)
    return (s)


def process_df(df, train):
    """Clean the text columns of a tweet DataFrame.

    Non-neutral rows go through ``clean_text``; neutral rows go through the
    lighter ``clean``.

    Args:
        df: frame with ``sentiment`` and ``text`` columns, plus
            ``selected_text`` when ``train`` is True.
        train: when equal to True, ``selected_text`` is cleaned as well.

    Returns:
        ``(text_clean, sel_text_clean)`` lists when ``train`` is True,
        otherwise just the ``text_clean`` list.
    """
    is_train = (train == True)
    text_clean, sel_text_clean = [], []
    for _, row in df.iterrows():
        # Pick the cleaner once per row; both columns of a row use the same one.
        cleaner = clean if row.sentiment == 'neutral' else clean_text
        text_clean.append(cleaner(row.text))
        if is_train:
            sel_text_clean.append(cleaner(row.selected_text))
    if is_train:
        return (text_clean, sel_text_clean)
    return (text_clean)
            
# Training frame: process_df returns two parallel lists (cleaned text, cleaned selected_text),
# stored as new columns next to the raw ones.
(text_clean, sel_text_clean) = process_df(df_train, True)
df_train['text_clean'] = text_clean
df_train['sel_text_clean'] = sel_text_clean

# Test frame: only the tweet text is cleaned. Note this rebinds text_clean to the test list.
(text_clean) = process_df(df_test, False)
df_test['text_clean'] = text_clean

## Pickle data

In [0]:
# with open (f'{DATADIR}/df_train_pos_neg_good.pkl', 'wb') as pklfile:
#   pickle.dump(df_train_pos_neg_good, pklfile)

In [0]:
DATADIR = '/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data'

# Pickle data

# with open (f'{DATADIR}/df_train.pkl', 'wb') as pklfile:
#   pickle.dump(df_train, pklfile)

# with open (f'{DATADIR}/df_test.pkl', 'wb') as pklfile:
#   pickle.dump(df_test, pklfile)


# Load previously pickled frames from DATADIR. This rebinds df_train / df_test,
# replacing whatever the cells above built in this session.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open (f'{DATADIR}/df_train.pkl', 'rb') as pklfile:
  df_train = pickle.load(pklfile)

with open (f'{DATADIR}/df_test.pkl', 'rb') as pklfile:
  df_test = pickle.load(pklfile)

In [0]:
df_train.shape

## Develop Models


### spaCy

[Training spaCy’s Statistical Models](https://spacy.io/usage/training)

[NER](https://medium.com/@manivannan_data/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6)

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test = train_test_split(df_train, test_size=0.20, random_state=42)

# Per-sentiment subsets (positive, negative, neutral) of the training split ...
X_train_pos = X_train[X_train.sentiment == 'positive'].reset_index(drop=True)
X_train_neg = X_train[X_train.sentiment == 'negative'].reset_index(drop=True)
X_train_neu = X_train[X_train.sentiment == 'neutral'].reset_index(drop=True)
# ... and of the full, unsplit training frame, plus one frame of all non-neutral rows.
df_train_pos = df_train[df_train.sentiment == 'positive'].reset_index(drop=True)
df_train_neg = df_train[df_train.sentiment == 'negative'].reset_index(drop=True)
df_train_neu = df_train[df_train.sentiment == 'neutral'].reset_index(drop=True)
df_train_pos_neg = df_train[df_train.sentiment != 'neutral'].reset_index(drop=True)

# Cell output: shape of every frame created above (X_test.shape is listed twice).
df_train.shape, df_test.shape, X_train.shape, X_test.shape, X_train_pos.shape, X_train_neg.shape, X_train_neu.shape, X_test.shape, df_train_pos.shape, df_train_neg.shape, df_train_neu.shape, df_train_pos_neg.shape

#### spaCy 1

In [0]:
# Function to format data that can be used for training a spaCy model 
def prep_spacy_data(df):
  """Build spaCy NER training examples from cleaned tweets.

  Args:
    df: frame with ``text_clean`` and ``sel_text_clean`` columns.

  Returns:
    A list of ``(text, {"entities": [[start, end, 'selected_text']]})`` tuples,
    where ``start``/``end`` are the character offsets of the first occurrence
    of the selected text inside the tweet. Rows whose selected text does not
    occur in the tweet are skipped, so the list can be shorter than ``df``.
  """
  spacy_data = []
  for text, selected_text in zip(df.text_clean, df.sel_text_clean):
    start = text.find(selected_text)
    if start == -1:
      # str.find reports "not found" as -1; using it as an offset would label
      # the wrong characters, so this row cannot be annotated.
      continue
    end = start + len(selected_text)
    spacy_data.append((text, {"entities": [[start, end, 'selected_text']]}))
  return spacy_data

# Creating spaCy training data for "positive" & "negative" sentiments
# Build one spaCy training set per sentiment from the training split (X_train_*).
spacy_data_positive = prep_spacy_data(X_train_pos)
spacy_data_negative = prep_spacy_data(X_train_neg)
spacy_data_neutral = prep_spacy_data(X_train_neu)
# Alternative: build from the full training frame instead of the split.
# spacy_data_positive = prep_spacy_data(df_train_pos)
# spacy_data_negative = prep_spacy_data(df_train_neg)
# spacy_data_neutral = prep_spacy_data(df_train_neu)

# Cell output: number of examples per sentiment.
len(spacy_data_positive), len(spacy_data_negative), len(spacy_data_neutral)

##### Model Pos Neg

In [0]:
# Train spaCy model
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import en_core_web_sm
import en_core_web_lg

def spacy_ner(model, train_data, output_dir, n_iter):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: object exposing ``load()`` that returns a spaCy pipeline (for
            example an imported model package), or None to start from a blank
            English pipeline.
        train_data: list of ``(text, {"entities": [[start, end, label], ...]})``
            examples. The caller's list is left untouched; shuffling happens
            on a private copy.
        output_dir: directory the trained pipeline is written to, or None to
            skip saving. Missing parent directories are created.
        n_iter: number of passes over the training data.

    Prints the losses of the last pass (an empty dict when ``n_iter`` is 0).
    """
    # Work on a copy so random.shuffle below does not reorder the caller's list.
    train_data = list(train_data)

    if model is not None:
        nlp = model.load()   # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # Defined up front so the final print works even when n_iter is 0.
    losses = {}
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch; batch size grows
            # from 4 towards 500 by a factor of 1.001 per batch
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: the models/spacy/ hierarchy may not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
#----------------------------------------------------------------------------------------#
# Function to extract "selected_text" - called by the extract_sel_text function
def get_sel_text(text, model):
    """Return the text of the first entity ``model`` finds in ``text``.

    Args:
        text: string to analyse.
        model: callable returning an object whose ``ents`` sequence holds
            items with a ``text`` attribute.

    Returns:
        The first entity's text, or ``text`` itself when no entity was found.
    """
    doc = model(text)
    try:
        sel_text = doc.ents[0].text
    except IndexError:
        # No entity predicted: fall back to the whole input. Only IndexError is
        # handled so unrelated failures are no longer silently hidden.
        sel_text = text
    return (sel_text)
#----------------------------------------------------------------------------------------#
# Function to set up extraction of "selected_text" - called by the spacy_predict function
def extract_sel_text(**kwargs):
    """Pick one predicted span for a tweet from several sentiment-specific models.

    Keyword arguments are recognised by a pattern on their *name*, tested in
    this order (first hit wins):
        "sentiment"    -> sentiment label of the tweet
        "text"         -> the tweet text
        "model.*pos"   -> model used for positive tweets
        "model.*neg"   -> model used for every other sentiment
        "return_indx"  -> index into the candidates sorted by length
                          (0 = shortest, -1 = longest); defaults to 0

    Returns:
        The chosen candidate, or ``text`` unchanged when no model applies to
        the sentiment (this used to raise IndexError).

    Raises:
        IndexError: if ``return_indx`` is out of range for the number of
            candidates produced.
    """
    models_pos = {}
    models_neg = {}
    # Defaults so a missing keyword no longer causes an UnboundLocalError.
    sentiment = None
    text = None
    return_indx = 0

    for key, value in kwargs.items():
        if re.search("sentiment", key):
            sentiment = value
        elif re.search("text", key):
            text = value
        elif re.search("model.*pos", key):
            models_pos[key] = value
        elif re.search("model.*neg", key):
            models_neg[key] = value
        elif re.search("return_indx", key):
            return_indx = value

    chosen_models = models_pos if sentiment == 'positive' else models_neg
    sel_texts = [get_sel_text(text, model) for model in chosen_models.values()]

    if not sel_texts:
        # Nothing to choose from: same fallback get_sel_text uses when no entity is found.
        return text

    sel_texts.sort(key=len)

    return (sel_texts[return_indx])
#----------------------------------------------------------------------------------------#
# Function to predict "selected_text"
def spacy_predict(model_ver, test_data, return_indx, lg=True, sm=True, none=True):
  """Predict ``selected_text`` for every row of ``test_data``.

  Trained pipelines are loaded from ``{DATADIR}/models/spacy/`` for each
  requested family. A family whose positive or negative model cannot be loaded
  is dropped, and prediction continues with the remaining ones. Every
  combination of available families is supported; previously only "all three"
  or "exactly one" produced predictions.

  Args:
    model_ver: version suffix of the saved model directories.
    test_data: frame with ``sentiment`` and ``text_clean`` columns.
    return_indx: which candidate to keep after sorting the per-model
      predictions by length (0 = shortest, -1 = longest).
    lg, sm, none: whether to use the en_core_web_lg / en_core_web_sm / blank
      based models.

  Returns:
    A deep copy of ``test_data`` with a ``selected_text_pred`` column. Neutral
    rows, and all rows when no model could be loaded, get ``text_clean``
    unchanged.
  """
  # (requested?, directory-name fragment, keyword prefix understood by extract_sel_text)
  families = [
      (lg, 'en_core_web_lg', 'model1'),
      (sm, 'en_core_web_sm', 'model2'),
      (none, 'None', 'model3'),
  ]
  model_kwargs = {}
  for requested, family, prefix in families:
    if not requested:
      continue
    try:
      loaded = {
          polarity: spacy.load(f'{DATADIR}/models/spacy/model_{family}_{polarity}_{model_ver}')
          for polarity in ('pos', 'neg')
      }
    except Exception:
      # Best effort, as before: a family that cannot be loaded is skipped.
      continue
    for polarity, nlp in loaded.items():
      model_kwargs[f'{prefix}_{polarity}'] = nlp

  df = test_data.copy(deep=True)     # Choose test dataset
  sel_texts = []

  for idx, row in tqdm(df.iterrows(), total=len(df)):
    if row.sentiment != 'neutral' and model_kwargs:
      sel_texts.append(extract_sel_text(text=row.text_clean, sentiment=row.sentiment,
                                        return_indx=return_indx, **model_kwargs))
    else:
      # Neutral tweets (or no usable model): predict the whole cleaned tweet.
      sel_texts.append(row.text_clean)

  df['selected_text_pred'] = sel_texts
  return (df)
#----------------------------------------------------------------------------------------#
# Function to build spaCy model(s)
def build_spacy_models(model_names, train_data, model_ver, n_iter):
  """Train and save one NER model per (base model, training subset) pair.

  The output directory name combines the base-model name, the part of the
  subset name after its last underscore, and ``model_ver``.
  """
  # Lazy generator: each job is resolved only when its turn to train comes.
  jobs = (
      (model, data, f'{DATADIR}/models/spacy/model_{model_name}_{name.split("_")[-1]}_{model_ver}')
      for model_name, model in model_names.items()
      for name, data in train_data.items()
  )
  for model, data, output_dir in jobs:
    spacy_ner(model, data, output_dir, n_iter)
#----------------------------------------------------------------------------------------#
# Function to compute Jaccard score 
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string.

    Returns 0.0 when neither string contains a word, instead of raising
    ZeroDivisionError on the empty union.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Two empty / whitespace-only strings: nothing to compare.
        return 0.0
    return float(len(c)) / union_size

##### Model pos, neg, neu

In [0]:
# Train spaCy model
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import en_core_web_sm
import en_core_web_lg

def spacy_ner(model, train_data, output_dir, n_iter):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: object exposing ``load()`` that returns a spaCy pipeline (for
            example an imported model package), or None to start from a blank
            English pipeline.
        train_data: list of ``(text, {"entities": [[start, end, label], ...]})``
            examples. The caller's list is left untouched; shuffling happens
            on a private copy.
        output_dir: directory the trained pipeline is written to, or None to
            skip saving. Missing parent directories are created.
        n_iter: number of passes over the training data.

    Prints the losses of the last pass (an empty dict when ``n_iter`` is 0).
    """
    # Work on a copy so random.shuffle below does not reorder the caller's list.
    train_data = list(train_data)

    if model is not None:
        nlp = model.load()   # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # Defined up front so the final print works even when n_iter is 0.
    losses = {}
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch; batch size grows
            # from 4 towards 500 by a factor of 1.001 per batch
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: the models/spacy/ hierarchy may not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
#----------------------------------------------------------------------------------------#
# Function to extract "selected_text" - called by the extract_sel_text function
def get_sel_text(text, model):
    """Return the text of the first entity ``model`` finds in ``text``.

    Args:
        text: string to analyse.
        model: callable returning an object whose ``ents`` sequence holds
            items with a ``text`` attribute.

    Returns:
        The first entity's text, or ``text`` itself when no entity was found.
    """
    doc = model(text)
    try:
        sel_text = doc.ents[0].text
    except IndexError:
        # No entity predicted: fall back to the whole input. Only IndexError is
        # handled so unrelated failures are no longer silently hidden.
        sel_text = text
    return (sel_text)
#----------------------------------------------------------------------------------------#
# Function to set up extraction of "selected_text" - called by the spacy_predict function
def extract_sel_text(**kwargs):
    """Pick one predicted span for a tweet from several sentiment-specific models.

    Keyword arguments are recognised by a pattern on their *name*, tested in
    this order (first hit wins):
        "sentiment"    -> sentiment label of the tweet
        "text"         -> the tweet text
        "model.*pos"   -> model used for positive tweets
        "model.*neg"   -> model used for negative tweets
        "model.*neu"   -> model used for neutral tweets
        "return_indx"  -> index into the candidates sorted by length
                          (0 = shortest, -1 = longest); defaults to 0

    Returns:
        The chosen candidate, or ``text`` unchanged when no model applies to
        the sentiment (this used to raise IndexError).

    Raises:
        IndexError: if ``return_indx`` is out of range for the number of
            candidates produced.
    """
    models_pos = {}
    models_neg = {}
    models_neu = {}
    # Defaults so a missing keyword no longer causes an UnboundLocalError.
    sentiment = None
    text = None
    return_indx = 0

    for key, value in kwargs.items():
        if re.search("sentiment", key):
            sentiment = value
        elif re.search("text", key):
            text = value
        elif re.search("model.*pos", key):
            models_pos[key] = value
        elif re.search("model.*neg", key):
            models_neg[key] = value
        elif re.search("model.*neu", key):
            models_neu[key] = value
        elif re.search("return_indx", key):
            return_indx = value

    models_by_sentiment = {
        'positive': models_pos,
        'negative': models_neg,
        'neutral': models_neu,
    }
    # An unrecognised sentiment selects no models at all.
    chosen_models = models_by_sentiment.get(sentiment, {})
    sel_texts = [get_sel_text(text, model) for model in chosen_models.values()]

    if not sel_texts:
        # Nothing to choose from: same fallback get_sel_text uses when no entity is found.
        return text

    sel_texts.sort(key=len)

    return (sel_texts[return_indx])
#----------------------------------------------------------------------------------------#
# Function to predict "selected_text"
def spacy_predict(model_ver, test_data, return_indx, lg=True, sm=True, none=True):
  """Predict ``selected_text`` for every row of ``test_data``.

  Trained pipelines are loaded from ``{DATADIR}/models/spacy/`` for each
  requested family. A family whose positive, negative or neutral model cannot
  be loaded is dropped, and prediction continues with the remaining ones.
  Every combination of available families is supported; previously only
  "all three" or "exactly one" produced predictions.

  Args:
    model_ver: version suffix of the saved model directories.
    test_data: frame with ``sentiment`` and ``text_clean`` columns.
    return_indx: which candidate to keep after sorting the per-model
      predictions by length (0 = shortest, -1 = longest).
    lg, sm, none: whether to use the en_core_web_lg / en_core_web_sm / blank
      based models.

  Returns:
    A deep copy of ``test_data`` with a ``selected_text_pred`` column. When no
    model could be loaded, every row gets ``text_clean`` unchanged.
  """
  # (requested?, directory-name fragment, keyword prefix understood by extract_sel_text)
  families = [
      (lg, 'en_core_web_lg', 'model1'),
      (sm, 'en_core_web_sm', 'model2'),
      (none, 'None', 'model3'),
  ]
  model_kwargs = {}
  for requested, family, prefix in families:
    if not requested:
      continue
    try:
      loaded = {
          polarity: spacy.load(f'{DATADIR}/models/spacy/model_{family}_{polarity}_{model_ver}')
          for polarity in ('pos', 'neg', 'neu')
      }
    except Exception:
      # Best effort, as before: a family that cannot be loaded is skipped.
      continue
    for polarity, nlp in loaded.items():
      model_kwargs[f'{prefix}_{polarity}'] = nlp

  df = test_data.copy(deep=True)     # Choose test dataset
  sel_texts = []

  for idx, row in tqdm(df.iterrows(), total=len(df)):
    if model_kwargs:
      sel_texts.append(extract_sel_text(text=row.text_clean, sentiment=row.sentiment,
                                        return_indx=return_indx, **model_kwargs))
    else:
      # No usable model: predict the whole cleaned tweet so every row gets a value.
      sel_texts.append(row.text_clean)

  df['selected_text_pred'] = sel_texts
  return (df)
#----------------------------------------------------------------------------------------#
# Function to build spaCy model(s)
def build_spacy_models(model_names, train_data, model_ver, n_iter):
  """Train one spaCy model for every (base model, training set) combination.

  Args:
    model_names: dict mapping a label used in the output path to the base
      model handed to `spacy_ner` (None is passed through unchanged).
    train_data: dict mapping a dataset key to its training examples; only
      the part of the key after the last "_" is used in the output path.
    model_ver: version tag appended to the output directory name.
    n_iter: number of training iterations forwarded to `spacy_ner`.
  """
  for base_label, base_model in model_names.items():
    for dataset_key, examples in train_data.items():
      dataset_tag = dataset_key.split("_")[-1]
      target_dir = f'{DATADIR}/models/spacy/model_{base_label}_{dataset_tag}_{model_ver}'
      spacy_ner(base_model, examples, target_dir, n_iter)
#----------------------------------------------------------------------------------------#
# Function to compute Jaccard score 
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the resulting word sets.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in [0, 1]. When neither string contains a word the two word
        sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty/whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

##### Define parameters, models & datasets

In [0]:
# Experiment configuration: training sets per sentiment, evaluation frame, base models
train_data = {"pos" : spacy_data_positive, "neg" : spacy_data_negative, "neu" : spacy_data_neutral}
test_data = X_test
model_names = {"None" : None}
# model_names = {"en_core_web_sm" : en_core_web_sm}
# model_names = {"en_core_web_lg" : en_core_web_lg}
# model_names = {"None" : None, "en_core_web_sm" : en_core_web_sm, "en_core_web_lg" : en_core_web_lg}
n_iter = 15
model_ver = 'v7'
return_indx = 0   # length of selected_text prediction [0=>shortest, -1=>longest]

# Train the models, then predict on the evaluation frame
build_spacy_models(model_names, train_data, model_ver, n_iter)
df = spacy_predict(model_ver, test_data, return_indx, lg=True, sm=True, none=True)

# Score every prediction against its label and report the mean
jaccard_score = [
    jaccard(truth, pred)
    for truth, pred in zip(df.selected_text, df.selected_text_pred)
]
df['jaccard_score'] = jaccard_score
print (f'Overall Jaccard score = {np.mean(df.jaccard_score):.2f}')

In [0]:
# NOTE(review): this cell is a verbatim copy of the cell directly above it;
# running both trains and scores the same models twice.
# Define parameters, models, datasets
train_data = {"pos" : spacy_data_positive, "neg" : spacy_data_negative, "neu" : spacy_data_neutral}
test_data = X_test
# Only the blank ("None") base model is trained here; the alternatives are kept commented out.
model_names = {"None" : None}
# model_names = {"en_core_web_sm" : en_core_web_sm}
# model_names = {"en_core_web_lg" : en_core_web_lg}
# model_names = {"None" : None, "en_core_web_sm" : en_core_web_sm, "en_core_web_lg" : en_core_web_lg}
n_iter = 15
model_ver = 'v7'
return_indx = 0   # length of selected_text prediction [0=>shortest, -1=>longest]

# Launch model build, predict & score functions
build_spacy_models(model_names, train_data, model_ver, n_iter)
df = spacy_predict(model_ver, test_data, return_indx, lg=True, sm=True, none=True)

# Per-row Jaccard score between the labelled and the predicted selected text
jaccard_score = []
for _, row in df.iterrows():
    jaccard_score.append(jaccard(row.selected_text, row.selected_text_pred))
df['jaccard_score'] = jaccard_score
print (f'Overall Jaccard score = {np.mean(df.jaccard_score):.2f}')

In [0]:
# Keep only the columns needed to inspect predictions, then export them.
# The selection has to run in this cell: with it commented out, `df_` is
# undefined on a fresh kernel (it is only assigned by a later cell).
df_ = df[['textID', 'text', 'selected_text', 'selected_text_pred', 'jaccard_score']]
df_.to_csv(f"{DATADIR}/df.csv", index=False)

In [0]:
# Rows the model got (almost) completely wrong
df_ = df.loc[df.jaccard_score < 0.1]
# Export the negative-sentiment failures, pipe-separated, to the same df.csv path
df_.loc[df_.sentiment == 'negative'].to_csv(f"{DATADIR}/df.csv", sep='|', index=False)

In [0]:
# The submission must carry the model's prediction under the `selected_text`
# header; selecting df's own `selected_text` column would write the labels back out.
df_submission = df[['textID', 'selected_text_pred']].rename(
    columns={'selected_text_pred': 'selected_text'})
df_submission.to_csv("./data/submission.csv", index=False)

#### spaCy 2

In [0]:
# Function to format data that can be used for training a spaCy model 
def prep_spacy_data(df):
  """Convert a DataFrame into spaCy NER training tuples.

  Each usable row becomes `(text_clean + " :" + sentiment, {"entities":
  [[start, end, 'selected_text']]})`, where start/end are the character
  offsets of `sel_text_clean` inside `text_clean`.

  Rows whose `sel_text_clean` is empty or does not occur in `text_clean` are
  skipped: `str.find` returns -1 for them, which would otherwise yield an
  invalid entity span.

  Args:
    df: DataFrame with `text_clean`, `sel_text_clean` and `sentiment` columns.

  Returns:
    (spacy_data, train_text): the list of training tuples and an empty list
    (the second element is never populated here; it is kept so existing
    callers that unpack two values keep working).
  """
  spacy_data = []
  train_text = []
  for _, row in df.iterrows():
    selected_text = row.sel_text_clean
    text = row.text_clean
    start = text.find(selected_text)
    if start < 0 or not selected_text:
      # Label cannot be located in the tweet: no valid span to train on.
      continue
    end = start + len(selected_text)
    spacy_data.append((text+" :"+row.sentiment, {"entities": [[start, end, 'selected_text']]}))
  return spacy_data, train_text

In [0]:
spacy_data, train_text = prep_spacy_data(df_train_pos_neg)
# Model input for every training row: non-neutral tweets get " :<sentiment>"
# appended, neutral tweets are left as they are.
train_text = [
    row.text_clean if row.sentiment == 'neutral' else row.text_clean+" :"+row.sentiment
    for _, row in df_train.iterrows()
]

In [0]:
# Sanity check: number of spaCy training examples next to the shape of the frame they were built from
len(spacy_data), df_train_pos_neg.shape

In [0]:
# Train spaCy model

from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

def main(model, train_data, output_dir, n_iter):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        model: name or path passed to `spacy.load`, or None to start from a
            blank English pipeline.
        train_data: sequence of `(text, annotations)` pairs, where
            `annotations` is a dict with an "entities" list of
            `[start, end, label]` items.
        output_dir: directory the trained pipeline is written to, or None to
            skip saving. Missing parent directories are created.
        n_iter: number of passes over the training data.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add an NER component if the pipeline has none, otherwise reuse the
    # existing one so labels can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Work on a private copy: the per-iteration shuffle below must not
    # reorder the list owned by the caller.
    train_data = list(train_data)

    # Register every entity label that appears in the training data.
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Disable all other pipes during training so only NER is updated.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):
        # Weights are (re)initialised only when training a new model.
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # Batch up the examples using spaCy's minibatch with a compounding batch size.
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # Save model to output directory.
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: a missing intermediate directory no longer aborts the save.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

# Training configuration for the single sentiment-suffixed spaCy model.
# Note that n_iter and model_ver are notebook globals that earlier cells also assign.
data = spacy_data
model_name = None   # None => main() starts from a blank English pipeline
# model_name = "en_core_web_sm"
# model_name = "en_core_web_lg"
n_iter = 30
model_ver = 'v2_1'

# With model_name = None the directory name contains the literal text "None".
output_dir = f'{DATADIR}/models/spacy/model_{model_name}_{model_ver}'
main(model_name, data, output_dir, n_iter)

In [0]:
import re

def get_sel_text(text, model):
    """Return the text of the first entity `model` finds in `text`.

    Args:
        text: input string passed to the model.
        model: callable returning an object with an `ents` sequence whose
            items expose a `.text` attribute.

    Returns:
        The first entity's text, or `text` unchanged when the model found no
        entity.
    """
    doc = model(text)
    ents = doc.ents
    # Explicit emptiness check instead of a bare `except`, so unrelated
    # errors are no longer silently swallowed.
    if len(ents) == 0:
        return text
    return ents[0].text

def extract_sel_text(text, models, return_indx=-1):
    """Pick one prediction for `text` out of several models' predictions.

    Every model's prediction is obtained with `get_sel_text`; the candidates
    are ordered by length and the one at `return_indx` is returned.

    Args:
        text: input string.
        models: iterable of models accepted by `get_sel_text`.
        return_indx: index into the length-sorted candidates
            [0 => shortest, -1 => longest]. Defaults to -1.

    Returns:
        The chosen candidate, or `text` itself when `models` is empty.
    """
    candidates = sorted((get_sel_text(text, model) for model in models), key=len)
    if not candidates:
        # No model supplied: fall back to the whole text instead of raising IndexError.
        return text
    return candidates[return_indx]

In [0]:
version = 'v2_1'

# Load models
nlp_None = spacy.load(f'{DATADIR}/models/spacy/model_None_{version}')

# df = X_test.copy(deep=True)     # Choose test dataset
df = df_train.copy(deep=True)     # Choose test dataset
sel_texts = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    if (row.sentiment != 'neutral'):
        # Build the model input here, in the same "<text> :<sentiment>" form
        # used for training, rather than reading a `train_text` column that
        # no visible cell adds to the frame.
        model_input = row.text_clean + " :" + row.sentiment
        sel_texts.append(extract_sel_text(text=model_input, models=[nlp_None]))
    else:
        # Neutral tweets: predict the whole cleaned tweet.
        sel_texts.append(row.text_clean)

df['selected_text_pred'] = sel_texts

In [0]:
# Preview the first rows, which now include the selected_text_pred column
df.head()

In [0]:
# Compute Jaccard score 

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the resulting word sets.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in [0, 1]. When neither string contains a word the two word
        sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty/whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

# Score each prediction against its label, store the scores and report the mean
jaccard_score = [
    jaccard(truth, pred)
    for truth, pred in zip(df.selected_text, df.selected_text_pred)
]

df['jaccard_score'] = jaccard_score
print (f'Overall Jaccard score = {np.mean(df.jaccard_score):.2f}')

In [0]:
# Preview the first rows, which now include the jaccard_score column
df.head()

In [0]:
# Count of zero-score predictions per text_wc value, split by sentiment
plt.figure(figsize=(15,7))
sns.countplot(x='text_wc', hue='sentiment', data=df[df.jaccard_score == 0])

In [0]:
# Mean Jaccard score over the positive-sentiment rows
np.mean(df[df.sentiment == "positive"].jaccard_score)

In [0]:
# Mean Jaccard score over the neutral-sentiment rows
np.mean(df[df.sentiment == "neutral"].jaccard_score)

In [0]:
# The submission must carry the model's prediction under the `selected_text`
# header; selecting df's own `selected_text` column would write the labels back out.
df_submission = df[['textID', 'selected_text_pred']].rename(
    columns={'selected_text_pred': 'selected_text'})
df_submission.to_csv("./data/submission.csv", index=False)

### Azure ML

In [0]:
name = 'NLPone618'
endpoint = 'https://nlpone618.cognitiveservices.azure.com/'
# Secrets are read from the environment and must never be stored in the
# notebook. Any key that was previously committed here should be treated as
# leaked and rotated in the Azure portal.
key = os.environ['AZURE_TEXT_ANALYTICS_KEY']
key2 = os.environ.get('AZURE_TEXT_ANALYTICS_KEY2')   # optional secondary key

In [0]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

def authenticate_client():
    """Build a TextAnalyticsClient from the notebook-level `endpoint` and `key`."""
    return TextAnalyticsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

# Notebook-level Text Analytics client used by the cells below
client = authenticate_client()

In [0]:
def key_phrase_extraction_example(client):
    """Print the key phrases extracted for the first five non-neutral tweets.

    Reads the notebook-level `df_train` frame. Any exception is caught and
    printed rather than raised.

    Args:
        client: object exposing `extract_key_phrases(documents=...)` that
            returns one result per input document.
    """
    try:
        documents = list(df_train[df_train.sentiment != "neutral"].text[:5])
        # documents = ["My cat might need to see a veterinarian.",
        #              "The quick brown fox jumps over a lazy dog."]

        # Send the batch once and walk the results, instead of re-sending the
        # whole batch for every document just to index one result out of it.
        responses = client.extract_key_phrases(documents = documents)
        for response in responses:
            if not response.is_error:
                print("\tKey Phrases:")
                for phrase in response.key_phrases:
                    print("\t\t", phrase)
            else:
                print(response.id, response.error)

    except Exception as err:
        print("Encountered exception. {}".format(err))
        
# Run the key-phrase demo against the authenticated client
key_phrase_extraction_example(client)


In [0]:
# The same five non-neutral tweets that were sent for key-phrase extraction
list(df_train[df_train.sentiment != "neutral"].text[:5])

In [0]:
# Labelled selected_text for those same five tweets, for comparison with the key phrases
list(df_train[df_train.sentiment != "neutral"].selected_text[:5])

### Bert

https://medium.com/tensorflow/using-tensorflow-2-for-state-of-the-art-natural-language-processing-102445cda54a

https://www.kaggle.com/svadivazhagu/tf-bert-sentiment

https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/

https://colab.research.google.com/drive/1Y4o3jh3ZH70tl6mCd76vz_IxX23biCPP#scrollTo=yDcqNlvVhL5W

https://www.kaggle.com/mdmashurshalehin/tweet-sentiment-insight-eda

https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert

https://www.kaggle.com/koushiksahu/question-answering-roberta-for-absolute-beginners

https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/143281

https://www.curiousily.com/posts/intent-recognition-with-bert-using-keras-and-tensorflow-2/

#### Prepare Train & Test data for Bert



In [0]:
# Create train DataFrames for "positive" & "negative" sentiment data

from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the labelled data with a fixed seed.
# NOTE(review): X_test is also read by earlier cells in this notebook
# (test_data = X_test); confirm it is defined before those cells on a fresh run.
X_train, X_test = train_test_split(df_train, test_size=0.20, random_state=42)

# Sentiment-specific views of the hold-out split (index reset to 0..n-1)
X_train_pos = X_train[X_train.sentiment == 'positive'].reset_index(drop=True)
X_train_neg = X_train[X_train.sentiment == 'negative'].reset_index(drop=True)
X_train_pos_neg = X_train[X_train.sentiment != 'neutral'].reset_index(drop=True)
X_test_pos_neg = X_test[X_test.sentiment != 'neutral'].reset_index(drop=True)

# The same views over the full training frame
df_train_pos = df_train[df_train.sentiment == 'positive'].reset_index(drop=True)
df_train_neg = df_train[df_train.sentiment == 'negative'].reset_index(drop=True)
df_train_pos_neg = df_train[df_train.sentiment != 'neutral'].reset_index(drop=True)

# ...and over the test frame
df_test_pos = df_test[df_test.sentiment == 'positive'].reset_index(drop=True)
df_test_neg = df_test[df_test.sentiment == 'negative'].reset_index(drop=True)
df_test_pos_neg = df_test[df_test.sentiment != 'neutral'].reset_index(drop=True)

# Shapes of the full frame and its sentiment subsets
df_train.shape, df_train_pos.shape, df_train_neg.shape, df_train_pos_neg.shape

#### Bert

##### Tokenize using BertWordPieceTokenizer

In [0]:
class prepDataBert():
    """Tokenize tweets for a BERT span-extraction model.

    Every row is encoded as the sentiment encoding followed by the tweet
    encoding without its first token. Alongside the ids the class keeps
    per-token character offsets into the tweet and, when `train` is true,
    the token indices where the labelled selected text starts and ends.

    Attributes set by the constructor:
        tokenizer: the tokenizer passed in.
        text_ids, offsets: unpadded ids and per-token (start, end) offsets.
        token_type_ids: segment ids, already `max_len` long.
        text_ids_padded, attention_masks_padded: arrays padded/truncated to `max_len`.
        targets_start, targets_end: only when `train` is true.
    """
    def __init__(self, df, tokenizer, train=True, max_len=0):
        # Keep the tokenizer on the instance: the methods must use the one
        # that was passed in, not whatever the notebook-level name
        # `tokenizer` happens to be bound to when they run.
        self.tokenizer = tokenizer
        self.text_ids, self.offsets = self.tokenize_data(df)
        attention_masks = self.gen_attention_masks(self.text_ids)
        self.token_type_ids = self.gen_token_type_ids(self.text_ids, max_len)
        self.text_ids_padded = self.pad_data(self.text_ids, max_len)
        self.attention_masks_padded = self.pad_data(attention_masks, max_len)
        if (train):
            self.targets_start, self.targets_end = self.get_targets(df, self.offsets)
#------------------------------------------------------------------------------------------#
    # Method to tokenize & get offsets using BertWordPieceTokenizer
    def tokenize_data(self, data):
        """Encode every row's `sentiment` and `text_clean`.

        Returns:
            (text_ids, offsets): array of id arrays, and a list with one
            offsets list per row. The offsets list is two (0, 0) pairs
            followed by the tweet encoding's offsets, so it lines up with the
            ids when the sentiment encodes to three ids (assumed, not checked).
        """
        text_ids = []
        offsets = []
        for _, row in data.iterrows():
            s_tok = self.tokenizer.encode(row.sentiment).ids
            # Encode the tweet once and reuse it for both ids and offsets.
            t_enc = self.tokenizer.encode(row.text_clean)
            text_ids.append(np.array(s_tok + t_enc.ids[1:]))
            offsets.append([(0, 0)] * 2 + list(t_enc.offsets))
        return (np.array(text_ids), offsets)
#------------------------------------------------------------------------------------------#
    # Method to create attention masks
    def gen_attention_masks(self, text_ids):
        """Return one mask per sequence: 1 where the token id is > 0, else 0."""
        attention_masks = []
        for ids in text_ids:
            att_mask = [int(token_id > 0) for token_id in ids]
            attention_masks.append(np.array(att_mask))
        return(np.array(attention_masks))
#------------------------------------------------------------------------------------------#
    # Method to create token type ids
    def gen_token_type_ids(self, text_ids, max_len):
        """Return segment ids of length `max_len` for every sequence.

        The first three positions get 0, the remaining tokens of the sequence
        get 1, and positions up to `max_len` are filled with 0.
        """
        token_type_ids = []
        for ids in text_ids:
            # Leading 3 positions (the sentiment part) => 0, tweet tokens => 1, padding => 0
            tok_type_id = [0]*3 + [1]*(len(ids)-3) + [0]*(max_len-len(ids))
            token_type_ids.append(np.array(tok_type_id))
        return(np.array(token_type_ids))
#------------------------------------------------------------------------------------------#
    # Method to pad data
    def pad_data (self, data, max_len):
        """Pad (with 0) or truncate every sequence at the end to `max_len`."""
        data_padded = pad_sequences(data, maxlen=max_len, dtype="long",
                                  value=0, truncating="post", padding="post")
        return(data_padded)
#------------------------------------------------------------------------------------------#
    # Method to generate start & end targets based on text vs. selected_text matches
    def get_targets(self, data, offsets):
        """Locate the labelled span of every row in token space.

        Args:
            data: DataFrame with `text_clean` and `sel_text_clean` columns.
            offsets: per-row offsets as returned by `tokenize_data`.

        Returns:
            (targets_start, targets_end): arrays holding, per row, the index
            of the first and the last token overlapping the selected text.

        Raises:
            IndexError: if no token overlaps the selected text, e.g. because
                it does not occur in the tweet.
        """
        targets_start = []
        targets_end = []

        for i, (_, row) in enumerate(data.iterrows()):
            tweet = row.text_clean
            selected_text = row.sel_text_clean
            len_st = len(selected_text)

            # First occurrence of selected_text inside the tweet (-1 if absent)
            idx0 = tweet.find(selected_text)

            # Mark the characters of the tweet covered by selected_text with 1s
            char_targets = [0] * len(tweet)
            if idx0 >= 0:
                for ct in range(idx0, idx0 + len_st):
                    char_targets[ct] = 1

            # Tokens (after the 3 leading positions) that cover at least one marked character
            target_idx = []
            for j, (offset1, offset2) in enumerate(offsets[i][3:]):
                if sum(char_targets[offset1: offset2]) > 0:
                    target_idx.append(j)
            target_start = target_idx[0] + 3
            target_end = target_idx[-1] + 3
            targets_start.append(np.array(target_start))
            targets_end.append(np.array(target_end))

        return (np.array(targets_start), np.array(targets_end))

#------------------------------------------------------------------------------------------#

In [0]:
# WordPiece tokenizer built from the uncased BERT-base vocabulary file (lower-casing enabled)
tokenizer = tokenizers.BertWordPieceTokenizer(
        f"{DATADIR}/models/uncased_L-12_H-768_A-12/vocab.txt", 
        lowercase=True
    )

# Hold-out split: both parts come from the labelled data, so targets are built for both (train=True)
x_train_pos_neg = prepDataBert(X_train_pos_neg, tokenizer, train=True, max_len=97)
x_test_pos_neg = prepDataBert(X_test_pos_neg, tokenizer, train=True, max_len=97)

# Full training frame (with targets) and the test frame (train=False => no targets computed)
train_pos_neg = prepDataBert(df_train_pos_neg, tokenizer, train=True, max_len=97)
test_pos_neg = prepDataBert(df_test_pos_neg, tokenizer, train=False, max_len=97)

In [0]:
# Spot-check three rows: map the start/end target tokens back to characters
# and print the recovered span next to the original text and label.
for idx in range(15202,15205):
    # Bare tuple expression: inside a loop it is evaluated but nothing is displayed
    train_pos_neg.text_ids_padded[idx], train_pos_neg.offsets[idx], train_pos_neg.targets_start[idx], train_pos_neg.targets_end[idx]

    # Character span: start offset of the first target token .. end offset of the last
    s = train_pos_neg.offsets[idx][train_pos_neg.targets_start[idx]][0]
    e = train_pos_neg.offsets[idx][train_pos_neg.targets_end[idx]][1]
    print (df_train_pos_neg.text[idx])
    print (df_train_pos_neg.selected_text[idx])
    print (df_train_pos_neg.text_clean[idx][s:e])
    print ('-------------------------------------------')

##### Create Bert Model

In [0]:
def create_model(max_seq_len, bert_model_dir, optimizer):
    """Build and compile a two-headed span-extraction model on top of BERT.

    A BertModelLayer is created from the checkpoint parameters found in
    `bert_model_dir`; two identical heads (Dropout -> GRU -> LeakyReLU ->
    Dense(1) -> Flatten -> softmax) are stacked on its output, one per model
    output. The stock checkpoint weights are loaded into the BERT layer
    before the model is compiled with sparse categorical cross-entropy.

    Args:
        max_seq_len: length of the integer input sequences.
        bert_model_dir: directory containing the checkpoint parameters and
            `bert_model.ckpt`.
        optimizer: optimizer passed to `model.compile`.

    Returns:
        The compiled tf.keras.Model with inputs [input_ids, att] and
        outputs [x1, x2].
    """
    bert_params = bert.params_from_pretrained_ckpt(bert_model_dir)
    bert_model = bert.BertModelLayer.from_params(bert_params, name="bert")

    input_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    att = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    # Only used by the commented-out alternative wiring below.
    tok_type_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32')

    # NOTE(review): the second list element passed here is the `att` input;
    # confirm that BertModelLayer interprets a two-element input list this
    # way (the commented alternative passes tok_type_ids in the same slot).
    x = bert_model([input_ids, att])
    # x = bert_model([input_ids, tok_type_ids])

    #*******************Mod 1*******************
    # x1 = tf.keras.layers.Dropout(0.1)(x)
    # x1 = tf.keras.layers.Conv1D(1,1)(x1) 
    # x1 = tf.keras.layers.Flatten()(x1)
    # x1 = tf.keras.layers.Activation('softmax')(x1)

    # x2 = tf.keras.layers.Dropout(0.1)(x) 
    # x2 = tf.keras.layers.Conv1D(1,1)(x2)
    # x2 = tf.keras.layers.Flatten()(x2)
    # x2 = tf.keras.layers.Activation('softmax')(x2)

    #*******************Mod 2*******************
    # x1 = tf.keras.layers.Dropout(0.1)(x)
    # x1 = tf.keras.layers.Conv1D(128, 2,padding='same')(x1)
    # x1 = tf.keras.layers.LeakyReLU()(x1)
    # x1 = tf.keras.layers.Conv1D(64, 2,padding='same')(x1)
    # x1 = tf.keras.layers.Dense(1)(x1)
    # x1 = tf.keras.layers.Flatten()(x1)
    # x1 = tf.keras.layers.Activation('softmax')(x1)

    # x2 = tf.keras.layers.Dropout(0.1)(x) 
    # x2 = tf.keras.layers.Conv1D(128, 2, padding='same')(x2)
    # x2 = tf.keras.layers.LeakyReLU()(x2)
    # x2 = tf.keras.layers.Conv1D(64, 2, padding='same')(x2)
    # x2 = tf.keras.layers.Dense(1)(x2)
    # x2 = tf.keras.layers.Flatten()(x2)
    # x2 = tf.keras.layers.Activation('softmax')(x2)
    #*******************Mod 3*******************
    # x1 = tf.keras.layers.Dropout(0.1)(x)
    # x1 = tf.keras.layers.Conv1D(128, 1, activation='relu')(x1)
    # x1 = tf.keras.layers.Dropout(0.1)(x1)
    # x1 = tf.keras.layers.Conv1D(64, 1, activation='relu')(x1)
    # x1 = tf.keras.layers.Dropout(0.1)(x1)
    # x1 = tf.keras.layers.Conv1D(32, 1, activation='relu')(x1)
    # x1 = tf.keras.layers.Dropout(0.1)(x1)
    # x1 = tf.keras.layers.Dense(1)(x1)
    # x1 = tf.keras.layers.Flatten()(x1)
    # x1 = tf.keras.layers.Activation('softmax')(x1)

    # x2 = tf.keras.layers.Dropout(0.1)(x) 
    # x2 = tf.keras.layers.Conv1D(128, 1, activation='relu')(x2)
    # x2 = tf.keras.layers.Dropout(0.1)(x2)
    # x2 = tf.keras.layers.Conv1D(64, 1, activation='relu')(x2)
    # x2 = tf.keras.layers.Dropout(0.1)(x2)
    # x2 = tf.keras.layers.Conv1D(32, 1, activation='relu')(x2)
    # x2 = tf.keras.layers.Dropout(0.1)(x2)
    # x2 = tf.keras.layers.Dense(1)(x2)
    # x2 = tf.keras.layers.Flatten()(x2)
    # x2 = tf.keras.layers.Activation('softmax')(x2)
    #*******************Mod 4 (active)*******************
    # First head: softmax over the flattened per-position Dense(1) outputs
    x1 =  tf.keras.layers.Dropout(0.1)(x)
    x1 =  tf.keras.layers.GRU(1024, return_sequences=True)(x1)
    x1 =  tf.keras.layers.LeakyReLU()(x1)
    x1 =  tf.keras.layers.Dense(1)(x1)
    x1 =  tf.keras.layers.Flatten()(x1)
    x1 =  tf.keras.layers.Activation('softmax')(x1)

    # Second head: same architecture, separate weights
    x2 =  tf.keras.layers.Dropout(0.1)(x)
    x2 =  tf.keras.layers.GRU(1024, return_sequences=True)(x2)
    x2 =  tf.keras.layers.LeakyReLU()(x2)    
    x2 =  tf.keras.layers.Dense(1)(x2)
    x2 =  tf.keras.layers.Flatten()(x2)
    x2 =  tf.keras.layers.Activation('softmax')(x2)
    #*******************Mod 5*******************
    # x1 = tf.keras.layers.Dropout(0.1)(x)
    # x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    # x1 = tf.keras.layers.LeakyReLU()(x1)
    # x1 = tf.keras.layers.Dense(1)(x1)
    # x1 = tf.keras.layers.Flatten()(x1)
    # x1 = tf.keras.layers.Activation('softmax')(x1)
    
    # x2 = tf.keras.layers.Dropout(0.1)(x) 
    # x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    # x2 = tf.keras.layers.LeakyReLU()(x2)
    # x2 = tf.keras.layers.Dense(1)(x2)
    # x2 = tf.keras.layers.Flatten()(x2)
    # x2 = tf.keras.layers.Activation('softmax')(x2)
    #*******************************************
    
    model = tf.keras.Model(inputs=[input_ids, att], outputs=[x1,x2])
    # model = tf.keras.Model(inputs=[input_ids, tok_type_ids], outputs=[x1,x2])
    
    # The model is built before the stock checkpoint weights are loaded into the BERT layer.
    model.build(input_shape=(None, max_seq_len))
    load_stock_weights(bert_model, f'{bert_model_dir}/bert_model.ckpt')
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
    
    return (model)

In [0]:
bert_model_dir = f'{DATADIR}/models/uncased_L-12_H-768_A-12'
# Exponential learning-rate decay schedule. It is defined but currently unused:
# the optimizer that takes it is commented out in favour of a constant rate.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=3e-5,
    decay_steps=10000,
    decay_rate=0.9)
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# 97 is the sequence length passed as max_len when the BERT inputs are prepared
model = create_model(97, bert_model_dir, optimizer)
model.summary()

In [0]:
# Make sure the target directory exists, then save the model into it
model_name = 'bert9-model'
model_dir = f'{DATADIR}/models/{model_name}'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

model.save(model_dir)

#### Roberta, XLM-Roberta

##### Tokenize using RobertaTokenizer

In [0]:
class prepDataRoberta():
    """Tokenize (sentiment, tweet) pairs for a RoBERTa span-extraction model.

    Attributes set by the constructor:
        tokenizer: the tokenizer passed in.
        text_ids, offsets: unpadded ids and per-token (start, end) offsets.
        text_ids_padded: ids padded/truncated to `max_len` with value 1.
        attention_masks_padded: masks padded/truncated to `max_len` with 0.
        token_type_ids_padded: all-zero int32 array of shape (rows, max_len).
        targets_start, targets_end: only when `train` is true.
    """
    def __init__(self, df, tokenizer, train=True, max_len=0):
        # Keep the tokenizer on the instance: the methods must use the one
        # that was passed in, not whatever the notebook-level name
        # `tokenizer` happens to be bound to when they run.
        self.tokenizer = tokenizer
        self.text_ids, attention_masks, self.offsets = self.tokenize_data(df)
        self.text_ids_padded = self.pad_data(self.text_ids, max_len)
        # Padded positions must be masked out (0). Padding the mask with the
        # same value as the ids (1) would mark padding as real tokens.
        self.attention_masks_padded = self.pad_data(attention_masks, max_len, value=0)
        self.token_type_ids_padded = np.zeros((df.shape[0],max_len), dtype=np.int32)
        if (train):
            self.targets_start, self.targets_end = self.get_targets(df, self.offsets)
    #------------------------------------------------------------------------------------------#
    # Method to tokenize & get attention masks, offsets using RobertaTokenizer Tokenizer
    def tokenize_data(self, data):
        """Encode every row as a (sentiment, text_clean) pair.

        Offsets are rebuilt by decoding each id in `input_ids[4:-1]` and
        accumulating the decoded lengths from 0; four (0, 0) pairs are put in
        front and one behind so the list lines up with the ids.

        Returns:
            (text_ids, attention_masks, offsets): three lists, one entry per row.
        """
        text_ids = []
        attention_masks = []
        offsets = []
        for _, row in data.iterrows():
            tok = self.tokenizer.encode_plus(row.sentiment, row.text_clean)
            tok_ids = tok['input_ids']
            text_ids.append(np.array(tok_ids))
            attention_masks.append(np.array(tok['attention_mask']))
            s = 0
            offset = []
            for token_id in tok_ids[4:-1]:
                e = s + len(self.tokenizer.decode(token_id))
                offset.append((s,e))
                s = e
            offsets.append(4*[(0,0)] + offset + [(0,0)])
        return(text_ids, attention_masks, offsets)
    #------------------------------------------------------------------------------------------#
    # Method to pad data
    def pad_data (self, data, max_len, value=1):
        """Pad or truncate every sequence at the end to `max_len`.

        Args:
            data: sequences to pad.
            max_len: target length.
            value: fill value; defaults to 1, the value used for the ids.
        """
        data_padded = pad_sequences(data, maxlen=max_len, dtype="long",
                                  value=value, truncating="post", padding="post")
        return(data_padded)
    #------------------------------------------------------------------------------------------#
    # Method to generate start & end targets based on text vs. selected_text matches
    def get_targets(self, data, offsets):
        """Locate the labelled span of every row in token space.

        Args:
            data: DataFrame with `text_clean` and `sel_text_clean` columns.
            offsets: per-row offsets as returned by `tokenize_data`.

        Returns:
            (targets_start, targets_end): arrays holding, per row, the index
            of the first and the last token overlapping the selected text.

        Raises:
            IndexError: if no token overlaps the selected text, e.g. because
                it does not occur in the tweet.
        """
        targets_start = []
        targets_end = []

        for i, (_, row) in enumerate(data.iterrows()):
            tweet = row.text_clean
            selected_text = row.sel_text_clean
            len_st = len(selected_text)

            # First occurrence of selected_text inside the tweet (-1 if absent)
            idx0 = tweet.find(selected_text)

            # Mark the characters of the tweet covered by selected_text with 1s
            char_targets = [0] * len(tweet)
            if idx0 >= 0:
                for ct in range(idx0, idx0 + len_st):
                    char_targets[ct] = 1

            # Tokens (after the 4 leading positions) that cover at least one marked character
            target_idx = []
            for j, (offset1, offset2) in enumerate(offsets[i][4:]):
                if sum(char_targets[offset1: offset2]) > 0:
                    target_idx.append(j)
            target_start = target_idx[0] + 4
            target_end = target_idx[-1] + 4
            targets_start.append(np.array(target_start))
            targets_end.append(np.array(target_end))

        return (np.array(targets_start), np.array(targets_end))

In [0]:
import transformers
# Rebinds the notebook-level name `tokenizer` (set to a BertWordPieceTokenizer by an earlier cell)
tokenizer = transformers.RobertaTokenizer.from_pretrained(f"{DATADIR}/models/roberta-base")

# These names are the same ones the BERT preparation cell assigns; running this cell replaces them.
# Hold-out split (targets built for both parts), full training frame, and test frame (no targets).
x_train_pos_neg = prepDataRoberta(X_train_pos_neg, tokenizer, train=True, max_len=75)
x_test_pos_neg = prepDataRoberta(X_test_pos_neg, tokenizer, train=True, max_len=75)
train_pos_neg = prepDataRoberta(df_train_pos_neg, tokenizer, train=True, max_len=75)
test_pos_neg = prepDataRoberta(df_test_pos_neg, tokenizer, train=False, max_len=75)

In [0]:
# Spot-check three rows: map the start/end target tokens back to characters
# and print the recovered span next to the original text and label.
for idx in range(15202,15205):
    # Bare tuple expression: inside a loop it is evaluated but nothing is displayed
    train_pos_neg.text_ids_padded[idx], train_pos_neg.offsets[idx], train_pos_neg.targets_start[idx], train_pos_neg.targets_end[idx]

    # Character span: start offset of the first target token .. end offset of the last
    s = train_pos_neg.offsets[idx][train_pos_neg.targets_start[idx]][0]
    e = train_pos_neg.offsets[idx][train_pos_neg.targets_end[idx]][1]
    print (df_train_pos_neg.text[idx])
    print (df_train_pos_neg.selected_text[idx])
    # A single leading whitespace character is stripped from the recovered span before printing
    print (re.sub(r'^\s','',df_train_pos_neg.text_clean[idx][s:e]))
    print ('-------------------------------------------')

##### Tokenize using SentencePiece

In [0]:
class prepDataXLMRoberta():
    """Tokenize tweets with SentencePiece for an XLM-RoBERTa span-extraction model.

    Unlike the sibling BERT/RoBERTa preparers this class reads the raw `text`
    and `selected_text` columns.

    Attributes set by the constructor:
        sp, spt: the SentencePiece processor and proto message passed in.
        text_ids, offsets: unpadded ids and per-token (start, end) offsets.
        text_ids_padded: ids padded/truncated to `max_len` with value 1.
        attention_masks_padded: masks padded/truncated to `max_len` with 0.
        token_type_ids_padded: all-zero int32 array of shape (rows, max_len).
        targets_start, targets_end: only when `train` is true.
    """
    def __init__(self, df, sp, spt, train=True, max_len=0):
        # Keep the processor and proto on the instance: the methods must use
        # the objects that were passed in, not the notebook-level names.
        self.sp = sp
        self.spt = spt
        self.text_ids, self.offsets = self.tokenize_data(df)
        attention_masks = self.gen_attention_masks(self.text_ids)
        self.text_ids_padded = self.pad_data(self.text_ids, max_len)
        # Padded positions must be masked out (0). Padding the mask with the
        # same value as the ids (1) would mark padding as real tokens.
        self.attention_masks_padded = self.pad_data(attention_masks, max_len, value=0)
        self.token_type_ids_padded = np.zeros((df.shape[0],max_len), dtype=np.int32)
        if (train):
            self.targets_start, self.targets_end = self.get_targets(df, self.offsets)
#------------------------------------------------------------------------------------------#
    # Method to tokenize & get offsets
    def get_spt_pieces_data(self, spt):
        """Return (ids, offsets) of the pieces in `spt`, wrapped in ids 1 and 2.

        A (0, 0) offset pair is added for each of the two wrapping ids.
        """
        ids = []
        offsets = []
        for piece in spt.pieces:
            ids.append(piece.id)
            offsets.append((piece.begin, piece.end))
        ids = [1] + ids + [2]
        offsets = [(0,0)] + offsets + [(0,0)]
        return (ids, offsets)
#------------------------------------------------------------------------------------------#
    # Method to tokenize & get offsets using SentencePiece Tokenizer
    def tokenize_data(self, data):
        """Encode every row's `sentiment` and `text` and concatenate the ids.

        Returns:
            (text_ids, offsets): array of id arrays, and a list with one
            offsets list per row. The offsets list is three (0, 0) pairs
            followed by the tweet's offsets, so it lines up with the ids when
            the sentiment encodes to a single piece (assumed, not checked).
        """
        text_ids = []
        offsets = []
        for _, row in data.iterrows():
            self.spt.ParseFromString(self.sp.encode_as_serialized_proto(row.sentiment))
            (sentiment_ids, _) = self.get_spt_pieces_data(self.spt)
            self.spt.ParseFromString(self.sp.encode_as_serialized_proto(row.text))
            (token_ids, token_offsets) = self.get_spt_pieces_data(self.spt)
            text_ids.append(np.array(sentiment_ids + token_ids))
            offsets.append([(0, 0)] * 3 + token_offsets)
        return (np.array(text_ids), offsets)
#------------------------------------------------------------------------------------------#
    # Method to create attention masks
    def gen_attention_masks(self, text_ids):
        """Return one mask per sequence: 1 where the token id is > 0, else 0."""
        attention_masks = []
        for ids in text_ids:
            att_mask = [int(token_id > 0) for token_id in ids]
            attention_masks.append(np.array(att_mask))
        return(np.array(attention_masks))
#------------------------------------------------------------------------------------------#
    # Method to create token type ids
    def gen_token_type_ids(self, text_ids, max_len):
        """Return segment ids of length `max_len` for every sequence.

        The first four positions get 0, the remaining tokens of the sequence
        get 1, and positions up to `max_len` are filled with 0.
        """
        token_type_ids = []
        for ids in text_ids:
            # Leading 4 positions (the sentiment part) => 0, tweet tokens => 1, padding => 0
            tok_type_id = [0]*4 + [1]*(len(ids)-4) + [0]*(max_len-len(ids))
            token_type_ids.append(np.array(tok_type_id))
        return(np.array(token_type_ids))
#------------------------------------------------------------------------------------------#
    # Method to pad data
    def pad_data (self, data, max_len, value=1):
        """Pad or truncate every sequence at the end to `max_len`.

        Args:
            data: sequences to pad.
            max_len: target length.
            value: fill value; defaults to 1, the value used for the ids.
        """
        data_padded = pad_sequences(data, maxlen=max_len, dtype="long",
                                  value=value, truncating="post", padding="post")
        return(data_padded)
#------------------------------------------------------------------------------------------#
    # Method to generate start & end targets based on text vs. selected_text matches
    def get_targets(self, data, offsets):
        """Locate the labelled span of every row in token space.

        Args:
            data: DataFrame with `text` and `selected_text` columns.
            offsets: per-row offsets as returned by `tokenize_data`.

        Returns:
            (targets_start, targets_end): arrays holding, per row, the index
            of the first and the last token overlapping the selected text.

        Raises:
            IndexError: if no token overlaps the selected text, e.g. because
                it does not occur in the tweet.
        """
        targets_start = []
        targets_end = []

        for i, (_, row) in enumerate(data.iterrows()):
            tweet = row.text
            selected_text = row.selected_text
            len_st = len(selected_text)

            # First occurrence of selected_text inside the tweet (-1 if absent)
            idx0 = tweet.find(selected_text)

            # Mark the characters of the tweet covered by selected_text with 1s
            char_targets = [0] * len(tweet)
            if idx0 >= 0:
                for ct in range(idx0, idx0 + len_st):
                    char_targets[ct] = 1

            # Tokens (after the 4 leading positions) that cover at least one marked character
            target_idx = []
            for j, (offset1, offset2) in enumerate(offsets[i][4:]):
                if sum(char_targets[offset1: offset2]) > 0:
                    target_idx.append(j)
            target_start = target_idx[0] + 4
            target_end = target_idx[-1] + 4
            targets_start.append(np.array(target_start))
            targets_end.append(np.array(target_end))

        return (np.array(targets_start), np.array(targets_end))

#------------------------------------------------------------------------------------------#

In [0]:
# Load the SentencePiece model stored with tf-xlm-roberta-base and tokenize every
# split with prepDataXLMRoberta.
# The sys.path entry is added before the imports below.
# NOTE(review): presumably sentencepiece_pb2 lives in that directory; confirm.
import sys
sys.path.insert(0, f"{DATADIR}/models/sentencepiece/")
import sentencepiece as spm
import sentencepiece_pb2
sp = spm.SentencePieceProcessor()
sp.load(f"{DATADIR}/models/tf-xlm-roberta-base/sentencepiece.bpe.model")
spt = sentencepiece_pb2.SentencePieceText()

# NOTE(review): X_test_pos_neg is prepared with train=True, so it presumably is a
# labelled hold-out split of the training data; confirm.
# NOTE(review): max_len is 78 here while the model-creation cells pass 75; verify
# that the padded width matches the model's input length before fitting.
x_train_pos_neg = prepDataXLMRoberta(X_train_pos_neg, sp, spt, train=True, max_len=78)
x_test_pos_neg = prepDataXLMRoberta(X_test_pos_neg, sp, spt, train=True, max_len=78)
train_pos_neg = prepDataXLMRoberta(df_train_pos_neg, sp, spt, train=True, max_len=78)
test_pos_neg = prepDataXLMRoberta(df_test_pos_neg, sp, spt, train=False, max_len=78)

In [0]:
# Sanity check: map the start/end token targets of a few rows back to character
# offsets and print the recovered span next to the labelled selected_text.
# (The bare tuple expression that used to open the loop body was removed: inside
# a `for` loop its value is neither displayed nor stored.)
for idx in range(15202, 15205):
    s = train_pos_neg.offsets[idx][train_pos_neg.targets_start[idx]][0]
    e = train_pos_neg.offsets[idx][train_pos_neg.targets_end[idx]][1]
    print(df_train_pos_neg.text[idx])
    print(df_train_pos_neg.selected_text[idx])
    print(df_train_pos_neg.text[idx][s:e])
    print('-------------------------------------------')

##### Create Roberta Model

In [0]:
def create_roberta_model(max_len, roberta_model_dir, optimizer):
    """Build and compile a RoBERTa model that predicts span start/end positions.

    Parameters
    ----------
    max_len : int
        Length of the padded token-id / attention-mask inputs.
    roberta_model_dir : str
        Directory containing ``config.json`` and ``tf_model.h5``.
    optimizer
        Keras optimizer handed to ``model.compile``.

    Returns
    -------
    tf.keras.Model
        Inputs ``[text_ids, att_masks]``; outputs ``[x1, x2]``, two softmax
        distributions over the input positions (start and end), trained with
        sparse categorical cross-entropy.

    Notes
    -----
    Both outputs use the same head layout (previously labelled "Mod 7"):
    Dropout(0.1) -> GRU(1024, return_sequences) -> LeakyReLU -> Dense(1)
    -> Flatten -> softmax.  Head variants that were tried earlier and used to
    be kept here as commented-out code:

    * Mod 1: Dropout -> Conv1D(1, 1) -> Flatten -> softmax
    * Mod 2: Dropout -> Conv1D(128, 2, same) -> LeakyReLU -> Conv1D(64, 2, same)
      -> Dense(1) -> Flatten -> softmax
    * Mod 3: Dropout -> Conv1D(128|64|32, 1, relu), each followed by Dropout(0.1)
      -> Dense(1) -> Flatten -> softmax
    * Mod 4: Dropout -> LSTM(1024, return_sequences) -> LeakyReLU -> Dense(1)
      -> Flatten -> softmax
    * Mod 5: Dropout -> Conv1D(768, 2, same) -> LeakyReLU -> Dense(1)
      -> Flatten -> softmax
    * Mod 6: Dropout -> Bidirectional(LSTM(768, return_sequences)) -> LeakyReLU
      -> Dense(1) -> Flatten -> softmax

    A three-input variant that also fed ``token_type_ids`` was sketched but never
    enabled; its unused Input layer has been dropped.
    """
    text_ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att_masks = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(f'{roberta_model_dir}/config.json')
    roberta_model = TFRobertaModel.from_pretrained(f'{roberta_model_dir}/tf_model.h5', config=config)

    x = roberta_model(text_ids, attention_mask=att_masks)

    def span_head(features):
        # One independent head per call; layers are created in the same order
        # as the former inline code, so auto-generated layer names are unchanged.
        h = tf.keras.layers.Dropout(0.1)(features)
        h = tf.keras.layers.GRU(1024, return_sequences=True)(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # x[0] is the first element of the transformer output (the per-token states).
    x1 = span_head(x[0])  # start-position distribution
    x2 = span_head(x[0])  # end-position distribution

    model = tf.keras.models.Model(inputs=[text_ids, att_masks], outputs=[x1, x2])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

    return model

In [0]:
# Directory holding the pretrained roberta-base files (config.json / tf_model.h5
# are read from it by create_roberta_model).
roberta_model_dir = f'{DATADIR}/models/roberta-base/'

# Polynomial (power=1.0, i.e. linear) learning-rate schedule. It is currently NOT
# used: the optimizer line that consumes it is commented out below.
# NOTE(review): end_learning_rate (1e-4) is larger than initial_learning_rate
# (3e-5), so enabling this schedule would raise the rate over time; confirm intent.
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-5,
    decay_steps=10000,
    end_learning_rate=0.0001,
    power=1.0,
    cycle=False,
    )
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Constant learning rate of 3e-5 is what is actually used.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# 75 is the max_len (input sequence length) of the model.
model = create_roberta_model(75, roberta_model_dir, optimizer)
model.summary()

In [0]:
model_name = 'r1-model'
# model.save(f'{DATADIR}/models/{model_name}')
# Make sure the target directory exists before writing the HDF5 weights file,
# so the cell also works on a fresh Drive folder.
os.makedirs(f'{DATADIR}/models/{model_name}', exist_ok=True)
model.save_weights(f'{DATADIR}/models/{model_name}/{model_name}.h5')


In [0]:
# del(model)
# model = tf.keras.models.load_model(f'{DATADIR}/models/r1-model')

##### Create XLM-Roberta Model

In [0]:
def create_xlm_roberta_model(max_len, xlm_roberta_model_dir, optimizer):
    """Build and compile an XLM-RoBERTa model that predicts span start/end positions.

    Parameters
    ----------
    max_len : int
        Length of the padded token-id / attention-mask inputs.
    xlm_roberta_model_dir : str
        Directory containing ``config.json`` and ``tf_model.h5``.
    optimizer
        Keras optimizer handed to ``model.compile``.

    Returns
    -------
    tf.keras.Model
        Inputs ``[text_ids, att_masks]``; outputs ``[x1, x2]``, two softmax
        distributions over the input positions (start and end), trained with
        sparse categorical cross-entropy.

    Notes
    -----
    Both outputs use the same head layout (previously labelled "Mod 7"):
    Dropout(0.1) -> GRU(1024, return_sequences) -> LeakyReLU -> Dense(1)
    -> Flatten -> softmax.  Head variants that were tried earlier and used to
    be kept here as commented-out code:

    * Mod 1: Dropout -> Conv1D(1, 1) -> Flatten -> softmax
    * Mod 2: Dropout -> Conv1D(128, 2, same) -> LeakyReLU -> Conv1D(64, 2, same)
      -> Dense(1) -> Flatten -> softmax
    * Mod 3: Dropout -> Conv1D(128|64|32, 1, relu), each followed by Dropout(0.1)
      -> Dense(1) -> Flatten -> softmax
    * Mod 4: Dropout -> LSTM(1024, return_sequences) -> LeakyReLU -> Dense(1)
      -> Flatten -> softmax
    * Mod 5: Dropout -> Conv1D(768, 2, same) -> LeakyReLU -> Dense(1)
      -> Flatten -> softmax
    * Mod 6: Dropout -> Bidirectional(LSTM(768, return_sequences)) -> LeakyReLU
      -> Dense(1) -> Flatten -> softmax

    A three-input variant that also fed ``token_type_ids`` was sketched but never
    enabled; its unused Input layer has been dropped.
    """
    text_ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att_masks = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    config = XLMRobertaConfig.from_pretrained(f'{xlm_roberta_model_dir}/config.json')
    xlm_roberta_model = TFXLMRobertaModel.from_pretrained(f'{xlm_roberta_model_dir}/tf_model.h5', config=config)

    x = xlm_roberta_model(text_ids, attention_mask=att_masks)

    def span_head(features):
        # One independent head per call; layers are created in the same order
        # as the former inline code, so auto-generated layer names are unchanged.
        h = tf.keras.layers.Dropout(0.1)(features)
        h = tf.keras.layers.GRU(1024, return_sequences=True)(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # x[0] is the first element of the transformer output (the per-token states).
    x1 = span_head(x[0])  # start-position distribution
    x2 = span_head(x[0])  # end-position distribution

    model = tf.keras.models.Model(inputs=[text_ids, att_masks], outputs=[x1, x2])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

    return model

In [0]:
# Directory holding the pretrained tf-xlm-roberta-base files (config.json /
# tf_model.h5 are read from it by create_xlm_roberta_model).
xlm_roberta_model_dir = f'{DATADIR}/models/tf-xlm-roberta-base/'

# Polynomial (power=1.0, i.e. linear) learning-rate schedule. It is currently NOT
# used: the optimizer line that consumes it is commented out below.
# NOTE(review): end_learning_rate (1e-4) is larger than initial_learning_rate
# (3e-5), so enabling this schedule would raise the rate over time; confirm intent.
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-5,
    decay_steps=10000,
    end_learning_rate=0.0001,
    power=1.0,
    cycle=False,
    )
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Constant learning rate of 3e-5 is what is actually used.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# NOTE(review): the model is built with max_len=75 whereas the XLM-RoBERTa data
# above is prepared with max_len=78; confirm the two agree before fitting.
# This rebinds the global `model` that the training/prediction cells use.
model = create_xlm_roberta_model(75, xlm_roberta_model_dir, optimizer)
model.summary()

In [0]:
# Save the current `model` with Keras `model.save` under DATADIR/models/<model_name>.
# NOTE(review): 'r1-model' is also the name used when saving the RoBERTa weights
# in an earlier cell, so both artifacts land in the same folder; confirm whether a
# distinct name was intended for the XLM-RoBERTa model.
model_name = 'r1-model'
model.save(f'{DATADIR}/models/{model_name}')

#### Albert

##### Tokenize using AlbertTokenizer

In [0]:
class prepDataBert():
    """Tokenize, pad and (optionally) label tweets for a BERT/ALBERT-style model.

    Attributes set by ``__init__``:
      tokenizer              -- the tokenizer passed in (must offer ``encode_plus``)
      max_len                -- padded sequence length
      text_ids               -- un-padded token ids per tweet
      text_ids_padded        -- token ids padded/truncated to ``max_len``
      attention_masks_padded -- attention masks padded/truncated to ``max_len``
    and, when ``train`` is true:
      selected_text_ids      -- un-padded token ids of the selected text
      targets_start/_end     -- one-hot start / end targets, shape (rows, max_len)
      textIDs                -- ids of rows whose target had to be approximated
                                or could not be located at all
    """
    def __init__(self, df, tokenizer, train=True, max_len=0):
        """Prepare model inputs from ``df``.

        df must provide ``text_clean`` and, when ``train`` is true, also
        ``sel_text_clean`` and ``textID``.
        """
        # Keep the tokenizer that was passed in; tokenize_data used to read a
        # global named `tokenizer` instead, silently ignoring this argument.
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_ids, attention_masks = self.tokenize_data(df.text_clean)
        #self.max_len = max([len(text) for text in self.text_ids])
        self.text_ids_padded = self.pad_data(self.text_ids, self.max_len)
        self.attention_masks_padded = self.pad_data(attention_masks, self.max_len)
        if (train):
            self.selected_text_ids, _ = self.tokenize_data(df.sel_text_clean)
            self.targets_start, self.targets_end, self.textIDs = self.get_targets(
                self.text_ids,
                self.selected_text_ids,
                df.textID,
                self.max_len)
#------------------------------------------------------------------------------------------#
    # Helper to pack per-sentence lists into one numpy array
    @staticmethod
    def _as_array(seqs):
        """Return ``seqs`` as an array, also when the inner lists differ in length.

        Equal-length input gives a regular 2-D array; ragged input gives a 1-D
        object array of lists. Spelling out ``dtype=object`` is required because
        recent NumPy releases refuse to build ragged arrays implicitly.
        """
        if len({len(s) for s in seqs}) <= 1:
            return np.array(seqs)
        return np.array(seqs, dtype=object)
#------------------------------------------------------------------------------------------#
    # Method to tokenize data using AlbertTokenizer    
    def tokenize_data(self, data):
        """Encode every sentence with ``self.tokenizer.encode_plus``.

        Returns ``(input_ids, attention_masks)``, one un-padded entry per sentence.
        """
        input_ids = []
        attention_masks = []
        for sent in data:
            t = self.tokenizer.encode_plus(sent)
            input_ids.append(t['input_ids'])
            attention_masks.append(t['attention_mask'])
        return (self._as_array(input_ids), self._as_array(attention_masks))
#------------------------------------------------------------------------------------------#
    # Method to pad data
    def pad_data (self, data, max_len):
        """Pad with 0 / truncate every sequence at the end to ``max_len``."""
        data_padded = pad_sequences(data, maxlen=max_len, dtype="long", 
                                  value=0, truncating="post", padding="post")
        return(data_padded)
#------------------------------------------------------------------------------------------#
    # Method to generate targets based on text vs. selected_text matches
    def get_targets(self, data1, data2, data3, max_len):
        """Build one-hot start/end targets from token-id membership.

        data1 -- token ids of each text
        data2 -- token ids of each selected text; the first and last id are
                 dropped before matching (NOTE(review): presumably the special
                 tokens added by the tokenizer; confirm)
        data3 -- row identifiers, reported back for problematic rows
        max_len -- length of each target vector

        A text position "matches" when its id occurs anywhere in the selected
        ids. The start target is the first match; the end target is the
        len(selected)-th match, or the last match when there are fewer (the row
        id is then recorded in ``textIDs``). Matches at positions >= ``max_len``
        are ignored. Rows without any usable match get all-zero targets and are
        recorded in ``textIDs`` as well (previously this raised IndexError from
        inside the bare ``except`` handler).

        Returns ``(targets_start, targets_end, textIDs)``.
        """
        self.targets_start = []
        self.targets_end = []
        self.textIDs = []
        for text_ids, sel_text_ids, textID in zip(data1, data2, data3):
            b = sel_text_ids[1:-1]
            matched_indices = np.flatnonzero(np.isin(text_ids, b))
            # Positions beyond the padded length cannot be represented.
            matched_indices = matched_indices[matched_indices < max_len]
            target_start = np.zeros((max_len,), dtype=int)
            target_end = np.zeros((max_len,), dtype=int)
            if matched_indices.size == 0:
                self.textIDs.append(textID)
            else:
                target_start[matched_indices[0]] = 1
                if len(b) <= matched_indices.size:
                    target_end[matched_indices[len(b) - 1]] = 1
                else:
                    # Fewer matches than selected tokens: fall back to the last one.
                    self.textIDs.append(textID)
                    target_end[matched_indices[-1]] = 1
            self.targets_start.append(np.array(target_start))
            self.targets_end.append(np.array(target_end))
        return(np.array(self.targets_start), np.array(self.targets_end), self.textIDs)
        
#------------------------------------------------------------------------------------------#

In [0]:
# Load the ALBERT tokenizer from the local albert-large-v1 directory.
tokenizer = AlbertTokenizer.from_pretrained(f'{DATADIR}/models/albert-large-v1/')

# Prepare positive and negative tweets separately, all padded to 95 tokens.
# Targets are only generated for the training frames (train=True).
train_pos = prepDataBert(df_train_pos, tokenizer, train=True, max_len=95)
train_neg = prepDataBert(df_train_neg, tokenizer, train=True, max_len=95)
test_pos = prepDataBert(df_test_pos, tokenizer, train=False, max_len=95)
test_neg = prepDataBert(df_test_neg, tokenizer, train=False, max_len=95)

In [0]:
# Scratch check: encode a pair of inputs with AlbertTokenizer.encode_plus and
# display the result.
# NOTE(review): each argument is a one-element list rather than a plain string;
# confirm this is the intended input form.
at = AlbertTokenizer.from_pretrained(f'{DATADIR}/models/albert-large-v1/')
at.encode_plus(['This is a test'], ['This is a a test also'])

In [0]:
# Scratch check: display what token id 3 decodes to with the ALBERT tokenizer.
at.decode([3])

#### Train & checkpoint the model on data

In [0]:
def FitModelCheckpoint(checkpoint_path, data, epochs, model=None,
                       batch_size=16, validation_split=0.2):
  """Fit a model on ``data`` while checkpointing its weights.

  Parameters
  ----------
  checkpoint_path : str
      Where the ModelCheckpoint callback writes the weights.
  data
      Object exposing ``text_ids_padded``, ``attention_masks_padded``,
      ``targets_start`` and ``targets_end``.
  epochs : int
      Number of training epochs.
  model : optional
      Model to train. When omitted, the notebook-level global ``model`` is used,
      which is what this function always did before the parameter existed.
  batch_size : int, default 16
  validation_split : float, default 0.2
      Fraction of ``data`` handed to Keras as validation data.

  Returns
  -------
  The object returned by ``model.fit`` (the training history).

  Raises
  ------
  NameError
      If ``model`` is omitted and no global ``model`` exists.
  """
  if model is None:
    # Backward-compatible fallback to the hidden global the old version relied on.
    try:
      model = globals()['model']
    except KeyError:
      raise NameError("name 'model' is not defined") from None

  # Create a callback that saves the model's weights
  cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                   save_weights_only=True,
                                                   verbose=1)

  # Train the model with the new callback.
  # (A variant that additionally fed data.token_type_ids_padded as a third input
  # was previously kept here as commented-out code.)
  history = model.fit([data.text_ids_padded, data.attention_masks_padded],
                      [data.targets_start, data.targets_end],
                      epochs=epochs,
                      validation_split=validation_split,
                      batch_size=batch_size,
                      shuffle=True,
                      callbacks=[cp_callback])  # Pass callback to training

  return history

In [0]:
# Training run configuration: which prepared data set to fit on and for how long.
data = x_train_pos_neg
# data = train_pos_neg
epochs = 3

# checkpoint_dir = f'{DATADIR}/models/bert9-training'
checkpoint_dir = f'{DATADIR}/models/r4-1-training'
# Create the checkpoint folder if needed (no separate existence check required).
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_path = f'{checkpoint_dir}/cp.ckpt'
print (checkpoint_path)

# Trains the notebook-level `model` and keeps the history for plotting.
history = FitModelCheckpoint(checkpoint_path, data, epochs)

In [0]:
!ls '/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/r2-1-training/'

In [0]:
def plot_hist(history):
  """Plot training vs. validation loss per epoch on the current pyplot figure.

  ``history`` must expose a ``history`` mapping with the keys 'loss' and
  'val_loss'. The figure is not shown here; the caller decides when to render.
  """
  # Look both series up first so a missing key fails before anything is drawn.
  train_loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))
  plt.xlabel('Epochs')
  plt.plot(train_loss, color='Blue', linestyle='dashed', marker='o', label='Training Loss')
  plt.plot(val_loss, color='Red', label='Validation Loss')
  plt.legend()

# Draw the loss curves of the most recent training run and render the figure.
plot_hist(history)
plt.show()

In [0]:
# !tar -zcvf "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/roberta-base.tar.gz" "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/roberta-base"

# !tar -zcvf "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/bert9-model.tar.gz" "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/bert9-model"
# !tar -zcvf "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/bert9-training.tar.gz" "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/bert9-training"

# !tar -zcvf "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/r1-model.tar.gz" "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/r1-model"
!tar -zcvf "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/r3-1-training.tar.gz" "/content/gdrive/My Drive/Colab Notebooks/TweetSentiment/data/models/r3-1-training"

#### Predict

In [0]:
# model_pos = tf.keras.models.load_model(f'{DATADIR}/models/bert4-model')
# model_neg = tf.keras.models.load_model(f'{DATADIR}/models/bert4-model')
# model_pos.load_weights(f'{DATADIR}/models/bert4-training-pos/cp.ckpt')
# model_neg.load_weights(f'{DATADIR}/models/bert4-training-neg/cp.ckpt')


# model = tf.keras.models.load_model(f'{DATADIR}/models/bert5-model')
# model.load_weights(f'{DATADIR}/models/bert5-training/cp.ckpt')

# model = tf.keras.models.load_model(f'{DATADIR}/models/r1-model')
# Restore checkpointed weights into the current `model`. The path is built from
# DATADIR like every other cell instead of repeating the absolute Drive path.
# NOTE(review): this loads the 'r2-1-training' checkpoint while the training cell
# writes to 'r4-1-training'; confirm which run is meant to be evaluated.
model.load_weights(f'{DATADIR}/models/r2-1-training/cp.ckpt')

In [0]:
# Predict the selected span for every row of `df`.
#   selected_text -- predicted text per row, in row order
#   bad_preds     -- positions (within tok_data) whose predicted token index
#                    could not be mapped to a character offset
#   i             -- position within tok_data; it only advances on non-neutral
#                    rows, since neutral rows are answered with the full text
selected_text = []
bad_preds = []
i = 0
# df = df_test
# tok_data = test_pos_neg
# df = df_train
# tok_data = train_pos_neg
df = X_test
tok_data = x_test_pos_neg
roberta = True

preds = model.predict([tok_data.text_ids_padded, tok_data.attention_masks_padded])
# preds = model.predict([tok_data.text_ids_padded, tok_data.attention_masks_padded, tok_data.token_type_ids_padded])


for idx, row in df.iterrows():
  if row.sentiment == 'neutral':
    selected_text.append(row.text_clean)
    continue

  # Most probable start / end token for this tweet.
  target_start = preds[0][i].argmax()
  target_end = preds[1][i].argmax()
  try:
    start = tok_data.offsets[i][target_start][0]
    end = tok_data.offsets[i][target_end][1]
  except LookupError:
    # Predicted token index has no offset entry: fall back to the whole text.
    bad_preds.append(i)
    selected_text.append(row.text_clean)
  else:
    # NOTE(review): offsets are applied to `text_clean` here; confirm they were
    # computed on the same column during tokenization.
    span = row.text_clean[start:end]
    # For the RoBERTa-style tokenizers, drop one leading whitespace character.
    selected_text.append(re.sub(r'^\s','',span) if roberta else span)
  i += 1

df['selected_text_pred'] = selected_text
print (f'Bad predictions = {len(bad_preds)}')

# Function to compute Jaccard score 
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting word sets.

    Returns a float in [0, 1], or 0 when both strings contain no words
    (the case that used to be handled by a bare ``except`` around the division).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = a | b
    if not union:
        return 0
    return float(len(a & b)) / len(union)

# Score the predictions against the labels, unless `df` is df_test itself.
if not df.equals(df_test):
  jaccard_score = [
      jaccard(row.selected_text, row.selected_text_pred)
      for _, row in df.iterrows()
  ]
  df['jaccard_score'] = jaccard_score
  print (f'Overall Jaccard score = {np.mean(df.jaccard_score):.2f}')

In [0]:
# Write the submission file with the columns `textID` and `selected_text`.
# The values must be the model's predictions (`selected_text_pred`), not the
# ground-truth `selected_text` column, so the prediction column is renamed.
df_submission = (
    df[['textID', 'selected_text_pred']]
    .rename(columns={'selected_text_pred': 'selected_text'})
)
df_submission.to_csv('submission.csv', index=False)
df_submission.head()