# IMPORTANT NOTE

## Continuation of part 1 notebook. Contains model framework and BERT

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Add progress bars everywhere!!!!!!!!!!!!!!!!!!!!!
from tqdm import tqdm
tqdm.pandas()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sarcasm/bert_embeddings_768.parquet
/kaggle/input/sarcasm/non_sarcasm_tweets.csv
/kaggle/input/sarcasm/sarcasm_tweets.csv
/kaggle/input/sarcasm/clean_raw_data.parquet
/kaggle/input/artkdata-private/ATRK dataset.csv
/kaggle/input/artkdata-private/clean_raw_data.parquet


# Import libraries

In [2]:
# Set random seed

SEED = 1234509876

# Importing basic libraries
from zipfile import ZipFile
import os, sys
import re
import gc
import time
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import json 
from string import punctuation
import pyarrow as pa
import pyarrow.parquet as pq

%matplotlib inline

# Import NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# # Preprocessing
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# from sklearn.decomposition import PCA
# from sklearn.metrics import f1_score

from wordcloud import WordCloud
# Import models

# import lightgbm as lgb
# from sklearn.cluster import KMeans
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB

# Model selection
# from sklearn.model_selection import RandomizedSearchCV

# Others

from tqdm import tqdm_notebook #Loads progressbars for various loops

import warnings

warnings.filterwarnings('ignore')

#####################
#Useful pandas settings

pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 160)
pd.set_option('display.max_colwidth', 40)
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Custom Functions

Collecting all functions here for easy reference and update

In [3]:
################################################################################################
# Downcasting function for pandas dataframes

def downcast_dtypes(df):
    '''
    Changes column types in the dataframe:             
      `float64` type to lowest possible float without data loss
      `int64`   type to lowest possible int wihtout data loss
    '''

    # Select columns to downcast
    float_cols = [col for col in df if df[col].dtype == "float64"]
    int_cols =   [col for col in df if df[col].dtype == "int64"]

    # Downcast columns using to numeric function
    df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')

    # remove variables from memory to avoid issues

    del float_cols
    del int_cols

    return df

################################################################################################
# Check duplication at given level of dataframe

def check_dups(df, cols):

    orig_count_rows = df.shape[0]

    temp = df.groupby(cols).size().reset_index(name = 'counts')

    dedup_count_rows = temp.shape[0]

    if orig_count_rows == dedup_count_rows:
        print("No duplicates. Dataframe is unique at given level")
        print("# of unique entries: n=",orig_count_rows)
    else:
        print("Duplicates found. Dataframe is not unique at given level")
        print("# of entries in original dataset: n=", orig_count_rows)
        print("# of unique entries expected in deduped dataset: n=", dedup_count_rows)
        print("# of addational entries: n=", orig_count_rows - dedup_count_rows)

    del orig_count_rows, temp, dedup_count_rows

#####################################################################################
# Plotting classification features
def fancy_plot(df):
    column_names = list(df.columns.values)
    frauds = df[df['Class'] == 1]
    no_frauds = df[df['Class'] == 0]

    plt.figure()
    fig, ax = plt.subplots(8,4,figsize=(16,28))
    i = 0
    for feature in column_names:
        i += 1
        plt.subplot(8,4,i)
        sns.kdeplot(frauds[feature])
        sns.kdeplot(no_frauds[feature])
        plt.xlabel(feature, fontsize=10)
        locs, labels = plt.xticks()
        plt.tick_params(axis='both', which='major', labelsize=12)
    plt.show();

####################################################################################

########################################
#Custom function to apply functions to dataframe with missing values
def impute_missing(df, func, target_col, new_col_name):
    df.loc[~df[target_col].isnull(),new_col_name] = df.loc[~df[target_col].isnull(),target_col].apply(func)


# Import dataset - continue from P1

In [4]:
# Import code for resume
# temp = pq.read_table('/kaggle/input/sarcasm/clean_raw_data.parquet')

temp = pq.read_table('/kaggle/input/artkdata-private/clean_raw_data.parquet')

raw_data = temp.to_pandas()

In [5]:
# raw_data.loc[raw_data.labels != 'sarcasm',:]
raw_data.shape

(30000, 6)

In [6]:
raw_data.tail()

Unnamed: 0,text,labels,clean_text,emoji_list,clean_text_lem,clean_text_lem_stop
29995,ALL WHITE Party UP??blessed Toshlove...,non-sarcasm,ALL WHITE Party UP??blessed Toshlove...,[],ALL WHITE Party UP??blessed Toshlove...,ALL WHITE Party UP??blessed Toshlove...
29996,REPLY @Sumeru_Infra For more details...,non-sarcasm,REPLY SOME_ENTITY infra For more det...,[],REPLY SOME_ENTITY infra For more det...,REPLY SOME_ENTITY infra For detail c...
29997,07/09::::::Happy Birthday CHRISSIE H...,non-sarcasm,07/09::::::Happy Birthday christie h...,[],07/09::::::Happy Birthday christie h...,07/09::::::Happy Birthday christie h...
29998,RT @SamanthaBrook75: Learn to be Hap...,non-sarcasm,it SOME_ENTITY : Learn to be happy S...,[],it SOME_ENTITY : Learn to be happy S...,SOME_ENTITY : Learn happy Sign recei...
29999,RT @MichaelLuiMusic: Have a great we...,non-sarcasm,it SOME_ENTITY : Have a great weeken...,[],it SOME_ENTITY : Have a great weeken...,SOME_ENTITY : Have great weekend eve...


# Extract features from pretrained BERT model



In [7]:
# !pip install transformers

In [8]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Copy of data with needed fields
bert_test_data = raw_data.loc[:,['clean_text']]

In [10]:
bert_test_data.dtypes

clean_text    object
dtype: object

In [11]:
# bert_test_data.shape
# bert_test_data = bert_test_data.drop_duplicates(subset = ['clean_text'])

bert_test_data.head()

Unnamed: 0,clean_text
0,Youll notice on the new episode comi...
1,SOME_ENTITY SOME_ENTITY Well arent y...
2,SOME_ENTITY SOME_ENTITY SOME_ENTITY ...
3,"Merlin for the . and if you like , a..."
4,You spin me right round wholesale I ...


In [12]:
bert_test_data.shape

(30000, 1)

In [13]:
raw_data.shape

(30000, 6)

## Adding tags for BERT

In [14]:
bert_test_data['clean_text'] = bert_test_data['clean_text'].progress_apply(lambda tweet: "[CLS] " + tweet + " [SEP]")
bert_test_data.head()

100%|██████████| 30000/30000 [00:00<00:00, 611572.08it/s]


Unnamed: 0,clean_text
0,[CLS] Youll notice on the new episod...
1,[CLS] SOME_ENTITY SOME_ENTITY Well a...
2,[CLS] SOME_ENTITY SOME_ENTITY SOME_E...
3,[CLS] Merlin for the . and if you li...
4,[CLS] You spin me right round wholes...


## Generate BERT tokens

In [15]:
bert_test_data['token_text'] = bert_test_data['clean_text'].progress_apply(tokenizer.tokenize)
bert_test_data.head()

# Map the token strings to their vocabulary indeces.
bert_test_data['indexed_tokens'] = bert_test_data['token_text'].progress_apply(tokenizer.convert_tokens_to_ids)
bert_test_data.head()

100%|██████████| 30000/30000 [00:20<00:00, 1480.28it/s]
100%|██████████| 30000/30000 [00:01<00:00, 29641.77it/s]


Unnamed: 0,clean_text,token_text,indexed_tokens
0,[CLS] Youll notice on the new episod...,"[[CLS], you, ##ll, notice, on, the, ...","[101, 2017, 3363, 5060, 2006, 1996, ..."
1,[CLS] SOME_ENTITY SOME_ENTITY Well a...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ..."
2,[CLS] SOME_ENTITY SOME_ENTITY SOME_E...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ..."
3,[CLS] Merlin for the . and if you li...,"[[CLS], merlin, for, the, ., and, if...","[101, 15993, 2005, 1996, 1012, 1998,..."
4,[CLS] You spin me right round wholes...,"[[CLS], you, spin, me, right, round,...","[101, 2017, 6714, 2033, 2157, 2461, ..."


In [16]:
# BERT expects pairs of sentence for matching. Since we are not doing that, we ignore 0 tag and give inputs the 1 tag

bert_test_data['segment_ids'] = bert_test_data['token_text'].progress_apply(lambda tweet: [1]*len(tweet))

bert_test_data.head()

100%|██████████| 30000/30000 [00:00<00:00, 418217.57it/s]


Unnamed: 0,clean_text,token_text,indexed_tokens,segment_ids
0,[CLS] Youll notice on the new episod...,"[[CLS], you, ##ll, notice, on, the, ...","[101, 2017, 3363, 5060, 2006, 1996, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,[CLS] SOME_ENTITY SOME_ENTITY Well a...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,[CLS] SOME_ENTITY SOME_ENTITY SOME_E...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
3,[CLS] Merlin for the . and if you li...,"[[CLS], merlin, for, the, ., and, if...","[101, 15993, 2005, 1996, 1012, 1998,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,[CLS] You spin me right round wholes...,"[[CLS], you, spin, me, right, round,...","[101, 2017, 6714, 2033, 2157, 2461, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."


In [17]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
    

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [18]:
# Use the model to extract the information from final layers

def extract_bert_embeddings(indexed_tokens, segment_ids):
    tokens_tensor = torch.tensor([indexed_tokens])
    segment_tensor = torch.tensor([segment_ids]) 
    # print("Token size: %d | Segment size: %d" %(len(indexed_tokens), len(segment_ids)))
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segment_tensor)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]

        # Extract the mean of last 2nd layer as word embeddings.
        # Can tweak this to get mean of last 4 layers or a concatenation
        embedding = torch.mean(hidden_states[-2][0], dim = 0) 

        # print(embedding)
        return embedding.tolist()


In [19]:
bert_test_data.head()

Unnamed: 0,clean_text,token_text,indexed_tokens,segment_ids
0,[CLS] Youll notice on the new episod...,"[[CLS], you, ##ll, notice, on, the, ...","[101, 2017, 3363, 5060, 2006, 1996, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,[CLS] SOME_ENTITY SOME_ENTITY Well a...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,[CLS] SOME_ENTITY SOME_ENTITY SOME_E...,"[[CLS], some, _, entity, some, _, en...","[101, 2070, 1035, 9178, 2070, 1035, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
3,[CLS] Merlin for the . and if you li...,"[[CLS], merlin, for, the, ., and, if...","[101, 15993, 2005, 1996, 1012, 1998,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,[CLS] You spin me right round wholes...,"[[CLS], you, spin, me, right, round,...","[101, 2017, 6714, 2033, 2157, 2461, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."


In [20]:
# bert_test_data.assign(bert_embedding_1_layer = lambda info: extract_bert_embeddings(info['indexed_tokens'], info['segment_ids']))
# bert_test_data.head()

error_idx = list()
temp = bert_test_data.loc[:,['indexed_tokens','segment_ids']]

# Running in batches to prevent complete loss of data
for i in tqdm(range(0,30000,10)):
    start = i
    end = i+9
#     temp.loc[start:end,'bert_emb_768'] = temp.loc[start:end,:].apply(lambda x: extract_bert_embeddings(x.indexed_tokens, x.segment_ids), axis=1)
    try:
        temp.loc[start:end,'bert_emb_768'] = temp.loc[start:end,:].apply(lambda x: extract_bert_embeddings(x.indexed_tokens, x.segment_ids), axis=1)
    except RuntimeError:
        print(f'Error at index b/w {start} and {end}. Skipped processing')
        continue

# OLD CODE
# for indexed_tokens,segment_ids in tqdm(zip(bert_test_data['indexed_tokens'], bert_test_data['segment_ids'])):
#     bert_word_embeddings.append(extract_bert_embeddings(indexed_tokens, segment_ids))


 57%|█████▋    | 1698/3000 [20:05<12:33,  1.73it/s]

Error at index b/w 16970 and 16979. Skipped processing


 65%|██████▍   | 1938/3000 [22:57<11:57,  1.48it/s]

Error at index b/w 19380 and 19389. Skipped processing


 80%|███████▉  | 2392/3000 [28:11<06:32,  1.55it/s]

Error at index b/w 23910 and 23919. Skipped processing


 80%|████████  | 2407/3000 [28:21<06:53,  1.43it/s]

Error at index b/w 24070 and 24079. Skipped processing


 84%|████████▍ | 2521/3000 [29:35<05:13,  1.53it/s]

Error at index b/w 25210 and 25219. Skipped processing


 85%|████████▌ | 2563/3000 [30:00<04:22,  1.67it/s]

Error at index b/w 25620 and 25629. Skipped processing


 87%|████████▋ | 2602/3000 [30:25<03:44,  1.77it/s]

Error at index b/w 26010 and 26019. Skipped processing


 87%|████████▋ | 2604/3000 [30:26<03:47,  1.74it/s]

Error at index b/w 26030 and 26039. Skipped processing


 88%|████████▊ | 2653/3000 [30:55<03:15,  1.77it/s]

Error at index b/w 26520 and 26529. Skipped processing


100%|██████████| 3000/3000 [34:35<00:00,  1.45it/s]


# Combine with main data and export

In [21]:
raw_data['bert_emb_768'] = temp['bert_emb_768']

In [22]:
# Export the numpy array
# Need to convert to pandas for export

raw_data.to_parquet(path = '/kaggle/working/clean_data_p2_w_bert.parquet')