# IMPORTANT NOTE

## Continuation of the part 1 notebook. Contains the model framework and BERT feature extraction

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sarcasm/non_sarcasm_tweets.csv
/kaggle/input/sarcasm/sarcasm_tweets.csv
/kaggle/input/sarcasm/clean_raw_data.parquet


# Import libraries

In [2]:
# Set random seed
# NOTE(review): none of the cells visible here pass SEED to numpy, random or torch —
# confirm whether it is consumed elsewhere or should be applied explicitly.

SEED = 1234509876

# Importing basic libraries
from zipfile import ZipFile
import os, sys
import re
import gc
import time
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import json 
from string import punctuation
import pyarrow as pa
import pyarrow.parquet as pq

%matplotlib inline

# Import NLTK
import nltk
# Fetch the NLTK resources one after the other, in the same order as before
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# # Preprocessing
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# from sklearn.decomposition import PCA
# from sklearn.metrics import f1_score

from wordcloud import WordCloud
# Import models

# import lightgbm as lgb
# from sklearn.cluster import KMeans
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB

# Model selection
# from sklearn.model_selection import RandomizedSearchCV

# Others

from tqdm import tqdm_notebook #Loads progressbars for various loops

import warnings

# Silence warnings for the rest of the notebook
warnings.filterwarnings('ignore')

#####################
# Useful pandas settings

# (option name, value) pairs applied in order
for option_name, option_value in (
    ('display.max_rows', 400),
    ('display.max_columns', 160),
    ('display.max_colwidth', 40),
):
    pd.set_option(option_name, option_value)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Custom Functions

All custom helper functions are collected here for easy reference and updating.

In [3]:
################################################################################################
# Downcasting function for pandas dataframes

def downcast_dtypes(df):
    '''
    Shrink the numeric columns of a dataframe.

    Every `float64` column is passed through `pd.to_numeric(..., downcast='float')`
    and every `int64` column through `pd.to_numeric(..., downcast='integer')`.
    Columns of any other dtype are left alone.

    The dataframe is modified in place and the same object is returned.
    '''

    def _columns_with_dtype(dtype_name):
        # Names of the columns whose dtype equals `dtype_name`
        return [name for name in df if df[name].dtype == dtype_name]

    # Collect both groups before any column is rewritten
    wide_float_cols = _columns_with_dtype("float64")
    wide_int_cols = _columns_with_dtype("int64")

    df[wide_float_cols] = df[wide_float_cols].apply(pd.to_numeric, downcast='float')
    df[wide_int_cols] = df[wide_int_cols].apply(pd.to_numeric, downcast='integer')

    return df

################################################################################################
# Check duplication at given level of dataframe

def check_dups(df, cols):
    '''
    Print whether `df` is unique at the level of the key column(s) `cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    cols : column label or list of column labels
        Columns that together form the key being checked.

    Returns
    -------
    None
        The result is only printed: the row count when the frame is unique,
        otherwise the original count, the count of distinct keys and the
        difference between the two.
    '''

    orig_count_rows = df.shape[0]

    # duplicated() flags every repeat of a key after its first occurrence and
    # treats missing values as equal to each other. The earlier
    # groupby(cols).size() count silently dropped rows whose key contained a
    # missing value, so such rows were reported as duplicates.
    repeated_rows = int(df.duplicated(subset=cols).sum())
    dedup_count_rows = orig_count_rows - repeated_rows

    if orig_count_rows == dedup_count_rows:
        print("No duplicates. Dataframe is unique at given level")
        print("# of unique entries: n=",orig_count_rows)
    else:
        print("Duplicates found. Dataframe is not unique at given level")
        print("# of entries in original dataset: n=", orig_count_rows)
        print("# of unique entries expected in deduped dataset: n=", dedup_count_rows)
        print("# of addational entries: n=", orig_count_rows - dedup_count_rows)

#####################################################################################
# Plotting classification features
def fancy_plot(df, label_col='Class', n_cols=4):
    '''
    Draw one density panel per feature, comparing the two classes.

    For every column of `df` other than `label_col`, the kernel density of the
    rows where `label_col == 1` is drawn on the same axes as the density of the
    rows where `label_col == 0`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `label_col`. The remaining columns are handed to
        `sns.kdeplot` as they are.
    label_col : str, default 'Class'
        Column holding the 0/1 class label.
    n_cols : int, default 4
        Number of panels per row.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    '''
    # The label column is constant within each class, so it is not plotted.
    feature_names = [col for col in df.columns if col != label_col]
    frauds = df[df[label_col] == 1]
    no_frauds = df[df[label_col] == 0]

    # Keep the original 8-row layout as the minimum and add rows when there are
    # more features than panels; ceiling division without importing math.
    n_rows = max(8, -(-len(feature_names) // n_cols))

    # A single subplots() call creates the figure; squeeze=False guarantees a
    # 2-D array of axes even when n_cols is 1.
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(4 * n_cols, 3.5 * n_rows),
        squeeze=False,
    )

    for ax, feature in zip(axes.ravel(), feature_names):
        sns.kdeplot(frauds[feature], ax=ax)
        sns.kdeplot(no_frauds[feature], ax=ax)
        ax.set_xlabel(feature, fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=12)
    plt.show();

####################################################################################

########################################
#Custom function to apply functions to dataframe with missing values
def impute_missing(df, func, target_col, new_col_name):
    '''
    Apply `func` to the non-null entries of `target_col` and store the results
    in `new_col_name`, modifying `df` in place.

    Rows where `target_col` is null are not passed to `func` and are not
    written to. Nothing is returned.
    '''
    has_value = df[target_col].notnull()
    transformed = df.loc[has_value, target_col].apply(func)
    df.loc[has_value, new_col_name] = transformed


# Import dataset - continued from part 1

In [4]:
# Resume point: read the cleaned tweet table back from parquet
temp = pq.read_table(source='/kaggle/input/sarcasm/clean_raw_data.parquet')

# Convert the Arrow table to a pandas dataframe for the rest of the notebook
raw_data = temp.to_pandas()

In [5]:
# Preview the first five rows
raw_data.head(n=5)

Unnamed: 0,id,text,labels,clean_text,emoji_list,clean_text_lem,clean_text_lem_stop
0,1626008355558035456,rt @kangaroos991: diversity? job do...,sarcasm,it SOME_ENTITY : diversity job done,[],it SOME_ENTITY : diversity job done,SOME_ENTITY : diversity job done
1,1626008180173213696,@pierrepoilievre for 7 years &amp; 3...,sarcasm,SOME_ENTITY for 7 years camp 3 month...,[],SOME_ENTITY for 7 year camp 3 month ...,SOME_ENTITY 7 year camp 3 month roun...
2,1626007897024126976,diversity? job done! #sarcasm htt...,sarcasm,diversity job done,[],diversity job done,diversity job done
3,1626007435918905345,they are all the same just change th...,sarcasm,they are all the same just change th...,[],they are all the same just change th...,change clothes wear observation 31 m...
4,1626006869432012801,get my art printed on awesome produc...,sarcasm,get my art printed on awesome produc...,[],get my art printed on awesome produc...,get art printed awesome product supp...


# Extract features from pretrained BERT model



In [6]:
# !pip install transformers

In [7]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary).
# The checkpoint name matches the one passed to BertModel.from_pretrained further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Keep only the tweet id and the cleaned text, with one row per distinct clean_text.
# The original row labels are kept (no index reset), and `labels` is not carried along.
bert_test_data = (
    raw_data
    .loc[:, ['id', 'clean_text']]
    .drop_duplicates(subset='clean_text')
)

bert_test_data.head()

Unnamed: 0,id,clean_text
0,1626008355558035456,it SOME_ENTITY : diversity job done
1,1626008180173213696,SOME_ENTITY for 7 years camp 3 month...
2,1626007897024126976,diversity job done
3,1626007435918905345,they are all the same just change th...
4,1626006869432012801,get my art printed on awesome produc...


## Adding tags for BERT

In [9]:
def _add_bert_tags(tweet):
    """Wrap one tweet in "[CLS] ... [SEP]", leaving already-wrapped text unchanged."""
    # This cell overwrites `clean_text` in place, so without the guard a second
    # run of the cell would wrap the markers around the text a second time.
    if tweet.startswith("[CLS] ") and tweet.endswith(" [SEP]"):
        return tweet
    return "[CLS] " + tweet + " [SEP]"


bert_test_data['clean_text'] = bert_test_data['clean_text'].apply(_add_bert_tags)
bert_test_data.head()

Unnamed: 0,id,clean_text
0,1626008355558035456,[CLS] it SOME_ENTITY : diversity job...
1,1626008180173213696,[CLS] SOME_ENTITY for 7 years camp 3...
2,1626007897024126976,[CLS] diversity job done [SEP]
3,1626007435918905345,[CLS] they are all the same just cha...
4,1626006869432012801,[CLS] get my art printed on awesome ...


## Generate BERT tokens

In [10]:
# Split every tagged tweet into the tokenizer's word pieces.
# In the preview of row 0 the placeholder SOME_ENTITY appears as the pieces
# `some`, `_`, `entity`.
token_lists = bert_test_data['clean_text'].apply(tokenizer.tokenize)
bert_test_data['token_text'] = token_lists

# Map the token strings to their vocabulary indices.
bert_test_data['indexed_tokens'] = bert_test_data['token_text'].apply(tokenizer.convert_tokens_to_ids)

bert_test_data.head()

Unnamed: 0,id,clean_text,token_text,indexed_tokens
0,1626008355558035456,[CLS] it SOME_ENTITY : diversity job...,"[[CLS], it, some, _, entity, :, dive...","[101, 2009, 2070, 1035, 9178, 1024, ..."
1,1626008180173213696,[CLS] SOME_ENTITY for 7 years camp 3...,"[[CLS], some, _, entity, for, 7, yea...","[101, 2070, 1035, 9178, 2005, 1021, ..."
2,1626007897024126976,[CLS] diversity job done [SEP],"[[CLS], diversity, job, done, [SEP]]","[101, 8906, 3105, 2589, 102]"
3,1626007435918905345,[CLS] they are all the same just cha...,"[[CLS], they, are, all, the, same, j...","[101, 2027, 2024, 2035, 1996, 2168, ..."
4,1626006869432012801,[CLS] get my art printed on awesome ...,"[[CLS], get, my, art, printed, on, a...","[101, 2131, 2026, 2396, 6267, 2006, ..."


In [11]:
# BERT expects pairs of sentences for matching. Since we are not doing that, every
# token of a tweet gets the same tag, 1, and the 0 tag is not used.
# NOTE(review): extract_bert_embeddings passes this list as the second positional
# argument of `model(...)`. In Hugging Face transformers that slot is
# `attention_mask`, so these values may be acting as an attention mask rather than
# as segment / token-type ids — confirm against the installed version.

bert_test_data['segment_ids'] = bert_test_data['token_text'].apply(
    lambda tokens: [1 for _ in tokens]
)

bert_test_data.head()

Unnamed: 0,id,clean_text,token_text,indexed_tokens,segment_ids
0,1626008355558035456,[CLS] it SOME_ENTITY : diversity job...,"[[CLS], it, some, _, entity, :, dive...","[101, 2009, 2070, 1035, 9178, 1024, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,1626008180173213696,[CLS] SOME_ENTITY for 7 years camp 3...,"[[CLS], some, _, entity, for, 7, yea...","[101, 2070, 1035, 9178, 2005, 1021, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,1626007897024126976,[CLS] diversity job done [SEP],"[[CLS], diversity, job, done, [SEP]]","[101, 8906, 3105, 2589, 102]","[1, 1, 1, 1, 1]"
3,1626007435918905345,[CLS] they are all the same just cha...,"[[CLS], they, are, all, the, same, j...","[101, 2027, 2024, 2035, 1996, 2168, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
4,1626006869432012801,[CLS] get my art printed on awesome ...,"[[CLS], get, my, art, printed, on, a...","[101, 2131, 2026, 2396, 6267, 2006, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."


In [12]:
# Load the pre-trained weights; output_hidden_states=True makes the model
# return the hidden states of all layers as well.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Switch to "evaluation" mode (feed-forward operation only).
model.eval()
    

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
# Use the model to extract the information from final layers

def extract_bert_embeddings(indexed_tokens, segment_ids):
    '''
    Return one embedding vector for a single tokenised tweet.

    The tweet is run through the module-level `model` as a batch of one. The
    embedding is the mean over dim 0 of the second-to-last entry of the hidden
    states the model returns, taken for the first (only) item of the batch.

    Parameters
    ----------
    indexed_tokens : list of int
        Vocabulary ids of the tweet's tokens.
    segment_ids : list of int
        One value per token. It is forwarded to the model as `attention_mask`
        (see the comment at the call).

    Returns
    -------
    list of float
        The averaged vector, converted with `Tensor.tolist()`.
    '''
    # The outer list adds the batch dimension
    tokens_tensor = torch.tensor([indexed_tokens])
    segment_tensor = torch.tensor([segment_ids])

    with torch.no_grad():
        # This call used to pass `segment_tensor` positionally. In Hugging Face
        # transformers the second positional parameter of BertModel.forward is
        # `attention_mask`, not `token_type_ids`, so that is where the values
        # were already going. Naming the keyword keeps the result unchanged and
        # makes the binding visible; `token_type_ids` is left at the model's
        # default.
        outputs = model(tokens_tensor, attention_mask=segment_tensor)

        # Because the model was created with `output_hidden_states = True`, the
        # third item of the output holds the hidden states from all layers. See
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]

        # Mean of the second-to-last layer over the tokens of the tweet.
        # Can tweak this to get the mean of the last 4 layers or a concatenation.
        embedding = torch.mean(hidden_states[-2][0], dim = 0)

    return embedding.tolist()



In [14]:
# bert_test_data.assign(bert_embedding_1_layer = lambda info: extract_bert_embeddings(info['indexed_tokens'], info['segment_ids']))
# bert_test_data.head()


# One call to extract_bert_embeddings per row, in row order
bert_word_embeddings = [
    extract_bert_embeddings(token_ids, tag_ids)
    for token_ids, tag_ids in zip(bert_test_data['indexed_tokens'], bert_test_data['segment_ids'])
]

# Save the nested list as numpy array for easy usage
bert_word_embeddings = np.array(bert_word_embeddings)

In [15]:
# Export the numpy array
# It needs to be converted to a pandas dataframe for the parquet export
# NOTE(review): only the embedding columns are written. The frame gets a fresh
# default index and no `id` column, while bert_test_data keeps its original row
# labels after drop_duplicates — confirm how downstream code matches embeddings
# back to tweets.

temp = pd.DataFrame(bert_word_embeddings)

# Prefix every positional column label, e.g. 0 -> "bert_emb_0"
temp_col_names = ["bert_emb_" + str(position) for position in temp.columns.values]
temp.columns = temp_col_names

temp.to_parquet('/kaggle/working/bert_embeddings_768.parquet')

In [16]:
# Save the embeddings for easy access
# Import code for resume
