# _Generate `distilbert` Embeddings for Sample_

In [4]:
from pathlib import Path
import pandas as pd

In [22]:
#!pip install -U sentence-transformers
#!pip install emoji

In [23]:
def experiments_path():
    '''
    Return the parent directory of the current working directory.
    '''
    working_dir = Path.cwd()
    return working_dir.parent

## _Load in Data_

In [24]:
# parent of the current working directory; the data path below is built from it
path = experiments_path()

In [25]:
# Load the sample tweets from <path>/playground_data.
# The two id columns are forced to `str` through `dtype` (presumably so the
# ids are kept as text rather than parsed as numbers — confirm if changed).
df = pd.read_json(
    path / 'playground_data' / 'sample_data_2020_05_27.json',
    orient='columns',
    dtype={
        'id_str': str,
        'user_id_str': str
    }
)

In [26]:
# print a summary of the loaded sample: columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   created_at   1000 non-null   datetime64[ns]
 1   id_str       1000 non-null   object        
 2   user_id_str  1000 non-null   object        
 3   lang         1000 non-null   object        
 4   full_text    1000 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 46.9+ KB


## _Apply Text Normalization_

In [27]:
from sentence_transformers import SentenceTransformer
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
import numpy as np
import scipy
import pytest


# module-level NLTK tweet tokenizer, used by normalizeTweet below
tokenizer = TweetTokenizer()


def normalizeToken(token):
    '''
    Map a single token to its normalized form.

    Rules, checked in this order:
      * a token starting with "@" becomes "@USER"
      * a token starting with "http" or "www" (case-insensitive) becomes
        "HTTPURL"
      * the typographic apostrophe becomes an ASCII apostrophe and the
        single-character ellipsis becomes three ASCII dots
      * any other single-character token is passed through `demojize`
      * everything else is returned unchanged
    '''
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # These two checks must run before the single-character branch: both
    # symbols have length 1, so testing them after `len(token) == 1` left
    # them unreachable and the symbols were returned unconverted.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token


# Literal substitutions applied, in this exact order, to the space-joined
# token string. Order matters: each entry operates on the output of the
# previous ones (e.g. "ca n't" only exists after "n't " has been split off).
_LITERAL_REWRITES = (
    # negations
    ("cannot ", "can not "),
    ("n't ", " n't "),
    ("n 't ", " n't "),
    ("ca n't", "can't"),
    ("ai n't", "ain't"),
    # clitics
    ("'m ", " 'm "),
    ("'re ", " 're "),
    ("'s ", " 's "),
    ("'ll ", " 'll "),
    ("'d ", " 'd "),
    ("'ve ", " 've "),
    # a.m. / p.m. split apart by the tokenizer
    (" p . m .", "  p.m."),
    (" p . m ", " p.m "),
    (" a . m .", " a.m."),
    (" a . m ", " a.m "),
)

# Regex substitutions that re-join digit groups separated by ",", "/" or "-".
_NUMERIC_REWRITES = (
    (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
    (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
    (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
)


def normalizeTweet(tweet):
    '''
    Normalize the raw text of one tweet and return it as a single string.

    The typographic apostrophe and ellipsis are replaced by ASCII
    equivalents, the text is split with the module-level `tokenizer`, every
    token goes through `normalizeToken`, and the tokens are joined with
    spaces. The ordered literal and regex rewrites are then applied, and
    finally all whitespace runs are collapsed to single spaces.
    '''
    ascii_punct = tweet.replace("’", "'").replace("…", "...")
    text = " ".join(map(normalizeToken, tokenizer.tokenize(ascii_punct)))

    for old, new in _LITERAL_REWRITES:
        text = text.replace(old, new)

    for pattern, replacement in _NUMERIC_REWRITES:
        text = re.sub(pattern, replacement, text)

    # split/join collapses any repeated spaces introduced by the rewrites
    return " ".join(text.split())

In [28]:
# apply text normalization to every tweet, then lower-case the result
# NOTE(review): lower-casing also turns the "@USER" / "HTTPURL" placeholders
# emitted by normalizeToken into "@user" / "httpurl" — confirm this is intended.
df['bert_tweet'] = df['full_text'].apply(normalizeTweet).str.lower()

## _Embeddings_

In [30]:
def create_embedding_model(model_name):
    '''
    Build and return a SentenceTransformer for the given model name.

    `model_name` is passed unchanged to the SentenceTransformer constructor;
    the returned object is what embeddings are later encoded with.
    '''
    return SentenceTransformer(model_name)


def generate_embeddings(model, df, column_name='bert_tweet'):
    '''
    Encode the text of one DataFrame column and return the embeddings.

    Parameters:
        model: object exposing `encode(sentences)`, e.g. the
            SentenceTransformer returned by `create_embedding_model`
        df: pandas DataFrame holding the text to encode
        column_name: name of the column in `df` to encode
            (defaults to 'bert_tweet')

    Returns:
        Whatever `model.encode` returns for the column's values, taken in
        row order. No normalization of the embeddings is applied here, and
        the DataFrame itself is neither modified nor returned.
    '''
    # Hand the model a plain list instead of the pandas Series: integer
    # indexing on a Series is label-based, so positional access inside
    # `encode` would fail or misalign whenever the index is not 0..n-1
    # (e.g. after filtering or sampling rows).
    tweets = df[column_name].tolist()
    return model.encode(tweets)

In [32]:
# build the SentenceTransformer for the 'distilbert-base-nli-stsb-mean-tokens' model
distilbert = create_embedding_model('distilbert-base-nli-stsb-mean-tokens')

In [33]:
# encode the default 'bert_tweet' column of the sample with the distilbert model
tweet_embeddings = generate_embeddings(distilbert, df)

In [35]:
# sanity check: number of embeddings produced for the sample
len(tweet_embeddings)

1000