## FastText embedding features in single CRF layer

In [33]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
from ast import literal_eval
import numpy as np
from sklearn.metrics import classification_report


# Shared NLTK TweetTokenizer instance; normalizeTweet uses it to tokenize raw tweets.
tweet_tokenizer = TweetTokenizer()

# https://huggingface.co/vinai/bertweet-base
def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Parameters
    ----------
    token : str
        One token, as produced by the tweet tokenizer.

    Returns
    -------
    str
        "@USER" for tokens starting with "@", "HTTPURL" for tokens starting
        with "http" or "www" (case-insensitive), "'" / "..." for the
        typographic apostrophe / ellipsis character, the ``demojize`` text
        for any other single-character token, and the token itself otherwise.
    """
    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"

    # These two checks must run before the single-character branch: both
    # symbols have length 1, so testing the length first made them unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)
    return token

        
def normalizeTweet(tweet):
    """Tokenize a raw tweet and return it as one normalized, space-joined string.

    The typographic apostrophe and ellipsis are replaced first, every token
    is passed through ``normalizeToken``, and a fixed sequence of string and
    regex substitutions is then applied to contractions, a.m./p.m. and
    number groups. Whitespace is collapsed to single spaces at the end.
    """
    ascii_punct = tweet.replace("’", "'").replace("…", "...")
    pieces = [normalizeToken(piece) for piece in tweet_tokenizer.tokenize(ascii_punct)]
    normTweet = " ".join(pieces)

    # Applied strictly in this order: later rules rely on earlier ones.
    literal_rules = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rules:
        normTweet = normTweet.replace(old, new)

    # Re-attach number groups that the tokenizer separated with spaces.
    regex_rules = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in regex_rules:
        normTweet = re.sub(pattern, replacement, normTweet)

    return " ".join(normTweet.split())


def split_into_sentences(text):
    """Split a tweet into a list of non-empty, stripped sentences.

    A sentence ends at ".", "?" or "!". Sentence-final punctuation that sits
    just inside a closing quote is moved outside the quote first. Dots are
    shielded in pairs, so ".." does not end a sentence, while an odd-length
    run such as "..." still does.
    """
    stop_marker = "<stop>"
    dot_pair_marker = "<POINTPOINT>"

    padded = (" " + text + "  ").replace("\n", " ")

    # Move the terminator outside a closing quote: ." -> ". and so on.
    quote_swaps = ((".”", "”."), (".\"", "\"."), ("!\"", "\"!"), ("?\"", "\"?"))
    for inside, outside in quote_swaps:
        padded = padded.replace(inside, outside)

    # Hide dot pairs while single dots are tagged, then restore them.
    padded = padded.replace("..", dot_pair_marker)
    padded = padded.replace(".", "." + stop_marker)
    padded = padded.replace(dot_pair_marker, "..")
    for terminator in ("?", "!"):
        padded = padded.replace(terminator, terminator + stop_marker)

    stripped = (chunk.strip() for chunk in padded.split(stop_marker))
    return [chunk for chunk in stripped if chunk != ""]




In [None]:
dataPath = "data/cause_effect_sentences_with_IO_tags.csv"
# The two list-valued columns are stored as Python literals; parse them while reading.
data = pd.read_csv(
    dataPath,
    sep=";",
    converters={column: literal_eval for column in ("tokenized", "bio_tags")},
)
print(data.shape)
data.head()

In [None]:
# Print the first 20 rows: the sentence, then its tokens and its per-token tags.
for i, row in data.iloc[:20].iterrows():
    print("\n", row["sentence"])
    for column in ("tokenized", "bio_tags"):
        print(row[column])

In [1]:
################# Take only sentences with both cause and effect

has_cause = data["Cause"].notna()
has_effect = data["Effect"].notna()
data_sentences = data[has_cause & has_effect]

In [11]:
############# Load FastText embeddings (trained on diabetes tweets) #######

from gensim.models import FastText

# Path to a saved gensim FastText model.
# NOTE(review): the file name suggests 300-dimensional vectors ("dim300") - confirm.
we_path = "models/FastText_embeddings/ft_wordembeddings_dim300_minCount5_URL-User-toConstant_iter10_20190703"
# Loaded once here; get_features reads this module-level object.
wordEmbeddings = FastText.load(we_path)

In [None]:
# Fraction of the cause/effect sentences that goes into the training split
train_to_test_ratio = 0.9

# Seeded random sample for training; every row that was not sampled is test data
train = data_sentences.sample(frac=train_to_test_ratio, random_state=0)
test = data_sentences.drop(train.index)

for split_label, split_frame in (("Train:", train), ("Test:", test)):
    print(split_label, split_frame.shape)
train.head()

In [26]:
########## Create features for conditional random field (CRF) #######

def get_features(word):
    """Return the FastText embedding of ``word`` (lower-cased before lookup).

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    numpy.ndarray
        The embedding vector from ``wordEmbeddings``, or an all-zero vector
        with the embedding dimensionality when the lookup raises ``KeyError``.
    """
    # A full gensim model exposes its vectors under `.wv`; indexing the model
    # object itself is deprecated in gensim 3.x and unsupported in 4.x. Fall
    # back to the object itself in case bare KeyedVectors were loaded.
    keyed_vectors = getattr(wordEmbeddings, "wv", wordEmbeddings)
    try:
        return keyed_vectors[word.lower()]
    except KeyError:
        # Only a missing vector is handled here. Any other error (wrong model
        # object, bad path, ...) must surface instead of silently producing
        # all-zero features.
        return np.zeros(keyed_vectors.vector_size)

def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of str
        Tokens of one sentence.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Hand-crafted surface features of the token plus one entry
        ``'v0'``, ``'v1'``, ... per component of its word embedding.
    """
    word = sent[i]
    wordembedding = get_features(word)   ## word embedding vector

    # features to return
    # TODO: add / remove features
    features = {
        # 'bias': 1.0,
        'word.lower()': word.lower(),
        # 'word[-3:]': word[-3:],
        # 'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        # 'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        # 'postag': postag,
        # 'postag[:2]': postag[:2],
        'wordlength': len(word),
        # Slice instead of word[0] so that an empty token yields False
        # instead of raising IndexError.
        'wordinitialcap': word[:1].isupper(),
        'wordmixedcap': any(char.isupper() for char in word[1:]),
        'wordallcap': all(char.isupper() for char in word),
        'distfromsentbegin': i
    }

    # One feature per embedding component, stored as plain Python floats so
    # that the feature dict does not carry numpy scalar types.
    features.update(
        ('v{}'.format(iv), float(value)) for iv, value in enumerate(wordembedding)
    )

    return features


def sent2features(sent):
    """Return a list with one feature dict per token of the sentence."""
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features



# X: one list of token feature dicts per sentence; y: the matching tag sequences
X_train = list(map(sent2features, train["tokenized"].tolist()))
y_train = list(train["bio_tags"])


X_test = list(map(sent2features, test["tokenized"].tolist()))
y_test = list(test["bio_tags"])



  vector=wordEmbeddings[word]


In [None]:
######## TRAIN model #########
import sklearn_crfsuite

%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)   ### Error message when try to train


In [29]:
####### Test set predictions ########
# One predicted tag sequence per test sentence, produced by the fitted CRF.
predictions = crf.predict(X_test)

In [2]:
####### Performance measures ############

# Flatten the per-sentence tag sequences so the report is computed per token.
test_true_tag = list(np.concatenate(y_test))
test_predict_tag = list(np.concatenate(predictions))
print(classification_report(test_true_tag, test_predict_tag))

TODO: Change the features in the function word2features and play with the model parameters to beat this baseline:

              precision    recall  f1-score   support

         I-C       0.59      0.57      0.58       495
         I-E       0.45      0.38      0.41       487
           O       0.92      0.94      0.93      4159

    accuracy                           0.85      5141
    macro avg      0.65      0.63      0.64      5141
    weighted avg   0.84      0.85      0.84      5141

In [None]:
########## Save your best model ############
#joblib.dump(".....")

## Apply on diabetes tweets

Consider normalizing and splitting into sentences

In [38]:
########## Load diabetes tweets ##############

# Read the tweets and keep a seeded random subsample of 1000 rows.
diabetes_tweets = (
    pd.read_csv("data/diabetes_tweets_normalized.csv", sep=";")
    .sample(n=1000, random_state=55)
)
print(diabetes_tweets.shape)
diabetes_tweets.head()

(1000, 1)


Unnamed: 0,text
13310,@USER w / r to this particular point Jeremy as...
42148,"This is the same parent who , when I drank loa..."
48035,When ur blood sugar :syringe: is 46 and u feel...
45706,@USER You really need to go . More people die ...
949,That 's right ! No one said rice does n't fill...


In [39]:
####### SPLIT TWEETS INTO SENTENCES ######################

# Normalize each tweet, split it into sentences, then give every sentence its
# own row (the tweet's index label is repeated for each of its sentences).
diabetes_sentences = (
    diabetes_tweets["text"]
    .map(normalizeTweet)
    .map(split_into_sentences)
    .explode()
)
print("tweets to sentences:", diabetes_sentences.shape[0])


tweets to sentences: 2565
['@USER w / r to this particular point Jeremy as a T1 diabetic ( 47yrs ) I carry ID in my wallet to identify mys ...'
 'HTTPURL'
 'This is the same parent who , when I drank loads of tea BECAUSE TEA IS PENG , assumed I was pre diabetic and “ thirsty ...'
 'HTTPURL'
 'When ur blood sugar : syringe : is 46 and u feel like eating the entire pantry : woozy_face : #diabeticproblems #type1 #diabetes'
 '@USER You really need to go .'
 'More people die of diabetes every year .'
 'Less would have died had you not put cov ...' 'HTTPURL'
 "That ' s right !"]


In [40]:
######### Exclude questions and sentences with 5 or fewer tokens #################

diabetes_sentences_filtered = diabetes_sentences[diabetes_sentences.str.split(" ").str.len() > 5] # keep only sentences with more than 5 space-separated tokens
# Drop sentences that end with a question mark
diabetes_sentences_filtered = diabetes_sentences_filtered[~diabetes_sentences_filtered.str.endswith("?")]
print("N sentences with > 5 words & no question:", diabetes_sentences_filtered.shape)

diabetes_sentences_filtered_df = diabetes_sentences_filtered.to_frame("sentences")
# NOTE(review): the sentences were already passed through normalizeTweet before
# splitting, so this normalizes them again - confirm that this is intended.
diabetes_sentences_filtered_df["tokenized"] = diabetes_sentences_filtered_df["sentences"].map(lambda tweet: normalizeTweet(tweet).split(" "))
diabetes_sentences_filtered_df.head()
#diabetes_text = diabetes_sentences_filtered.values.tolist()

N sentences with > 5 words & no question: (1464,)


Unnamed: 0,sentences,tokenized
13310,@USER w / r to this particular point Jeremy as...,"[@USER, w, /, r, to, this, particular, point, ..."
42148,"This is the same parent who , when I drank loa...","[This, is, the, same, parent, who, ,, when, I,..."
48035,When ur blood sugar : syringe : is 46 and u fe...,"[When, ur, blood, sugar, :, syringe, :, is, 46..."
45706,@USER You really need to go .,"[@USER, You, really, need, to, go, .]"
45706,More people die of diabetes every year .,"[More, people, die, of, diabetes, every, year, .]"


In [41]:
# Token feature dicts for every filtered diabetes sentence (same featurizer as for training).
diabetes_features = [sent2features(sentence) for sentence in diabetes_sentences_filtered_df.tokenized.values.tolist()]
# NOTE(review): this reports the size of X_train, not of diabetes_features - possibly a copy-paste leftover; confirm.
print("X_train:", len(X_train), len(X_train[0]))

  vector=wordEmbeddings[word]


X_train: 1906 31


In [None]:
# Predicted tag sequence for each filtered diabetes sentence, using the fitted CRF.
diabetes_predictions = crf.predict(diabetes_features)

In [None]:
# Print every token next to its predicted tag, one blank-line-separated group
# per sentence. diabetes_sentences_filtered_df only holds the sentences and
# their tokens, so there is no gold tag to show; the previous reference to the
# undefined name `true_label` raised a NameError.
for tokens, predicts in zip(diabetes_sentences_filtered_df.tokenized, diabetes_predictions):
    print("\n")
    for token, predic in zip(tokens, predicts):
        print(token, "predic:", predic)

## Apply on personal cancer tweets

In [None]:
## TODO: Change Path to Causal cancer sentences

######### LOAD cancer tweets #################


# pandas is already imported in the first cell; this repeated import is redundant.
import pandas as pd

# Alternatively, take the smaller file: data/causal_cancer_sentences_personal_subsample.csv
tweets_cancer = pd.read_csv("data/causal_cancer_sentences_personal.csv", sep=";")
print(tweets_cancer.shape)
tweets_cancer.head()

In [49]:
############## Save file ##################

# Save DataFrame with text and predicted cause and effect
# NOTE(review): `cancer_cause_effect` is not created in any cell visible here; the
# cell that builds it (presumably from tweets_cancer and the CRF predictions) must
# exist and run first, otherwise this line raises a NameError - confirm.

cancer_cause_effect.to_csv("data/cancer_cause_effect_FastText.csv", sep=";")