In [6]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd

In [3]:
#
# load glove embeddings
#
embeddings_dict = {}
embedding_size=200
with open(f"../data/embeddings/glove.twitter.27B.{embedding_size}d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector


In [7]:
import re
import json

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    #text = (r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()


if __name__ == '__main__':
    processed = []
    lables = []
    tweets = pd.read_csv("../text_twitter_raw.csv")
    tweets = tweets.sample(1700)
    for _,row in tweets.iterrows():
        tokens = tokenize(row['text'])
        lables.append(row['label'])
        processed.append(tokens)

['<user> for my small org, it works amazingly well. has everything to do with configuration. i love the vcs <allcaps> integrations.',
 '<user> i think so <sadface>',
 'suck it kat, you fucking cunt. <hashtag> mkr',
 "rt <allcaps> <user> <user> a self appointed vigilante for feminism yet shares an article that proclaims men aren't needed. you dont unde…",
 "kat that's karma <allcaps> b!tch <allcaps> 🔥🔥🔥🔥 <hashtag> mkr",
 'rt <allcaps> <user> call me sexist but i get annoyed by women anchors on sportscenter',
 'what could you have possibly modeled <hashtag> mkr',
 '<user> <user> unrelated to highlander.  i also prefer fletch lives over the first for same reason as gb <allcaps> <number>  funnier, bottom line.',
 '<user> but the truth is that mohammed followers were worthless starving thugs that produced nothing and so wanted the riches of khybar.',
 "<user> hi you're rad."]

In [8]:
#
# load english spacy model and add tokenizer special cases for special tokens
#
from spacy.symbols import ORTH, NORM, POS
import spacy

nlp = spacy.load("en_core_web_sm")

ignore = [u'<user>',u'<allcaps>',u'<hashtag>',u'<url>',u'<smile>',u'<lolface>',u'<sadface>',u'<neutralface>',u'<heart>',
          u'<number>',u'<repeat>',u'<elong>']

for i in ignore:
    nlp.tokenizer.add_special_case(i,[{ORTH: i,NORM: i}])

In [19]:
#
# train test split
#
from sklearn.model_selection import train_test_split

train_X,test_X,train_y,test_y= train_test_split(processed,lables,test_size=200/1700)

In [20]:
#
#count frequency of each token of train set
#

DF = {}
tokenized = nlp.pipe(train_X)
N=0
for tweet in tokenized:
    for w in tweet:
        try:
            DF[w.text].add(N)
        except:
            DF[w.text] = {N}
    N+=1


In [27]:
from collections import Counter    
#
# calculate tf_idf for document token pair
#
tf_idf_dict = {}
tokenized = nlp.pipe(train_X)
for tweet in tokenized:
    tokens = [t.text for t in tweet]
    counter = Counter(tokens)
    for token in np.unique(tokens):
        tf = counter[token]/(len(DF.keys()))
        df = len(DF[token])
        idf = np.log(N/(df+1))
        tf_idf_dict[tweet.text, token] = tf*idf

In [36]:
#
# 1st approach randomizing embedding vector
# tf weighted
#
# 2nd approach skipping oov
#

X_train_embedded = []
X_test_embedded = []

tokenized_train = nlp.pipe(train_X)
tokenized_test = nlp.pipe(test_X)

for tweet in tokenized_train:
    w_sum = 0.0
    tfi_sum =0.0
    for t in tweet:
        if t.text in embeddings_dict:
            w_sum+=tf_idf_dict[tweet.text,t.text]*embeddings_dict[t.text]
        else:
            w_sum+=tf_idf_dict[tweet.text,t.text]*(np.random.rand(embedding_size)-2*np.random.rand(embedding_size))
        tfi_sum += tf_idf_dict[tweet.text,t.text]
    X_train_embedded.append(w_sum/tfi_sum)

for tweet in tokenized_test:
    w_sum = 0.0
    tfi_sum =0.0
    tokens = [t.text for t in tweet]
    counter = Counter(tokens)
    for t in tweet:
        tf = counter[t.text]/(len(DF.keys())+1)
        df = len(DF[t.text]) if t.text in DF else 1
        idf = np.log(N/(df+1))
        tf_idf = tf*idf
        if t.text in embeddings_dict:
            w_sum+=tf_idf*embeddings_dict[t.text]
        else:
            w_sum+=tf_idf*(np.random.rand(embedding_size)-2*np.random.rand(embedding_size))
        tfi_sum += tf_idf
    X_test_embedded.append(w_sum/tfi_sum)

In [41]:
#
# 1st approach randomizing embedding vector
# tf weighted
#
# 2nd approach skipping oov
#

X_train_embedded = []
X_test_embedded = []

tokenized_train = nlp.pipe(train_X)
tokenized_test = nlp.pipe(test_X)

for tweet in tokenized_train:
    summ = 0.0
    N=0
    for t in tweet:
        if t.text in embeddings_dict:
            summ+=embeddings_dict[t.text]
        else:
            summ+=(np.random.rand(embedding_size)-2*np.random.rand(embedding_size))
        N += 1
    X_train_embedded.append(summ/N)

for tweet in tokenized_test:
    summ = 0.0
    N=0
    for t in tweet:
        if t.text in embeddings_dict:
            summ+=embeddings_dict[t.text]
        else:
            summ+=(np.random.rand(embedding_size)-2*np.random.rand(embedding_size))
        N += 1
    X_test_embedded.append(summ/N)

In [319]:
#print(tweet_embeddings[:20])

In [50]:
#
#grid search for best parameters
#
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
import pandas as pd
base_y=2
base_n=2
def score_for_weights(y,n):
    clf = SVC(kernel='rbf',class_weight={'y': y,'n':n})
    clf.fit(train_X,train_y)
    return(clf.score(test_X,test_y))

f1_scorer = make_scorer(f1_score, pos_label="positive")

weights = []
for y_w in range(2,6):
    for n_w in range(2,6):
        weights.append({'y':y_w,'n':n_w})

clf = SVC(kernel='linear',class_weight='balanced')
clf.fit(X_train_embedded,train_y)

SVC(class_weight='balanced', kernel='linear')

In [54]:
f1_score(clf.predict(X_test_embedded),test_y,pos_label="positive")
#clf.predict(test_X)

0.7088607594936709

In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, clf.predict(X_test_embedded))

0.77

In [324]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_X, train_y)
print("Dummy classifier:")
print(f"F1: {f1_scorer(dummy_clf,X_test_embedded,test_y)}, accuracy: {accuracy_score(test_y, dummy_clf.predict(X_test_embedded))}")

print("SVM (rbf,balance weighted, C=1):")
print(f"F1: {f1_scorer(clf,X_test_embedded,test_y)} accuracy: {accuracy_score(test_y,clf.predict(X_test_embedded))}")

f1s.append(f1_scorer(clf,X_test_embedded,test_y))
accs.append(accuracy_score(test_y, clf.predict(X_test_embedded)))

Dummy classifier:
F1: 0.0, accuracy: 0.692
SVM (rbf,balance weighted, C=1):
F1: 0.6234509056244042 accuracy: 0.7366666666666667


## Adding char2vec embeddings

In [1]:
#
# adding char2vec representations
#
#from keras.preprocessing import text
import chars2vec


c2v_model = chars2vec.load_model('eng_200')

#print(c2v_model.vectorize_words(['bla']))

c2v_summed=[]
for tweet in processed:
    
    embedded = c2v_model.vectorize_words([w for w in tweet]).mean(axis=0)
    c2v_summed.append(embedded)
        
        


AttributeError: module 'keras.utils.generic_utils' has no attribute 'populate_dict_with_module_objects'

In [326]:
c2v_glove_emb = [glove + c2v for glove,c2v in zip(tweet_embeddings,c2v_summed)]
train_X,test_X,train_y,test_y= train_test_split(tweet_embeddings,lables,test_size=0.3)

In [327]:
parameters = {'kernel':['linear'], 'C':[1],'class_weight':['balanced']}
svc = SVC(class_weight='balanced')
clf = GridSearchCV(svc, parameters,scoring=f1_scorer)
clf.fit(train_X,train_y)
print(clf.best_params_)

{'C': 1, 'class_weight': 'balanced', 'kernel': 'linear'}


In [328]:
print(clf.score(test_X,test_y))
print(accuracy_score(test_y, clf.predict(test_X)))
f1s_c2v.append(clf.score(test_X,test_y))
accs_c2v.append(accuracy_score(test_y, clf.predict(test_X)))

0.6561484918793504
0.753


In [329]:
print(len(f1s))

10


In [332]:
print(f"Glove only f1: {np.array(f1s).mean()}")
print(f"Glove only acc: {np.array(accs).mean()}")
print(f"Glove+c2v f1: {np.array(f1s_c2v).mean()}")
print(f"Glove+c2v acc: {np.array(accs_c2v).mean()}")
      

Glove only f1: 0.6439712024681012
Glove only acc: 0.7435666666666666
Glove+c2v f1: 0.6500983826205389
Glove+c2v acc: 0.7495666666666667


In [45]:
from sklearn.preprocessing import normalize

word_rac = [('muslim', 604), ('islam', 597), ('murder', 192), ('mohamme', 175), ('isis', 159), ('religion', 151), ('woman', 147), ('people', 142), ('prophet', 135), ('quran', 131), ('year', 120), ('jew', 108), ('rape', 98), ('kill', 92), ('war', 92), ('christian', 90), ('lie', 86), ('terrorist', 86), ('say', 85), ('want', 84), ('world', 84), ('slave', 81), ('get', 80), ('make', 80), ('tell', 80), ('know', 72), ('man', 70), ('hate', 69), ('child', 68), ('go', 64), ('follow', 63), ('hatred', 61), ('attack', 60), ('hadith', 58), ('declare', 58), ('humanity', 58), ('time', 57), ('slavery', 56), ('try', 56), ('use', 54), ('see', 53), ('think', 52), ('islamic', 52), ('pedophile', 51), ('never', 47), ('sexist', 45), ('look', 45), ('give', 44), ('stupid', 44), ('israel', 44), ('read', 43), ('idiot', 43), ('behead', 43), ('bigotry', 42), ('right', 42), ('amp', 41), ('force', 41), ('palestinian', 41), ('exterminate', 40), ('non', 40), ('ago', 40), ('live', 39), ('country', 38), ('care', 38), ('state', 38), ('good', 37), ('stop', 37), ('leave', 36), ('exactly', 36), ('call', 35)]
word_sex = [('sexist', 954), ('woman', 617), ('girl', 373), ('kat', 350), ('man', 260), ('female', 250), ('call', 242), ('get', 234), ('think', 190), ('make', 157), ('go', 155), ('say', 149), ('know', 148), ('see', 122), ('right', 120), ('want', 119), ('feminist', 112), ('really', 111), ('sport', 105), ('hate', 99), ('amp', 97), ('good', 96), ('well', 96), ('football', 95), ('bitch', 95), ('look', 94), ('need', 92), ('face', 77), ('time', 76), ('fuck', 75), ('never', 74), ('blonde', 74), ('even', 73), ('people', 72), ('take', 72), ('cook', 71), ('watch', 71), ('drive', 70), ('ever', 70), ('thing', 68), ('funny', 67), ('shit', 66), ('guy', 66), ('talk', 65), ('feminism', 64), ('andre', 63), ('play', 62), ('lol', 62), ('way', 61), ('male', 61), ('bad', 60), ('stop', 58), ('pretty', 55), ('mean', 55), ('still', 55), ('nikki', 54), ('work', 53), ('driver', 52), ('come', 51), ('women', 51), ('much', 50), ('dumb', 50), ('tell', 49), ('find', 48), ('stand', 48), ('game', 47), ('actually', 47), ('job', 45), ('suck', 45), ('katie', 45)]
lda_dict = {}
for w in word_rac+word_sex:
    if(w[0] in lda_dict and lda_dict[w[0]]>=w[1]):
        pass
    else:
        lda_dict[w[0]]=w[1]

for pos,k in enumerate(lda_dict.keys()):
    lda_dict[k]=(lda_dict[k],pos)

for tweet in nlp.pipe(processed):
    features = np.zeros(len(lda_dict))
    for w in tweet:
        if(w.lemma_ in lda_dict):
            word =  lda_dict[w.lemma_]
            features[word[1]]+=word[0]
    features=normalize(features.reshape(-1,1))
    print(features[:,0].shape)
    


(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)

(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)
(118,)