In [1]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet
import string
from sklearn.model_selection import KFold
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')

# initialize the document embeddings, mode = mean
doc_embed = DocumentPoolEmbeddings([glove_embedding])

In [4]:
def embedding(sentence):
    sentence = Sentence(sentence)
    doc_embed.embed(sentence)
    np_tens = sentence.embedding.numpy().tolist()
    tf_tensor = tf.convert_to_tensor(np_tens)
    return tf_tensor

In [5]:
df = pd.read_csv(r"olid-training-v1.0.tsv", sep="\t")

In [6]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH
13238,27429,@USER Pussy,OFF,UNT,


In [7]:
df['tweet'] = df['tweet'].str.replace('@USER','')

In [8]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [9]:
# Removing hyperlinks
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

In [10]:
df['tweet'] = df['tweet'].apply(hyperlink)

In [11]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [12]:
# Removing retweets
def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

In [13]:
df['tweet'] = df['tweet'].apply(retweets)

In [14]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [15]:
def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

In [16]:
df["tweet"] = df["tweet"].apply(split_hashtag)

In [17]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [18]:
# Splitting joined words
def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

In [19]:
df['tweet'] = df['tweet'].apply(join_words)

In [20]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [21]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [22]:
df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


In [23]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to ...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move i...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [24]:
df['tweet'] = df['tweet'].str.strip()

In [25]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what the...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to a...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and t...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [26]:
df['tweet'] = df['tweet'].str.lower()

In [27]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [28]:
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

In [29]:
def token(tweet):
    return tokenizer.tokenize(tweet)

In [30]:
df['tweet'] = df['tweet'].apply(token)

In [31]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[she, should, ask, a, few, native, americans, ...",OFF,UNT,
1,90194,"[go, home, you, ’, re, drunk, maga, trump, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, is, investigating, chinese, employees...",NOT,,
3,62688,"[someone, shouldve, taken, this, piece, of, sh...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, to, m...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, i, get, strong, vibes, from, peopl...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, not, to...",NOT,,
13237,82921,"[and, why, report, this, garbage, we, dont, gi...",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [32]:
stop_words = set(stopwords.words('english'))

In [33]:
def remove(tweet):
    return [i for i in tweet if i not in stop_words]

In [34]:
df['tweet'] = df['tweet'].apply(remove)

In [35]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, americans, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employees, se...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, move,...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibes, people, man, ’...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [36]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [37]:
def get_wordnet_POS(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(word, wordnet.NOUN)


In [38]:
def lemm(tweet):
    return [lemmatizer.lemmatize(i, get_wordnet_POS(i)) for i in tweet]

In [39]:
df['tweet'] = df['tweet'].apply(lemm)

In [40]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, american, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employee, sel...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberal, amp, illegals, move, ...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibe, people, man, ’,...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [41]:
df = df.drop(['subtask_b', 'subtask_c', 'id'], axis=1)

In [42]:
df

Unnamed: 0,tweet,subtask_a
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [43]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [44]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [45]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [46]:
df["Offensive"] = df["Offensive"].apply(off)

In [47]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [48]:
rows_to_drop = []

for i in range(len(df)):
    try:
        df.iloc[i]['tweet'] = embedding(df.iloc[i]['tweet'])
    except:
        rows_to_drop.append(i)

print(rows_to_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[i]['tweet'] = embedding(df.iloc[i]['tweet'])


[847, 995, 1549, 2709, 2734, 2927, 3315, 3418, 3840, 4241, 4422, 5154, 5175, 6425, 7310, 7381, 7634, 7816, 8008, 8177, 8304, 8652, 8653, 8871, 9567, 9713, 9833, 9925, 10095, 10103, 10373, 10685, 10710, 10718, 10726, 11010, 11268, 11609, 11618, 11635, 11667, 11679, 12615, 12872]


In [49]:
df.drop(rows_to_drop, inplace=True)

In [50]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [51]:
df["tweet"] = df["tweet"].apply(embedding)

In [52]:
df

Unnamed: 0,tweet,Offensive
0,"(tf.Tensor(-0.004631497, shape=(), dtype=float...",1
1,"(tf.Tensor(-0.080776, shape=(), dtype=float32)...",1
2,"(tf.Tensor(-0.021361161, shape=(), dtype=float...",0
3,"(tf.Tensor(-0.12938271, shape=(), dtype=float3...",1
4,"(tf.Tensor(-0.09563908, shape=(), dtype=float3...",0
...,...,...
13235,"(tf.Tensor(0.10650693, shape=(), dtype=float32...",1
13236,"(tf.Tensor(0.024536252, shape=(), dtype=float3...",0
13237,"(tf.Tensor(-0.28171122, shape=(), dtype=float3...",1
13238,"(tf.Tensor(0.25839, shape=(), dtype=float32), ...",1


In [53]:
type(df.iloc[0]['tweet'])

tensorflow.python.framework.ops.EagerTensor

In [54]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [55]:
type(X_train.iloc[0])

tensorflow.python.framework.ops.EagerTensor

In [56]:
y_train

6740     0
9789     1
539      0
195      0
10577    0
        ..
4303     0
5494     0
5284     1
3192     0
11847    0
Name: Offensive, Length: 9897, dtype: int64

In [57]:
maxim = 0
for i in range(len(df)):
    if len(df.iloc[i]['tweet']) > maxim:
        maxim = len(df.iloc[i]['tweet'])

maxim

100

In [58]:
def create_model():
    model = Sequential()
    model.add(Embedding(8000, 32, input_length=100))
    model.add(LSTM(20))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [59]:
model = create_model()

In [60]:
model.fit(X_train, y_train, epochs=100, batch_size = 64, validation_data=(X_test, y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type tensorflow.python.framework.ops.EagerTensor).