In [1]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet
import string
from sklearn.model_selection import KFold
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split, GridSearchCV

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')

# initialize the document embeddings, mode = mean
doc_embed = DocumentPoolEmbeddings([glove_embedding])

In [3]:
def embedding(sentence):
    sentence = Sentence(sentence)
    doc_embed.embed(sentence)
    np_tens = sentence.embedding.numpy()
    #tf_tensor = tf.convert_to_tensor(np_tens)
    return np_tens

In [4]:
df = pd.read_csv(r"olid-training-v1.0.tsv", sep="\t")

In [5]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH
13238,27429,@USER Pussy,OFF,UNT,


In [6]:
df['tweet'] = df['tweet'].str.replace('@USER','')

In [7]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [8]:
# Removing hyperlinks
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

In [9]:
df['tweet'] = df['tweet'].apply(hyperlink)

In [10]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [11]:
# Removing retweets
def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

In [12]:
df['tweet'] = df['tweet'].apply(retweets)

In [13]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [14]:
def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

In [15]:
df["tweet"] = df["tweet"].apply(split_hashtag)

In [16]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [17]:
# Splitting joined words
def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

In [18]:
df['tweet'] = df['tweet'].apply(join_words)

In [19]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [20]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [21]:
df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')

In [22]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to ...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move i...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [23]:
df['tweet'] = df['tweet'].str.strip()

In [24]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what the...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to a...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and t...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [25]:
df['tweet'] = df['tweet'].str.lower()

In [26]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [27]:
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

In [28]:
def token(tweet):
    return tokenizer.tokenize(tweet)

In [29]:
df['tweet'] = df['tweet'].apply(token)

In [30]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[she, should, ask, a, few, native, americans, ...",OFF,UNT,
1,90194,"[go, home, you, ’, re, drunk, maga, trump, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, is, investigating, chinese, employees...",NOT,,
3,62688,"[someone, shouldve, taken, this, piece, of, sh...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, to, m...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, i, get, strong, vibes, from, peopl...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, not, to...",NOT,,
13237,82921,"[and, why, report, this, garbage, we, dont, gi...",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [31]:
stop_words = set(stopwords.words('english'))

In [32]:
def remove(tweet):
    return [i for i in tweet if i not in stop_words]

In [33]:
df['tweet'] = df['tweet'].apply(remove)

In [34]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, americans, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employees, se...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, move,...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibes, people, man, ’...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [35]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [36]:
def get_wordnet_POS(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(word, wordnet.NOUN)


In [37]:
def lemm(tweet):
    return [lemmatizer.lemmatize(i, get_wordnet_POS(i)) for i in tweet]

In [38]:
df['tweet'] = df['tweet'].apply(lemm)

In [39]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, american, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employee, sel...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberal, amp, illegals, move, ...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibe, people, man, ’,...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [40]:
df = df.drop(['subtask_b', 'subtask_c', 'id'], axis=1)

In [41]:
df

Unnamed: 0,tweet,subtask_a
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [42]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [43]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [44]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [45]:
df["Offensive"] = df["Offensive"].apply(off)

In [46]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [47]:
rows_to_drop = []

for i in range(len(df)):
    try:
        df.iloc[i]['tweet'] = embedding(df.iloc[i]['tweet'])
    except:
        rows_to_drop.append(i)

print(rows_to_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


[847, 995, 1549, 2709, 2734, 2927, 3315, 3418, 3840, 4241, 4422, 5154, 5175, 6425, 7310, 7381, 7634, 7816, 8008, 8177, 8304, 8652, 8653, 8871, 9567, 9713, 9833, 9925, 10095, 10103, 10373, 10685, 10710, 10718, 10726, 11010, 11268, 11609, 11618, 11635, 11667, 11679, 12615, 12872]


In [48]:
df.drop(rows_to_drop, inplace=True)

In [49]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [50]:
df["tweet"] = df["tweet"].apply(embedding)

In [51]:
df.iloc[0]['tweet']

array([-0.0046315 ,  0.3850355 ,  0.53388   , -0.081911  , -0.1186325 ,
        0.17782725, -0.34666944, -0.03825501, -0.16063249, -0.0504125 ,
       -0.34632948, -0.29153997,  0.09328249, -0.05276125, -0.15749174,
       -0.17324498,  0.44185752,  0.04509751, -0.72000253,  0.55329   ,
        0.2311575 , -0.045605  ,  0.13985574, -0.0055725 ,  0.24697998,
       -0.081535  , -0.1513125 , -0.95780003,  0.5171975 , -0.0983775 ,
       -0.48116273,  0.50203   , -0.0489075 ,  0.0139003 ,  0.00868   ,
        0.293555  , -0.0516055 ,  0.21422501,  0.3084825 ,  0.00375175,
       -0.8380075 , -0.3673225 , -0.01753975, -0.23527902, -0.1801703 ,
       -0.01202675, -0.0765925 , -0.17686075, -0.1126645 , -0.76581997,
       -0.22583751,  0.13133201,  0.2711975 ,  0.4609725 , -0.12288079,
       -1.7613275 , -0.197709  , -0.4304255 ,  1.768525  ,  0.33898976,
        0.060244  ,  0.75451   ,  0.059105  , -0.33393875,  0.784495  ,
       -0.24248776,  0.14745337,  0.630935  ,  0.18118002, -0.03

In [52]:
type(df.iloc[0]['tweet'])

numpy.ndarray

In [62]:
# This function will apply reshape to every element in tweet

def expanding(item):
    x = np.expand_dims(item, 1)
    x = np.reshape(item, (100, 1, 1))
    return x

In [63]:
df["tweet"] = df["tweet"].apply(expanding)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [56]:
maxim = 0
for i in range(len(df)):
    if len(df.iloc[i]['tweet']) > maxim:
        maxim = len(df.iloc[i]['tweet'])

maxim

100

In [57]:
def create_model():
    model = Sequential()
    model.add(Embedding(8000, 32, input_length=1))
    model.add(LSTM(20))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [58]:
model = create_model()
model

Instructions for updating:
Colocations handled automatically by placer.


<keras.engine.sequential.Sequential at 0x19143ac1e88>

In [59]:
X_train

6740     [[[-0.033784185]], [[0.18223864]], [[0.3139745...
9789     [[[-0.06403985]], [[0.1766307]], [[0.19665577]...
539      [[[0.35608]], [[0.19476]], [[0.60341]], [[-0.6...
195      [[[-0.081377335]], [[0.34111]], [[0.150256]], ...
10577    [[[-0.08837729]], [[0.14820686]], [[0.2811603]...
                               ...                        
4303     [[[0.19914995]], [[0.2613253]], [[0.3200894]],...
5494     [[[-0.081934646]], [[0.30932894]], [[0.0633407...
5284     [[[-0.078757085]], [[0.21455105]], [[0.3401470...
3192     [[[-0.17399804]], [[0.24002126]], [[0.33891255...
11847    [[[0.10451001]], [[-0.051071294]], [[0.369976]...
Name: tweet, Length: 9897, dtype: object

In [60]:
X_train[1].shape

(100, 1, 1)

In [61]:
model.fit(X_train, y_train, epochs=100, batch_size = 64, validation_data=(X_test, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 9897 samples, validate on 3299 samples
Epoch 1/100


ValueError: setting an array element with a sequence.