In [64]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet
import string
from sklearn.model_selection import KFold
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split, GridSearchCV

In [65]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')

# initialize the document embeddings, mode = mean
doc_embed = DocumentPoolEmbeddings([glove_embedding])

In [66]:
def embedding(sentence):
    sentence = Sentence(sentence)
    doc_embed.embed(sentence)
    np_tens = sentence.embedding.numpy()
    #tf_tensor = tf.convert_to_tensor(np_tens)
    return np_tens

In [67]:
df = pd.read_csv(r"olid-training-v1.0.tsv", sep="\t")

In [68]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,OFF,TIN,OTH
13238,27429,@USER Pussy,OFF,UNT,


In [69]:
df['tweet'] = df['tweet'].str.replace('@USER','')

In [70]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [71]:
# Removing hyperlinks
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

In [72]:
df['tweet'] = df['tweet'].apply(hyperlink)

In [73]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [74]:
# Removing retweets
def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

In [75]:
df['tweet'] = df['tweet'].apply(retweets)

In [76]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [77]:
def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

In [78]:
df["tweet"] = df["tweet"].apply(split_hashtag)

In [79]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [80]:
# Splitting joined words
def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

In [81]:
df['tweet'] = df['tweet'].apply(join_words)

In [82]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [83]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [84]:
df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')

In [85]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to ...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move i...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [86]:
df['tweet'] = df['tweet'].str.strip()

In [87]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what the...,OFF,UNT,
1,90194,Go home you’re drunk MAGA Trump 👊🇺🇸👊 URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,Someone shouldve Taken this piece of shit to a...,OFF,UNT,
4,43605,Obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and t...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage We dont give a crap,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [88]:
df['tweet'] = df['tweet'].str.lower()

In [89]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [90]:
from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

In [91]:
def token(tweet):
    return tokenizer.tokenize(tweet)

In [92]:
df['tweet'] = df['tweet'].apply(token)

In [93]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[she, should, ask, a, few, native, americans, ...",OFF,UNT,
1,90194,"[go, home, you, ’, re, drunk, maga, trump, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, is, investigating, chinese, employees...",NOT,,
3,62688,"[someone, shouldve, taken, this, piece, of, sh...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, to, m...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, i, get, strong, vibes, from, peopl...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, not, to...",NOT,,
13237,82921,"[and, why, report, this, garbage, we, dont, gi...",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [94]:
stop_words = set(stopwords.words('english'))

In [95]:
def remove(tweet):
    return [i for i in tweet if i not in stop_words]

In [96]:
df['tweet'] = df['tweet'].apply(remove)

In [97]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, americans, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employees, se...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberals, amp, illegals, move,...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibes, people, man, ’...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [98]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [99]:
def get_wordnet_POS(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(word, wordnet.NOUN)


In [100]:
def lemm(tweet):
    return [lemmatizer.lemmatize(i, get_wordnet_POS(i)) for i in tweet]

In [101]:
df['tweet'] = df['tweet'].apply(lemm)

In [102]:
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,"[ask, native, american, take]",OFF,UNT,
1,90194,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF,TIN,IND
2,16820,"[amazon, investigating, chinese, employee, sel...",NOT,,
3,62688,"[someone, shouldve, taken, piece, shit, volcan...",OFF,UNT,
4,43605,"[obama, wanted, liberal, amp, illegals, move, ...",NOT,,
...,...,...,...,...,...
13235,95338,"[sometimes, get, strong, vibe, people, man, ’,...",OFF,TIN,IND
13236,67210,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT,,
13237,82921,"[report, garbage, dont, give, crap]",OFF,TIN,OTH
13238,27429,[pussy],OFF,UNT,


In [103]:
df = df.drop(['subtask_b', 'subtask_c', 'id'], axis=1)

In [104]:
df

Unnamed: 0,tweet,subtask_a
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [105]:
df = df.rename(columns={'subtask_a': 'Offensive'})

In [106]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",OFF
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",OFF
2,"[amazon, investigating, chinese, employee, sel...",NOT
3,"[someone, shouldve, taken, piece, shit, volcan...",OFF
4,"[obama, wanted, liberal, amp, illegals, move, ...",NOT
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",OFF
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",NOT
13237,"[report, garbage, dont, give, crap]",OFF
13238,[pussy],OFF


In [107]:
def off(cls):
    if cls =='OFF':
        return 1
    elif cls == 'NOT':
        return 0

In [108]:
df["Offensive"] = df["Offensive"].apply(off)

In [109]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [110]:
rows_to_drop = []

for i in range(len(df)):
    try:
        df.iloc[i]['tweet'] = embedding(df.iloc[i]['tweet'])
    except:
        rows_to_drop.append(i)

print(rows_to_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


[847, 995, 1549, 2709, 2734, 2927, 3315, 3418, 3840, 4241, 4422, 5154, 5175, 6425, 7310, 7381, 7634, 7816, 8008, 8177, 8304, 8652, 8653, 8871, 9567, 9713, 9833, 9925, 10095, 10103, 10373, 10685, 10710, 10718, 10726, 11010, 11268, 11609, 11618, 11635, 11667, 11679, 12615, 12872]


In [111]:
df.drop(rows_to_drop, inplace=True)

In [112]:
df

Unnamed: 0,tweet,Offensive
0,"[ask, native, american, take]",1
1,"[go, home, ’, drunk, maga, trump, 👊, 🇺, 🇸, 👊, ...",1
2,"[amazon, investigating, chinese, employee, sel...",0
3,"[someone, shouldve, taken, piece, shit, volcan...",1
4,"[obama, wanted, liberal, amp, illegals, move, ...",0
...,...,...
13235,"[sometimes, get, strong, vibe, people, man, ’,...",1
13236,"[benidorm, ✅, creamfields, ✅, maga, ✅, shabby,...",0
13237,"[report, garbage, dont, give, crap]",1
13238,[pussy],1


In [113]:
df["tweet"] = df["tweet"].apply(embedding)

In [114]:
df.iloc[0]['tweet']

array([-0.0046315 ,  0.3850355 ,  0.53388   , -0.081911  , -0.1186325 ,
        0.17782725, -0.34666944, -0.03825501, -0.16063249, -0.0504125 ,
       -0.34632948, -0.29153997,  0.09328249, -0.05276125, -0.15749174,
       -0.17324498,  0.44185752,  0.04509751, -0.72000253,  0.55329   ,
        0.2311575 , -0.045605  ,  0.13985574, -0.0055725 ,  0.24697998,
       -0.081535  , -0.1513125 , -0.95780003,  0.5171975 , -0.0983775 ,
       -0.48116273,  0.50203   , -0.0489075 ,  0.0139003 ,  0.00868   ,
        0.293555  , -0.0516055 ,  0.21422501,  0.3084825 ,  0.00375175,
       -0.8380075 , -0.3673225 , -0.01753975, -0.23527902, -0.1801703 ,
       -0.01202675, -0.0765925 , -0.17686075, -0.1126645 , -0.76581997,
       -0.22583751,  0.13133201,  0.2711975 ,  0.4609725 , -0.12288079,
       -1.7613275 , -0.197709  , -0.4304255 ,  1.768525  ,  0.33898976,
        0.060244  ,  0.75451   ,  0.059105  , -0.33393875,  0.784495  ,
       -0.24248776,  0.14745337,  0.630935  ,  0.18118002, -0.03

In [187]:
type(df.iloc[0]['tweet'])

numpy.ndarray

In [188]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['Offensive'], stratify=df['Offensive'], random_state=0)

In [189]:
type(X_train.iloc[0])

numpy.ndarray

In [190]:
type(y_train.iloc[0])

numpy.int64

In [191]:
maxim = 0
for i in range(len(df)):
    if len(df.iloc[i]['tweet']) > maxim:
        maxim = len(df.iloc[i]['tweet'])

maxim

100

In [253]:
def create_model():
    model = Sequential()
    model.add(Embedding(8000, 32, input_length=100))
    model.add(LSTM(20))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [254]:
model = create_model()
model

<keras.engine.sequential.Sequential at 0x21702b10208>

In [255]:
X_train

6740     [-0.033784185, 0.18223864, 0.31397453, -0.1651...
9789     [-0.06403985, 0.1766307, 0.19665577, -0.334790...
539      [0.35608, 0.19476, 0.60341, -0.61308503, -0.49...
195      [-0.081377335, 0.34111, 0.150256, -0.228305, 0...
10577    [-0.08837729, 0.14820686, 0.2811603, -0.311066...
                               ...                        
4303     [0.19914995, 0.2613253, 0.3200894, -0.19457823...
5494     [-0.081934646, 0.30932894, 0.06334071, -0.1156...
5284     [-0.078757085, 0.21455105, 0.34014702, -0.3832...
3192     [-0.17399804, 0.24002126, 0.33891255, -0.11598...
11847    [0.10451001, -0.051071294, 0.369976, -0.416627...
Name: tweet, Length: 9897, dtype: object

In [256]:
X_train.shape

(9897,)

In [257]:
X_train1 = np.asarray(X_train, dtype=object)
y_train1 = np.asarray(y_train, dtype=object)
X_test1 = np.asarray(X_test, dtype=object)
y_test1 = np.asarray(y_test, dtype=object)

In [258]:
X_train1

array([array([-3.37841846e-02,  1.82238638e-01,  3.13974530e-01, -1.65178001e-01,
       -2.92644918e-01,  2.62234390e-01, -4.45966460e-02,  2.53787160e-01,
       -3.12278271e-01, -3.70326340e-01, -7.94536714e-03, -4.77104560e-02,
        1.46802545e-01,  1.10977739e-01,  1.38641357e-01, -2.03035638e-01,
        7.12371841e-02,  1.64705902e-01, -3.95721793e-01,  3.20700139e-01,
        2.26792768e-01,  9.48056430e-02, -2.19005644e-02, -9.76072624e-02,
        2.79099196e-01,  6.39651939e-02, -2.86521405e-01, -4.19285566e-01,
        2.26969913e-01, -8.37318879e-03, -3.36163998e-01,  2.74482101e-01,
       -1.53909076e-03,  1.27478078e-01, -9.69440937e-02,  3.65890414e-01,
       -3.20975572e-01,  7.44276345e-02,  1.36901438e-01, -1.46995723e-01,
       -4.15546447e-01, -1.96348261e-02,  2.51593571e-02, -3.12878907e-01,
       -1.46404088e-01, -5.86481802e-02, -3.95463519e-02, -3.50214005e-01,
        2.91165620e-01, -7.11496294e-01, -1.04337931e-01, -1.38213813e-01,
        1.43356994

In [259]:
for i in range(len(X_train1)):
    if X_train1[i].shape != (100,):
        print('hello')

In [260]:
model.fit(X_train, y_train, epochs=100, batch_size = 64, validation_data=(X_test, y_test))

ValueError: Error when checking input: expected embedding_9_input to have shape (100,) but got array with shape (1,)