In [16]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import pandas as pd
import numpy

## Dataset

In [3]:
# Read the tab-separated file olid-training-v1.0.tsv into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell will only
# run where the file exists at exactly this location.
olid_path = r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\olid-training-v1.0.tsv'
df = pd.read_csv(olid_path, delimiter='\t')

# Remove every '@USER' occurrence from the tweet text.
tweets_without_mentions = df['tweet'].str.replace('@USER','')
df['tweet'] = tweets_without_mentions
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


## Preprocessing and Cleaning

In [4]:
import re
def hyperlink(tweet):
    """Remove http/https URLs from ``tweet`` and return the resulting string.

    A URL is taken to be ``http://`` or ``https://`` followed by a run of
    non-whitespace characters.  Only the URL itself is removed; any words
    that follow it on the same line are preserved (a greedy ``.*`` would
    swallow the remainder of the line along with the link).
    """
    return re.sub(r'https?://\S+', '', tweet)

def retweets(tweet):
    """Drop a leading ``RT`` marker (plus the whitespace after it) from ``tweet``.

    Only an ``RT`` at the very start of the string that is followed by at
    least one whitespace character is removed; anything else is returned
    unchanged.
    """
    rt_prefix = re.compile(r'^RT[\s]+')
    return rt_prefix.sub('', tweet)

def split_hashtag(tweet):
    """Return ``tweet`` with every ``#`` character deleted.

    Despite the name, the hashtag text itself is not split into words here;
    only the ``#`` sign is removed.
    """
    hash_sign = re.compile(r'#')
    return hash_sign.sub('', tweet)

def join_words(tweet):
    """Insert a space wherever an uppercase letter directly follows a
    lowercase letter or one of ``.``, ``!``, ``?``.

    This separates words that were glued together, e.g. ``helloWorld``
    becomes ``hello World``.
    """
    glued_boundary = re.compile(r"([a-z\.!?])([A-Z])")
    return glued_boundary.sub(r"\1 \2", tweet)

# Run the cleaning helpers over the tweet column one after another, in this
# fixed order: URLs, retweet marker, '#' signs, then glued-word splitting.
for clean_step in (hyperlink, retweets, split_hashtag, join_words):
    df['tweet'] = df['tweet'].apply(clean_step)
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [5]:
# Delete every run of digits from the tweets. The pattern is a raw string so
# that the backslash in \d is not treated as an (invalid) string escape.
df['tweet'] = df['tweet'].replace(r'\d+', '', regex=True)

In [6]:
import string

# Character class matching any single ASCII punctuation character.
# re.escape keeps characters such as '\', ']', '^' and '-' literal inside the
# brackets, so the backslash is removed as well.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))

# regex=True is passed explicitly: the pattern is a regular expression, and
# the default of `regex` differs between pandas versions.
df['tweet'] = df['tweet'].str.replace(punctuation_pattern, '', regex=True)
df['tweet'] = df['tweet'].str.strip()   # drop leading/trailing whitespace
df['tweet'] = df['tweet'].str.lower()   # normalise case

from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
# Tweet-aware tokenizer, used further down to tokenize the training tweets.
tokenizer = TweetTokenizer(
    preserve_case=True,
    strip_handles=True,
    reduce_len=True,
)

df

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [7]:
# Discard the id and the subtask_b / subtask_c labels, and expose the
# subtask_a label under the name 'offensive'.
df = (
    df
    .drop(columns=['id', 'subtask_b', 'subtask_c'])
    .rename(columns={'subtask_a': 'offensive'})
)
df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,OFF
1,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldve taken this piece of shit to a...,OFF
4,obama wanted liberals amp illegals to move int...,NOT
...,...,...
13235,sometimes i get strong vibes from people and t...,OFF
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT
13237,and why report this garbage we dont give a crap,OFF
13238,pussy,OFF


In [8]:
def repl(off):
    """Map a label to a binary flag: 1 for the exact string 'OFF', else 0."""
    return 1 if off == 'OFF' else 0

# Convert the 'offensive' column to 0/1 flags using repl.
df = df.assign(offensive=df['offensive'].apply(repl))

df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,1
1,go home you’re drunk maga trump 👊🇺🇸👊 url,1
2,amazon is investigating chinese employees who ...,0
3,someone shouldve taken this piece of shit to a...,1
4,obama wanted liberals amp illegals to move int...,0
...,...,...
13235,sometimes i get strong vibes from people and t...,1
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,0
13237,and why report this garbage we dont give a crap,1
13238,pussy,1


In [9]:
# Stratified train/test split on the 'offensive' label.
# `shuffle` is left at its default (True): a stratified split is only
# supported with shuffling. `random_state` is fixed so that re-running the
# notebook reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['offensive'],
    stratify=df['offensive'],
    random_state=42,
)

In [10]:
# Display the training tweets produced by the split.
X_train

2100     ohno benefits and employment goes up see how i...
1488     wisdomwednesday   forgiveness says you are giv...
7625     meanwhile greatest economy ever more jobs for ...
1116     wont be legalized federally since it can be us...
9294     this high road shit doesnt work on trumpconser...
                               ...                        
3380     and what about  s accuser or  s accuser s or d...
10182        is a disgrace to the united states of america
11466                                          yeah she is
3215     my guess a lot of unflattering facts will come...
8694     because he is the most influential person in p...
Name: tweet, Length: 9930, dtype: object

In [11]:
# Display the 0/1 labels that go with the training tweets.
y_train

2100     0
1488     0
7625     1
1116     0
9294     0
        ..
3380     0
10182    0
11466    0
3215     0
8694     0
Name: offensive, Length: 9930, dtype: int64

In [12]:
# Run the TweetTokenizer over every training tweet; each tweet becomes a
# list of tokens.

tweets = list(map(tokenizer.tokenize, X_train))
len(tweets)

9930

## Creating Doc2Vec

In [19]:
# Wrap each token list in a TaggedDocument whose single tag is the tweet's
# position in `tweets`.
tagged = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tweets)
]
tagged

[TaggedDocument(words=['ohno', 'benefits', 'and', 'employment', 'goes', 'up', 'see', 'how', 'it', 'works', 'yet'], tags=[0]),
 TaggedDocument(words=['wisdomwednesday', 'forgiveness', 'says', 'you', 'are', 'given', 'another', 'chance', 'for', 'a', 'new', 'beginning', 'pray', 'with', 'forgiveness', 'in', 'your', 'heart', 'and', 'you', 'can', 'guarantee', 'an', 'answer', 'to', '…', 'url'], tags=[1]),
 TaggedDocument(words=['meanwhile', 'greatest', 'economy', 'ever', 'more', 'jobs', 'for', 'blacks', 'and', 'hispanics', 'in', 'history', 'k', 'is', 'kicking', 'ass', 'ms', 'and', 'illegals', 'are', 'being', 'deported', 'daily', 'nk', 'is', 'back', 'to', 'the', 'table', 'for', 'denuclearizing', 'trump', 'is', 'draining', 'the', 'corruption', 'of', 'doj', 'and', 'fbi', 'outfrickingstanding', 'maga', 'url'], tags=[2]),
 TaggedDocument(words=['wont', 'be', 'legalized', 'federally', 'since', 'it', 'can', 'be', 'used', 'as', 'back', 'door', 'gun', 'control'], tags=[3]),
 TaggedDocument(words=['this

In [23]:
# Untrained Doc2Vec model: 20 training epochs, 8 worker threads.
model = Doc2Vec(epochs=20, workers=8)

In [24]:
# Build the model's vocabulary from the tagged training tweets.
model.build_vocab(tagged)

In [25]:
# Train on the tagged tweets, reusing the document count and epoch count
# already stored on the model.
model.train(
    tagged,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

In [34]:
# Sanity check: words whose learned vectors are closest to 'idiot'.
model.wv.most_similar('idiot')

[('incredible', 0.7977585196495056),
 ('cutie', 0.7843677401542664),
 ('fan', 0.7835972309112549),
 ('angry', 0.7712830305099487),
 ('dog', 0.7634627223014832),
 ('hurts', 0.7597971558570862),
 ('politically', 0.7577535510063171),
 ('joke', 0.7539687752723694),
 ('statement', 0.7530205845832825),
 ('missing', 0.7513835430145264)]

In [30]:
# Word-analogy probe: king - man + woman.
# Vectors are read from model.wv so the lookup is unambiguously a word-vector
# lookup, consistent with the most_similar call below.
# doesn't get 'queen' because not enough data
vec = model.wv['king'] - model.wv['man'] + model.wv['woman']
model.wv.most_similar([vec])

[('threatening', 0.5895255208015442),
 ('woman', 0.5848171710968018),
 ('anywhere', 0.5471601486206055),
 ('emotional', 0.5412232279777527),
 ('likely', 0.5335356593132019),
 ('dream', 0.5299286246299744),
 ('details', 0.5275073647499084),
 ('population', 0.526707112789154),
 ('each', 0.5256714224815369),
 ('children', 0.5218897461891174)]

# SVM

### Getting the vector for each word

In [129]:
from sklearn.svm import SVC
clf = SVC()

# NOTE(review): `model1` is not defined in any cell visible in this notebook
# (the model trained above is named `model`) - confirm where it comes from;
# on a fresh kernel this line would raise NameError.
word_vectors = model1.wv

# Map each vocabulary word to its embedding vector. `vectors` is the
# embedding matrix; it replaces the deprecated `syn0` alias.
w2v = dict(zip(word_vectors.index2word, word_vectors.vectors))
w2vec = numpy.array([])

w2v

  after removing the cwd from sys.path.


{'the': array([-0.21288082, -0.30514976,  0.45683023, -0.0820969 , -0.56138945,
         0.11372207, -0.5533024 ,  0.04349791,  0.06861335,  0.3547414 ,
         0.19841443, -0.14173745,  0.3158547 ,  0.06373475, -0.5846047 ,
        -0.71861476,  0.06712539,  0.46101683, -0.3592397 ,  0.1203393 ,
        -0.6401876 ,  0.08784334, -0.23467247,  0.1962325 , -0.87416166,
         0.7918254 ,  0.63311756,  0.80111426,  0.31387395,  0.10559755,
         0.5042936 , -0.21549341], dtype=float32),
 'is': array([-0.9635922 , -0.30766734,  0.8769891 ,  0.07227   ,  0.36279854,
         0.21189405, -0.0851189 , -0.03018517,  0.0324925 , -0.05934168,
         0.65372294, -0.52137035,  0.6959733 ,  0.21613449, -0.22281252,
        -0.7416211 , -0.18109477,  0.6637054 ,  0.04225441,  0.7666011 ,
        -0.9472743 ,  0.09826022, -0.05002215,  0.5571439 , -0.4923499 ,
         0.7726493 ,  0.41609982,  0.5667858 , -0.2511608 ,  0.44146267,
         0.2924284 , -0.08426592], dtype=float32),
 'to': ar

### Making a list of vectors — the problem is that its length (~16.7k, one vector per vocabulary word) is greater than the number of tweets in the training set (~9.9k)

In [124]:
# One plain Python list of floats per entry of w2v, in the dict's iteration
# order (so one row per vocabulary word, not per tweet).
vect_list = [list(word_vec) for word_vec in w2v.values()]

vect_list[0]

[-0.21288082,
 -0.30514976,
 0.45683023,
 -0.0820969,
 -0.56138945,
 0.11372207,
 -0.5533024,
 0.043497913,
 0.06861335,
 0.3547414,
 0.19841443,
 -0.14173745,
 0.3158547,
 0.06373475,
 -0.5846047,
 -0.71861476,
 0.067125395,
 0.46101683,
 -0.3592397,
 0.1203393,
 -0.6401876,
 0.08784334,
 -0.23467247,
 0.1962325,
 -0.87416166,
 0.7918254,
 0.63311756,
 0.80111426,
 0.31387395,
 0.10559755,
 0.5042936,
 -0.21549341]

In [57]:
# Hyper-parameter grid for the SVM: kernel type, regularisation strength C,
# kernel coefficient gamma and polynomial degree.
params = dict(
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    C=[0.001, 0.05, 0.01, 0.1, 1, 5, 10, 100],
    gamma=[0.01, 0.1, 1, 5, 10, 100],
    degree=[1, 2, 3, 4, 5, 6],
)

### Hence, this error is due to a size mismatch
#### What's going wrong

In [121]:
# Raises ValueError (inconsistent numbers of samples): vect_list has one row
# per entry of w2v (one per vocabulary word), whereas y_train has one label
# per training tweet, so the two lengths do not match.
clf.fit(vect_list, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [16724, 9930]