In [1]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.svm import SVC
import pandas as pd
import numpy

## Dataset

In [2]:
df = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\olid-training-v1.0.tsv', delimiter='\t')
df['tweet'] = df['tweet'].str.replace('@USER','')
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


## Preprocessing and Cleaning

In [3]:
import re
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

df['tweet'] = df['tweet'].apply(hyperlink)
df['tweet'] = df['tweet'].apply(retweets)
df['tweet'] = df['tweet'].apply(split_hashtag)
df['tweet'] = df['tweet'].apply(join_words)
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [4]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [5]:
import string

df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df['tweet'] = df['tweet'].str.strip()
df['tweet'] = df['tweet'].str.lower()

from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

df

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [6]:
df = df.drop(['id', 'subtask_b', 'subtask_c'], axis=1)
df = df.rename(columns={'subtask_a': 'offensive'})
df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,OFF
1,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldve taken this piece of shit to a...,OFF
4,obama wanted liberals amp illegals to move int...,NOT
...,...,...
13235,sometimes i get strong vibes from people and t...,OFF
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT
13237,and why report this garbage we dont give a crap,OFF
13238,pussy,OFF


In [7]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

df['offensive'] = df['offensive'].apply(repl)

df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,1
1,go home you’re drunk maga trump 👊🇺🇸👊 url,1
2,amazon is investigating chinese employees who ...,0
3,someone shouldve taken this piece of shit to a...,1
4,obama wanted liberals amp illegals to move int...,0
...,...,...
13235,sometimes i get strong vibes from people and t...,1
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,0
13237,and why report this garbage we dont give a crap,1
13238,pussy,1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['offensive'], stratify=df['offensive'], shuffle=0)

In [9]:
X_train

8403     gotta give him credit where its due he may not...
3183                      f the democ rats  slimy bastards
1325     mazzone has convinced him he can play in the n...
8340          we all know liberals poll is completely a bs
10959                     feel free to start with antifa 😉
                               ...                        
1721                     something went down she is hiding
4774     look at his reaction at the  second mark what ...
5859      yes the cpc needs more liberals good call scheer
10887    same i just want to win so i can finally have ...
11438    lmao saying you are happy for someone publical...
Name: tweet, Length: 9930, dtype: object

In [10]:
y_train

8403     1
3183     1
1325     0
8340     0
10959    0
        ..
1721     0
4774     1
5859     0
10887    1
11438    0
Name: offensive, Length: 9930, dtype: int64

In [11]:
# tokenize tweets using TweetTokenizer

tweets = [tokenizer.tokenize(tw) for tw in X_train]
len(tweets)
test = [tokenizer.tokenize(tw) for tw in X_test]
len(test)

3310

## Creating Doc2Vec

In [12]:
tagged = [TaggedDocument(d, [i]) for i, d in enumerate(tweets)]
tagged_test = [TaggedDocument(d, [i]) for i, d in enumerate(test)]
tagged

[TaggedDocument(words=['gotta', 'give', 'him', 'credit', 'where', 'its', 'due', 'he', 'may', 'not', 'be', 'a', 'mason', 'but', 'he', 'is', 'out', 'there', 'competing', 'his', 'ass', 'off'], tags=[0]),
 TaggedDocument(words=['f', 'the', 'democ', 'rats', 'slimy', 'bastards'], tags=[1]),
 TaggedDocument(words=['mazzone', 'has', 'convinced', 'him', 'he', 'can', 'play', 'in', 'the', 'nfl', 'if', 'he', 'does', 'as', 'he', 'is', 'told', 'and', 'acts', 'like', 'a', 'good', 'little', 'pocket', 'passer'], tags=[2]),
 TaggedDocument(words=['we', 'all', 'know', 'liberals', 'poll', 'is', 'completely', 'a', 'bs'], tags=[3]),
 TaggedDocument(words=['feel', 'free', 'to', 'start', 'with', 'antifa', '😉'], tags=[4]),
 TaggedDocument(words=['if', 'you', 'go', 'by', 'anything', 'other', 'than', 'he', 'or', 'she', 'you', 'are', 'fucked', 'in', 'the', 'head'], tags=[5]),
 TaggedDocument(words=['you', 'do', 'scare', 'all', 'americans', 'with', 'ur', 'fake', 'democratic', 'platform', 'it', '’', 's', 'mainly', 

In [13]:
model = Doc2Vec(workers = 8, epochs = 20, dm=0)

In [14]:
model.build_vocab(tagged)

In [15]:
model.train(tagged, total_examples = model.corpus_count, epochs = model.epochs)

In [16]:
model.wv.most_similar('trump')

[('available', 0.3252343535423279),
 ('bigger', 0.3229202628135681),
 ('🤗', 0.320919930934906),
 ('diane', 0.3168693482875824),
 ('yourself', 0.3146665096282959),
 ('💖', 0.30668213963508606),
 ('fact', 0.30004918575286865),
 ('alert', 0.29771342873573303),
 ('lefties', 0.29474225640296936),
 ('pig', 0.29258230328559875)]

In [17]:
vec = model['king'] - model['man'] + model['woman'] # doesn't get 'queen' because not enough data
model.wv.most_similar([vec])

[('woman', 0.6744350790977478),
 ('king', 0.4634184241294861),
 ('owner', 0.33284690976142883),
 ('end', 0.30278295278549194),
 ('known', 0.3017917275428772),
 ('loose', 0.28864502906799316),
 ('county', 0.2863807678222656),
 ('tweeting', 0.28192421793937683),
 ('guy', 0.28166404366493225),
 ('propaganda', 0.28124552965164185)]

In [18]:
model1 = Doc2Vec(workers = 8, epochs = 20, dm=1)

In [19]:
model1.build_vocab(tagged)

In [20]:
model1.train(tagged, total_examples = model1.corpus_count, epochs = model1.epochs)

In [21]:
model1.wv.most_similar('trump')

[('president', 0.648995578289032),
 ('trumps', 0.6409957408905029),
 ('donald', 0.6380090713500977),
 ('elected', 0.617723286151886),
 ('hillary', 0.6041927933692932),
 ('kavanaugh', 0.5823317170143127),
 ('america', 0.5779498815536499),
 ('supporters', 0.548123836517334),
 ('walter', 0.5445637106895447),
 ('obama', 0.54308021068573)]

In [22]:
vec = model1['king'] - model1['man'] + model1['woman'] # doesn't get 'queen' because not enough data
model1.wv.most_similar([vec])

[('threatening', 0.6172788143157959),
 ('promised', 0.5943372845649719),
 ('westminster', 0.5845928192138672),
 ('outside', 0.5828120708465576),
 ('minions', 0.5777948498725891),
 ('tantrums', 0.5677062273025513),
 ('woman', 0.5646189451217651),
 ('social', 0.5620445013046265),
 ('power', 0.5571749210357666),
 ('diane', 0.5467966794967651)]

In [23]:
def get_vectors(model, input_docs):
    vectors = [model.infer_vector(doc.words) for doc in input_docs]
    return vectors

## SVM

In [24]:
X_train = get_vectors(model1, tagged)

In [25]:
clf = SVC()

In [26]:
clf.fit(X_train, y_train)

SVC()

In [27]:
X_test = get_vectors(model1, tagged_test)

In [28]:
clf.score(X_test, y_test)

0.6885196374622357

In [30]:
model1.save("doc2vec.model")