In [49]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.svm import SVC
import pandas as pd
import numpy

## Dataset

In [6]:
df = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\olid-training-v1.0.tsv', delimiter='\t')
df['tweet'] = df['tweet'].str.replace('@USER','')
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


## Preprocessing and Cleaning

In [7]:
import re
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

df['tweet'] = df['tweet'].apply(hyperlink)
df['tweet'] = df['tweet'].apply(retweets)
df['tweet'] = df['tweet'].apply(split_hashtag)
df['tweet'] = df['tweet'].apply(join_words)
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [8]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [9]:
import string

df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df['tweet'] = df['tweet'].str.strip()
df['tweet'] = df['tweet'].str.lower()

from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

df

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [10]:
df = df.drop(['id', 'subtask_b', 'subtask_c'], axis=1)
df = df.rename(columns={'subtask_a': 'offensive'})
df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,OFF
1,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldve taken this piece of shit to a...,OFF
4,obama wanted liberals amp illegals to move int...,NOT
...,...,...
13235,sometimes i get strong vibes from people and t...,OFF
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT
13237,and why report this garbage we dont give a crap,OFF
13238,pussy,OFF


In [11]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

df['offensive'] = df['offensive'].apply(repl)

df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,1
1,go home you’re drunk maga trump 👊🇺🇸👊 url,1
2,amazon is investigating chinese employees who ...,0
3,someone shouldve taken this piece of shit to a...,1
4,obama wanted liberals amp illegals to move int...,0
...,...,...
13235,sometimes i get strong vibes from people and t...,1
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,0
13237,and why report this garbage we dont give a crap,1
13238,pussy,1


In [56]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['offensive'], stratify=df['offensive'], shuffle=0)

In [57]:
X_train

8061     same paint ball range they used for the antifa...
5031                                       you are too 😘😘😘
2693                          i’ll toast to me hillary url
7966     trump will blame it on the immigrants for comi...
1841     the comments below defending drug fords blatan...
                               ...                        
6389     bono is utterly stupid ridiculous wealthy tax ...
11781    i didnt think about violence towards antifa wh...
12299    did he read ‘mein kampf’ to understand hitler’...
7986                   that was fucking fast holy shit lol
7219     he wasnt really different with who he is now o...
Name: tweet, Length: 9930, dtype: object

In [58]:
y_train

8061     0
5031     0
2693     0
7966     0
1841     1
        ..
6389     1
11781    0
12299    0
7986     1
7219     0
Name: offensive, Length: 9930, dtype: int64

In [74]:
# tokenize tweets using TweetTokenizer

tweets = [tokenizer.tokenize(tw) for tw in X_train_unlab]
len(tweets)
test = [tokenizer.tokenize(tw) for tw in X_test]
len(test)

3310

## Creating Doc2Vec

In [75]:
tagged = [TaggedDocument(d, [i]) for i, d in enumerate(tweets)]
tagged_test = [TaggedDocument(d, [i]) for i, d in enumerate(test)]
tagged

[TaggedDocument(words=['same', 'paint', 'ball', 'range', 'they', 'used', 'for', 'the', 'antifa', 'vs', 'proudboy', 'match', 'last', 'year'], tags=[0]),
 TaggedDocument(words=['you', 'are', 'too', '😘', '😘', '😘'], tags=[1]),
 TaggedDocument(words=['i', '’', 'll', 'toast', 'to', 'me', 'hillary', 'url'], tags=[2]),
 TaggedDocument(words=['trump', 'will', 'blame', 'it', 'on', 'the', 'immigrants', 'for', 'coming', 'and', 'liberals', 'for', 'noticing', 'he', 'likely', 'thinks', 'head', 'starthiv', 'are', 'wasteful', 'spending', 'comgress', 'can', 'no', 'longer', 'pass', 'broad', 'budgets', 'they', 'will', 'need', 'to', 'itemize', 'every', 'and', 'forbid', 'diversion', 'without', 'congressional', 'permission'], tags=[3]),
 TaggedDocument(words=['the', 'comments', 'below', 'defending', 'drug', 'fords', 'blatant', 'abuse', 'of', 'power', 'is', 'completely', 'laughable', 'but', 'but', 'the', 'liberals', 'bias', 'bias', 'i', 'tell', 'you', 'must', 'be', 'only', 'the', 'ndp', 'rich', 'downtown', 'a

In [17]:
model = Doc2Vec(workers = 8, epochs = 20, dm=0)

In [18]:
model.build_vocab(tagged)

In [19]:
model.train(tagged, total_examples = model.corpus_count, epochs = model.epochs)

In [20]:
model.wv.most_similar('trump')

[('visit', 0.386379212141037),
 ('mother', 0.33676138520240784),
 ('platforms', 0.30505114793777466),
 ('pls', 0.29837530851364136),
 ('murders', 0.29471415281295776),
 ('set', 0.2873609662055969),
 ('shooting', 0.2844494581222534),
 ('lunch', 0.2804383933544159),
 ('already', 0.2784564793109894),
 ('matters', 0.27271130681037903)]

In [21]:
vec = model['king'] - model['man'] + model['woman'] # doesn't get 'queen' because not enough data
model.wv.most_similar([vec])

[('woman', 0.5522732734680176),
 ('king', 0.5491523742675781),
 ('supporting', 0.3437233865261078),
 ('followers', 0.30055946111679077),
 ('devil', 0.29156792163848877),
 ('caring', 0.28650444746017456),
 ('laughs', 0.2852778732776642),
 ('perhaps', 0.28387171030044556),
 ('daughter', 0.2798519432544708),
 ('havent', 0.27737724781036377)]

In [22]:
model1 = Doc2Vec(workers = 8, epochs = 20, dm=1)

In [23]:
model1.build_vocab(tagged)

In [24]:
model1.train(tagged, total_examples = model1.corpus_count, epochs = model1.epochs)

In [25]:
model1.wv.most_similar('trump')

[('donald', 0.7244689464569092),
 ('america', 0.6783621907234192),
 ('kavanaugh', 0.6443151831626892),
 ('president', 0.6235218048095703),
 ('elected', 0.6151584386825562),
 ('trumps', 0.6124414205551147),
 ('judge', 0.6107865571975708),
 ('mc', 0.604667067527771),
 ('j', 0.6035543084144592),
 ('supporters', 0.5874176621437073)]

In [26]:
vec = model1['king'] - model1['man'] + model1['woman'] # doesn't get 'queen' because not enough data
model1.wv.most_similar([vec])

[('credibility', 0.6265100240707397),
 ('over', 0.6009899377822876),
 ('powers', 0.5848269462585449),
 ('destroyed', 0.5734577775001526),
 ('emotional', 0.5730862617492676),
 ('force', 0.5686177015304565),
 ('identity', 0.5619586706161499),
 ('threatening', 0.5534588098526001),
 ('children', 0.5529146194458008),
 ('theyd', 0.5456287264823914)]

In [27]:
def get_vectors(model, input_docs):
    vectors = [model.infer_vector(doc.words) for doc in input_docs]
    return vectors

## SVM

In [35]:
X_train = get_vectors(model1, tagged)

In [36]:
clf = SVC()

In [37]:
clf.fit(X_train, y_train)

SVC()

In [38]:
X_test = get_vectors(model1, tagged_test)

In [39]:
clf.score(X_test, y_test)

0.6833836858006043

In [40]:
model1.save("doc2vec.model")

In [41]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.001, 0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10, 100]}

In [43]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [44]:
grid_result = gsc.fit(X_train, y_train)

In [45]:
best_params = grid_result.best_params_
best_params

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [46]:
grid_result.best_score_

0.6900302114803626

In [47]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=1, gamma=1, probability=True)

## Loading and Processing Unlabelled Data



In [28]:
df_unlab = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\processed_unlab.csv')

In [29]:
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,15923,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,1,27014,"['constitution', 'day', 'revered', 'conservati..."
2,2,30530,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,3,13876,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,4,60133,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...,...,...
1308,1308,30778,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,1309,22569,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,1310,48938,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,1311,41438,"['two', 'taste', 'like', 'ass', 'url']"


In [30]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('@USER','')
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,15923,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,1,27014,"['constitution', 'day', 'revered', 'conservati..."
2,2,30530,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,3,13876,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,4,60133,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...,...,...
1308,1308,30778,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,1309,22569,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,1310,48938,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,1311,41438,"['two', 'taste', 'like', 'ass', 'url']"


In [31]:
df_unlab['tweet'] = df_unlab['tweet'].apply(hyperlink)
df_unlab['tweet'] = df_unlab['tweet'].apply(retweets)
df_unlab['tweet'] = df_unlab['tweet'].apply(split_hashtag)
df_unlab['tweet'] = df_unlab['tweet'].apply(join_words)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,15923,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,1,27014,"['constitution', 'day', 'revered', 'conservati..."
2,2,30530,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,3,13876,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,4,60133,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...,...,...
1308,1308,30778,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,1309,22569,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,1310,48938,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,1311,41438,"['two', 'taste', 'like', 'ass', 'url']"


In [32]:
df_unlab['tweet'] = df_unlab['tweet'].replace('\d+', '', regex=True)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,15923,"['q', 'wheres', 'server', 'dump', 'nike', 'dec..."
1,1,27014,"['constitution', 'day', 'revered', 'conservati..."
2,2,30530,"['foxnews', 'nra', 'maga', 'potus', 'trump', '..."
3,3,13876,"['watching', 'boomer', 'getting', 'news', 'sti..."
4,4,60133,"['pasaran', 'unity', 'demo', 'oppose', 'farrig..."
...,...,...,...
1308,1308,30778,"['stop', 'etchecopar', 'fuck', '🖕', '🖕', '🖕', ..."
1309,1309,22569,"['antifa', 'mentally', 'unstable', 'cowards', ..."
1310,1310,48938,"['browning', 'looked', 'like', 'dog', 'shit', ..."
1311,1311,41438,"['two', 'taste', 'like', 'ass', 'url']"


In [33]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df_unlab['tweet'] = df_unlab['tweet'].str.strip()
df_unlab['tweet'] = df_unlab['tweet'].str.lower()

  df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')


In [34]:
df_unlab = df_unlab.drop(['id', 'Unnamed: 0'], axis=1)
df_unlab

Unnamed: 0,tweet
0,q wheres server dump nike declasfisa democrats...
1,constitution day revered conservatives hated p...
2,foxnews nra maga potus trump nd amendment rnc ...
3,watching boomer getting news still parole alwa...
4,pasaran unity demo oppose farright london – an...
...,...
1308,stop etchecopar fuck 🖕 🖕 🖕 que florezcan mil b...
1309,antifa mentally unstable cowards pretending re...
1310,browning looked like dog shit fcs school ’ point
1311,two taste like ass url


## Adding Unlabelled Data to Labelled

In [60]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,same paint ball range they used for the antifa...
1,you are too 😘😘😘
2,i’ll toast to me hillary url
3,trump will blame it on the immigrants for comi...
4,the comments below defending drug fords blatan...
...,...
9925,bono is utterly stupid ridiculous wealthy tax ...
9926,i didnt think about violence towards antifa wh...
9927,did he read ‘mein kampf’ to understand hitler’...
9928,that was fucking fast holy shit lol


In [61]:
X_train_unlab = X_train.append(df_unlab).reset_index().drop(['index'], axis=1)

In [62]:
X_train_unlab

Unnamed: 0,tweet
0,same paint ball range they used for the antifa...
1,you are too 😘😘😘
2,i’ll toast to me hillary url
3,trump will blame it on the immigrants for comi...
4,the comments below defending drug fords blatan...
...,...
11238,stop etchecopar fuck 🖕 🖕 🖕 que florezcan mil b...
11239,antifa mentally unstable cowards pretending re...
11240,browning looked like dog shit fcs school ’ point
11241,two taste like ass url


In [63]:
X_train_unlab = X_train_unlab['tweet']
X_train_unlab

0        same paint ball range they used for the antifa...
1                                          you are too 😘😘😘
2                             i’ll toast to me hillary url
3        trump will blame it on the immigrants for comi...
4        the comments below defending drug fords blatan...
                               ...                        
11238    stop etchecopar fuck 🖕 🖕 🖕 que florezcan mil b...
11239    antifa mentally unstable cowards pretending re...
11240     browning looked like dog shit fcs school ’ point
11241                               two taste like ass url
11242    despicable dems lie rifles dem distorted law p...
Name: tweet, Length: 11243, dtype: object

In [64]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,offensive
0,0
1,0
2,0
3,0
4,1
...,...
9925,1
9926,0
9927,0
9928,1


In [68]:
values = [-1 for i in range(len(df_unlab))]

y_unlab = pd.DataFrame({'offensive':values})

In [69]:
y_train_unlab = y_train.append(y_unlab)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,0
4,1
...,...
1308,-1
1309,-1
1310,-1
1311,-1


In [70]:
y_train_unlab = y_train_unlab.reset_index().drop(['index'], axis = 1)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,0
4,1
...,...
11238,-1
11239,-1
11240,-1
11241,-1


In [81]:
X_test = get_vectors(model1, tagged_test)

In [76]:
X_train_unlab = get_vectors(model1, tagged)

In [78]:
len(X_train_unlab)

11243

## Self Training

In [83]:
self_training_model = SelfTrainingClassifier(clf, max_iter=100, threshold=0.95)

In [84]:
self_training_model.fit(X_train_unlab, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=1, gamma=1, probability=True),
                       max_iter=100, threshold=0.95)

In [85]:
self_training_model.score(X_test, y_test)

0.6891238670694864