In [75]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.svm import SVC
import pandas as pd
import numpy

## Dataset

In [76]:
df = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\olid-training-v1.0.tsv', delimiter='\t')
df['tweet'] = df['tweet'].str.replace('@USER','')
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


## Preprocessing and Cleaning

In [77]:
import re
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

df['tweet'] = df['tweet'].apply(hyperlink)
df['tweet'] = df['tweet'].apply(retweets)
df['tweet'] = df['tweet'].apply(split_hashtag)
df['tweet'] = df['tweet'].apply(join_words)
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [78]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [79]:
import string

df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df['tweet'] = df['tweet'].str.strip()
df['tweet'] = df['tweet'].str.lower()

from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

df

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [80]:
df = df.drop(['id', 'subtask_b', 'subtask_c'], axis=1)
df = df.rename(columns={'subtask_a': 'offensive'})
df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,OFF
1,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldve taken this piece of shit to a...,OFF
4,obama wanted liberals amp illegals to move int...,NOT
...,...,...
13235,sometimes i get strong vibes from people and t...,OFF
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT
13237,and why report this garbage we dont give a crap,OFF
13238,pussy,OFF


In [81]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

df['offensive'] = df['offensive'].apply(repl)

df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,1
1,go home you’re drunk maga trump 👊🇺🇸👊 url,1
2,amazon is investigating chinese employees who ...,0
3,someone shouldve taken this piece of shit to a...,1
4,obama wanted liberals amp illegals to move int...,0
...,...,...
13235,sometimes i get strong vibes from people and t...,1
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,0
13237,and why report this garbage we dont give a crap,1
13238,pussy,1


In [140]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['offensive'], stratify=df['offensive'], shuffle=0)

In [83]:
X_train

4883     i love women like you we believe all women exc...
3311                                          love it maga
1927     her family consist of  cheating husband who ne...
5817     those are old photos and in no way represent a...
7050                    because he is a scummy human being
                               ...                        
2474     yeahh tbh most that do are weirdos and think a...
919      believe the woman  what person would volunteer...
172            o k conservatives get to the polls vote red
12833    america  walk away get out and vote or live in...
8176                            antifa is hatred incarnate
Name: tweet, Length: 9930, dtype: object

In [84]:
y_train

4883     1
3311     0
1927     1
5817     0
7050     1
        ..
2474     1
919      1
172      0
12833    1
8176     0
Name: offensive, Length: 9930, dtype: int64

In [142]:
X_test

8316     i believe that melania trump is an abused woma...
4366     you  speak the truth about gt louis farrakhan ...
6482            brennan is a disgracewithout a clearance 😃
13177    americans make great clients but so many appra...
13117    sounds like he joined antifa  gov   of wa stat...
                               ...                        
8838                                       its antifa ford
9159     i guess i dont know when im talking to a proud...
4587     narrator it was at this moment antifa knew xhe...
12732                        because he support d roseanne
2319     there he is theirs the first baby growing so f...
Name: tweet, Length: 3310, dtype: object

In [143]:
# tokenize tweets using TweetTokenizer

#tweets_unlab = [tokenizer.tokenize(tw) for tw in X_train_unlab]
tweets = [tokenizer.tokenize(tw) for tw in X_train]
print(len(tweets_unlab))
test = [tokenizer.tokenize(tw) for tw in X_test]

20475


## Creating Doc2Vec

In [144]:
tagged = [TaggedDocument(d, [i]) for i, d in enumerate(tweets)]
tagged_unlab = [TaggedDocument(d, [i]) for i, d in enumerate(tweets_unlab)]
tagged_test = [TaggedDocument(d, [i]) for i, d in enumerate(test)]
tagged_unlab

[TaggedDocument(words=['it', 'is', 'false', 'she', 'is', 'lying', 'check', 'her', 'source'], tags=[0]),
 TaggedDocument(words=['youre', 'right', 'they', 'seemed', 'pretty', 'tuned', 'into', 'your', 'conspiracy', 'with', 'russia', 'maga', 'trump', 'lies'], tags=[1]),
 TaggedDocument(words=['we', 'are', 'here', 'for', 'q', 'and', 'the', 'revolution', 'do', 'not', 'allow', 'the', 'shiny', 'objects', 'to', 'divide', 'take', 'a', 'step', 'back', 'and', 'look', 'at', 'all', 'of', 'it', 'from', 'that', 'way', 'maga', 'we', 'the', 'people'], tags=[2]),
 TaggedDocument(words=['its', 'kind', 'of', 'like', 'when', 'a', 'mean', 'old', 'woman', 'says', 'that', 'a', 'pretty', 'young', 'one', 'is', 'ugly', 'and', 'untalented', 'just', 'because', 'she', 'is', 'washed', 'up', 'and', 'will', 'never', 'look', 'like', 'that', 'again', 'for', 'one', 'thing', 'its', 'catty', 'secondly', 'it', 'makes', 'her', 'look', 'like', 'a', 'jealous', 'hater', 'and', 'hasbeen', 'montel'], tags=[3]),
 TaggedDocument(wor

In [87]:
model = Doc2Vec(workers = 8, epochs = 20, dm=0)

In [88]:
model.build_vocab(tagged)

In [89]:
model.train(tagged, total_examples = model.corpus_count, epochs = model.epochs)

In [90]:
model.wv.most_similar('trump')

[('lived', 0.34128931164741516),
 ('pervert', 0.31978675723075867),
 ('true', 0.31052401661872864),
 ('suggest', 0.3036750853061676),
 ('delay', 0.3002278208732605),
 ('considered', 0.2941600978374481),
 ('wolf', 0.2919587194919586),
 ('classical', 0.2791879177093506),
 ('obamas', 0.2780962586402893),
 ('morals', 0.2726018726825714)]

In [91]:
vec = model['king'] - model['man'] + model['woman'] # doesn't get 'queen' because not enough data
model.wv.most_similar([vec])

[('woman', 0.5695428252220154),
 ('king', 0.5462932586669922),
 ('light', 0.3727681040763855),
 ('troll', 0.31497788429260254),
 ('perjury', 0.3022294044494629),
 ('sums', 0.29159799218177795),
 ('vice', 0.29002368450164795),
 ('safer', 0.2863222360610962),
 ('themselves', 0.2789230942726135),
 ('shoes', 0.2780170440673828)]

In [92]:
model1 = Doc2Vec(workers = 8, epochs = 20, dm=1)

In [93]:
model1.build_vocab(tagged)

In [94]:
model1.train(tagged, total_examples = model1.corpus_count, epochs = model1.epochs)

In [95]:
model1.wv.most_similar('trump')

[('donald', 0.6739652156829834),
 ('obama', 0.6615742444992065),
 ('hillary', 0.6608936786651611),
 ('mc', 0.6170716881752014),
 ('president', 0.6042699217796326),
 ('bernie', 0.5992317199707031),
 ('j', 0.5904044508934021),
 ('elected', 0.5849534273147583),
 ('mueller', 0.5835342407226562),
 ('supporters', 0.5763676166534424)]

In [96]:
vec = model1['king'] - model1['man'] + model1['woman'] # doesn't get 'queen' because not enough data
model1.wv.most_similar([vec])

[('ndp', 0.5246723294258118),
 ('each', 0.5226001143455505),
 ('media', 0.5207162499427795),
 ('ignored', 0.5067837834358215),
 ('politics', 0.4998101592063904),
 ('treated', 0.48297736048698425),
 ('asian', 0.47838225960731506),
 ('most', 0.46951931715011597),
 ('dream', 0.4674050807952881),
 ('supremacy', 0.46622225642204285)]

In [97]:
def get_vectors(model, input_docs):
    vectors = [model.infer_vector(doc.words) for doc in input_docs]
    return vectors

## SVM

In [98]:
X_train = get_vectors(model1, tagged)

In [99]:
clf = SVC()

In [100]:
clf.fit(X_train, y_train)

SVC()

In [101]:
X_test = get_vectors(model1, tagged_test)

In [102]:
clf.score(X_test, y_test)

0.6845921450151058

In [103]:
model1.save("doc2vec.model")

In [107]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10]}

In [108]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [109]:
grid_result = gsc.fit(X_train, y_train)

In [110]:
best_params = grid_result.best_params_
best_params

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [111]:
grid_result.best_score_

0.6819738167170192

In [112]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=1, gamma=1, probability=True)

## Loading and Processing Unlabelled Data



In [113]:
df_unlab = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\processed_unlab.csv')

In [114]:
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [115]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('@USER','')
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [116]:
df_unlab['tweet'] = df_unlab['tweet'].apply(hyperlink)
df_unlab['tweet'] = df_unlab['tweet'].apply(retweets)
df_unlab['tweet'] = df_unlab['tweet'].apply(split_hashtag)
df_unlab['tweet'] = df_unlab['tweet'].apply(join_words)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [117]:
df_unlab['tweet'] = df_unlab['tweet'].replace('\d+', '', regex=True)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [118]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df_unlab['tweet'] = df_unlab['tweet'].str.strip()
df_unlab['tweet'] = df_unlab['tweet'].str.lower()

  df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')


In [119]:
df_unlab = df_unlab.drop(['id', 'Unnamed: 0'], axis=1)
df_unlab

Unnamed: 0,tweet
0,quit ive heard knifecrime today
1,celebration emancipation day urge emancipate r...
2,’ literal dream come true win especially birth...
3,brilliant news read hoggy signed new contract ...
4,speaks truth 😌
...,...
10540,outright lie expose yet shameless liar
10541,okay im fuck nigga
10542,mean “ ” already know ’ massive fuck club
10543,nothing trump human normal unless also racist ...


## Adding Unlabelled Data to Labelled

In [122]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,it is false she is lying check her source
1,youre right they seemed pretty tuned into your...
2,we are here for q and the revolution do not a...
3,its kind of like when a mean old woman says th...
4,he is
...,...
9925,it is poisonous indeed no wonder she is called...
9926,cool
9927,i agree amp then some please go to work tha...
9928,chicago tribune london mayor sadiq khan and th...


In [123]:
X_train_unlab = X_train.append(df_unlab).reset_index().drop(['index'], axis=1)

In [124]:
X_train_unlab

Unnamed: 0,tweet
0,it is false she is lying check her source
1,youre right they seemed pretty tuned into your...
2,we are here for q and the revolution do not a...
3,its kind of like when a mean old woman says th...
4,he is
...,...
20470,outright lie expose yet shameless liar
20471,okay im fuck nigga
20472,mean “ ” already know ’ massive fuck club
20473,nothing trump human normal unless also racist ...


In [125]:
X_train_unlab = X_train_unlab['tweet']
X_train_unlab

0                it is false she is lying check her source
1        youre right they seemed pretty tuned into your...
2        we are here for q and the revolution  do not a...
3        its kind of like when a mean old woman says th...
4                                                    he is
                               ...                        
20470               outright lie expose yet shameless liar
20471                                   okay im fuck nigga
20472            mean “ ” already know ’ massive fuck club
20473    nothing trump human normal unless also racist ...
20474    rump everything destroy fabric country order b...
Name: tweet, Length: 20475, dtype: object

In [126]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
9925,0
9926,0
9927,0
9928,0


In [127]:
values = [-1 for i in range(len(df_unlab))]

y_unlab = pd.DataFrame({'offensive':values})

In [128]:
y_train_unlab = y_train.append(y_unlab)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
10540,-1
10541,-1
10542,-1
10543,-1


In [129]:
y_train_unlab = y_train_unlab.reset_index().drop(['index'], axis = 1)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
20470,-1
20471,-1
20472,-1
20473,-1


In [145]:
X_test = get_vectors(model1, tagged_test)

In [146]:
X_test

[array([-0.14803904,  0.2742911 ,  0.18235825,  0.13544647,  0.02641257,
         0.0620081 ,  0.19494906, -0.18821804,  0.19009009, -0.20280972,
        -0.08610356, -0.1341164 , -0.11015393,  0.04945436,  0.16651936,
        -0.02155951,  0.01805693,  0.13137104, -0.12066832,  0.16130051,
         0.20966285,  0.03511542,  0.1356108 ,  0.01680443,  0.05712146,
         0.14980547, -0.19967876, -0.05231167, -0.12520364, -0.1728943 ,
         0.17577869, -0.18194024,  0.09884585, -0.04771288, -0.02048315,
        -0.05610077,  0.05822486,  0.05105791, -0.16465917, -0.1030179 ,
         0.17571847, -0.22461064, -0.27737424,  0.02085344, -0.11536706,
         0.24270755,  0.03095067,  0.10629053,  0.01042075, -0.08016173,
        -0.0427218 , -0.00916884, -0.17018414, -0.10286093, -0.09533779,
        -0.01335115, -0.358412  ,  0.05917211, -0.42155927, -0.0725335 ,
         0.10167184, -0.06902937, -0.01254702,  0.00293783,  0.25611195,
        -0.3003811 , -0.4178305 , -0.25204635,  0.1

In [135]:
X_train_unlab = get_vectors(model1, tagged_unlab)

In [136]:
len(X_train_unlab)

20475

## Self Training

In [137]:
self_training_model = SelfTrainingClassifier(clf, max_iter=100, threshold=0.95)

In [138]:
self_training_model.fit(X_train_unlab, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=1, gamma=1, probability=True),
                       max_iter=100, threshold=0.95)

In [147]:
self_training_model.score(X_test, y_test)

0.7697885196374622