In [69]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.svm import SVC
import pandas as pd
import numpy

## Dataset

In [2]:
df = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\olid-training-v1.0.tsv', delimiter='\t')
df['tweet'] = df['tweet'].str.replace('@USER','')
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! #MAGA #Trump2020 👊🇺...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should'veTaken"" this piece of shit to...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


## Preprocessing and Cleaning

In [3]:
import re
def hyperlink(tweet):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

def retweets(tweet):
    return re.sub(r'^RT[\s]+', '', tweet)

def split_hashtag(tweet):
    return re.sub(r'#', '', tweet)

def join_words(tweet):
    return re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", tweet)

df['tweet'] = df['tweet'].apply(hyperlink)
df['tweet'] = df['tweet'].apply(retweets)
df['tweet'] = df['tweet'].apply(split_hashtag)
df['tweet'] = df['tweet'].apply(join_words)
df

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,She should ask a few native Americans what th...,OFF,UNT,
1,90194,Go home you’re drunk!!! MAGA Trump2020 👊🇺🇸👊...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"Someone should've Taken"" this piece of shit t...",OFF,UNT,
4,43605,Obama wanted liberals &amp; illegals to move...,NOT,,
...,...,...,...,...,...
13235,95338,Sometimes I get strong vibes from people and ...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT,,
13237,82921,And why report this garbage. We don't give a...,OFF,TIN,OTH
13238,27429,Pussy,OFF,UNT,


In [4]:
df['tweet'] = df['tweet'].replace('\d+', '', regex=True)

In [5]:
import string

df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df['tweet'] = df['tweet'].str.strip()
df['tweet'] = df['tweet'].str.lower()

from nltk.corpus import stopwords  
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

df

  df['tweet'] = df['tweet'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,she should ask a few native americans what the...,OFF,UNT,
1,90194,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF,TIN,IND
2,16820,amazon is investigating chinese employees who ...,NOT,,
3,62688,someone shouldve taken this piece of shit to a...,OFF,UNT,
4,43605,obama wanted liberals amp illegals to move int...,NOT,,
...,...,...,...,...,...
13235,95338,sometimes i get strong vibes from people and t...,OFF,TIN,IND
13236,67210,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT,,
13237,82921,and why report this garbage we dont give a crap,OFF,TIN,OTH
13238,27429,pussy,OFF,UNT,


In [6]:
df = df.drop(['id', 'subtask_b', 'subtask_c'], axis=1)
df = df.rename(columns={'subtask_a': 'offensive'})
df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,OFF
1,go home you’re drunk maga trump 👊🇺🇸👊 url,OFF
2,amazon is investigating chinese employees who ...,NOT
3,someone shouldve taken this piece of shit to a...,OFF
4,obama wanted liberals amp illegals to move int...,NOT
...,...,...
13235,sometimes i get strong vibes from people and t...,OFF
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,NOT
13237,and why report this garbage we dont give a crap,OFF
13238,pussy,OFF


In [7]:
def repl(off):
    if off == 'OFF':
        return 1
    return 0

df['offensive'] = df['offensive'].apply(repl)

df

Unnamed: 0,tweet,offensive
0,she should ask a few native americans what the...,1
1,go home you’re drunk maga trump 👊🇺🇸👊 url,1
2,amazon is investigating chinese employees who ...,0
3,someone shouldve taken this piece of shit to a...,1
4,obama wanted liberals amp illegals to move int...,0
...,...,...
13235,sometimes i get strong vibes from people and t...,1
13236,benidorm ✅ creamfields ✅ maga ✅ not too sh...,0
13237,and why report this garbage we dont give a crap,1
13238,pussy,1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['offensive'], stratify=df['offensive'], shuffle=0)

In [9]:
X_train

7359     we would rather keep bieber than trudeau you a...
893                 she is so beautiful but looks like dad
7346     suddenly bozo obama is not politicianamp certa...
3921                   he tihnks he is telling the big lie
12070                                     go home has been
                               ...                        
5290     he is phony and opportunist look at the millio...
6675     you can’t pretend to care about women when onl...
7138     i was waiting on him to say the hokey pokey an...
7420     i knew we would find out that this is all the ...
10396    dave has every right to be on the exact wrong ...
Name: tweet, Length: 9930, dtype: object

In [10]:
y_train

7359     0
893      0
7346     0
3921     1
12070    0
        ..
5290     1
6675     0
7138     0
7420     1
10396    0
Name: offensive, Length: 9930, dtype: int64

In [11]:
X_test

9864    i hope he is found  best wishes to you and you...
8184    you probably need more gun control   and more ...
3263    democrats ask yourself is your party fighting ...
954     to be fair i’m positive most liberals are just...
1099    and what plan might that be  the discredited c...
                              ...                        
9979    you amp your husband are doubletalking hypocri...
996     follow along  conservatives paid fusion  fusio...
4921          richard cephalic is what he is its clinical
3342            how is the gun control working in chicago
8                                       buy more icecream
Name: tweet, Length: 3310, dtype: object

In [88]:
# tokenize tweets using TweetTokenizer


tweets = [tokenizer.tokenize(tw) for tw in X_train]
test = [tokenizer.tokenize(tw) for tw in X_test]
print(len(tweets), len(test))

9930 3310


## Creating Doc2Vec

In [89]:
tagged = [TaggedDocument(d, [i]) for i, d in enumerate(tweets)]

tagged_test = [TaggedDocument(d, [i]) for i, d in enumerate(test)]

In [16]:
model = Doc2Vec(workers = 8, epochs = 20, dm=0)

In [17]:
model.build_vocab(tagged)

In [18]:
model.train(tagged, total_examples = model.corpus_count, epochs = model.epochs)

In [19]:
model.wv.most_similar('trump')

[('um', 0.3347780406475067),
 ('deny', 0.3136875331401825),
 ('enough', 0.3112700879573822),
 ('dam', 0.30635616183280945),
 ('lied', 0.3054511547088623),
 ('work', 0.30496901273727417),
 ('remind', 0.2992699146270752),
 ('characters', 0.29458025097846985),
 ('penalty', 0.2889249324798584),
 ('senators', 0.2822878360748291)]

In [20]:
vec = model['king'] - model['man'] + model['woman'] # doesn't get 'queen' because not enough data
model.wv.most_similar([vec])

[('woman', 0.5489143133163452),
 ('king', 0.4822070002555847),
 ('quickly', 0.3472941219806671),
 ('bloody', 0.31743764877319336),
 ('courage', 0.289979487657547),
 ('‼', 0.2869696617126465),
 ('havent', 0.28184399008750916),
 ('loved', 0.27821871638298035),
 ('🌪', 0.2762487530708313),
 ('sociopath', 0.27594640851020813)]

In [21]:
model1 = Doc2Vec(workers = 8, epochs = 20, dm=1)

In [22]:
model1.build_vocab(tagged)

In [23]:
model1.train(tagged, total_examples = model1.corpus_count, epochs = model1.epochs)

In [24]:
model1.wv.most_similar('trump')

[('donald', 0.6343300342559814),
 ('mueller', 0.6028581261634827),
 ('president', 0.5847514867782593),
 ('judge', 0.5812162160873413),
 ('potus', 0.5772563815116882),
 ('immigrants', 0.5625531673431396),
 ('kavanaugh', 0.5555129647254944),
 ('mc', 0.5431937575340271),
 ('supporters', 0.5372295379638672),
 ('ford', 0.5336373448371887)]

In [25]:
vec = model1['king'] - model1['man'] + model1['woman'] # doesn't get 'queen' because not enough data
model1.wv.most_similar([vec])

[('lower', 0.6227105259895325),
 ('social', 0.5921221375465393),
 ('average', 0.5646170973777771),
 ('biggest', 0.5583135485649109),
 ('promised', 0.5580021739006042),
 ('ideas', 0.5576486587524414),
 ('threats', 0.549805760383606),
 ('planned', 0.5478804707527161),
 ('accounts', 0.5438938736915588),
 ('woman', 0.5417291522026062)]

In [26]:
def get_vectors(model, input_docs):
    vectors = [model.infer_vector(doc.words) for doc in input_docs]
    return vectors

## SVM

In [27]:
X_train = get_vectors(model1, tagged)

In [28]:
clf = SVC()

In [29]:
clf.fit(X_train, y_train)

SVC()

In [30]:
X_test = get_vectors(model1, tagged_test)

In [31]:
clf.score(X_test, y_test)

0.6888217522658611

In [32]:
model1.save("doc2vec.model")

In [33]:
params = {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': [0.05, 0.01, 0.1, 1, 5, 10, 100], 'gamma': [0.01, 0.1, 1, 5, 10]}

In [34]:
gsc = GridSearchCV(clf, param_grid=params, n_jobs=-1)

In [35]:
grid_result = gsc.fit(X_train, y_train)

In [36]:
best_params = grid_result.best_params_
best_params

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [37]:
grid_result.best_score_

0.6851963746223564

In [38]:
clf = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], probability = True)
clf

SVC(C=1, gamma=1, probability=True)

## Loading and Processing Unlabelled Data



In [39]:
df_unlab = pd.read_csv(r'C:\Users\suhas\Documents\College Projects\SSL-Offensive-Lang-Detection-Social-Media\processed_unlab.csv')

In [40]:
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [41]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('@USER','')
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [42]:
df_unlab['tweet'] = df_unlab['tweet'].apply(hyperlink)
df_unlab['tweet'] = df_unlab['tweet'].apply(retweets)
df_unlab['tweet'] = df_unlab['tweet'].apply(split_hashtag)
df_unlab['tweet'] = df_unlab['tweet'].apply(join_words)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [43]:
df_unlab['tweet'] = df_unlab['tweet'].replace('\d+', '', regex=True)
df_unlab

Unnamed: 0.1,Unnamed: 0,id,tweet
0,0,B0,"['quit', 'ive', 'heard', 'knifecrime', 'today']"
1,1,B1,"['celebration', 'emancipation', 'day', 'urge',..."
2,2,B2,"['’', 'literal', 'dream', 'come', 'true', 'win..."
3,3,B3,"['brilliant', 'news', 'read', 'hoggy', 'signed..."
4,4,B4,"['speaks', 'truth', '😌']"
...,...,...,...
10540,10540,BC2096,"['outright', 'lie', 'expose', 'yet', 'shameles..."
10541,10541,BC2097,"['okay', 'im', 'fuck', 'nigga']"
10542,10542,BC2100,"['mean', '“', '”', 'already', 'know', '’', 'ma..."
10543,10543,BC2101,"['nothing', 'trump', 'human', 'normal', 'unles..."


In [44]:
df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')
df_unlab['tweet'] = df_unlab['tweet'].str.strip()
df_unlab['tweet'] = df_unlab['tweet'].str.lower()

  df_unlab['tweet'] = df_unlab['tweet'].str.replace('[{}]'.format(string.punctuation), '')


In [45]:
df_unlab = df_unlab.drop(['id', 'Unnamed: 0'], axis=1)
df_unlab

Unnamed: 0,tweet
0,quit ive heard knifecrime today
1,celebration emancipation day urge emancipate r...
2,’ literal dream come true win especially birth...
3,brilliant news read hoggy signed new contract ...
4,speaks truth 😌
...,...
10540,outright lie expose yet shameless liar
10541,okay im fuck nigga
10542,mean “ ” already know ’ massive fuck club
10543,nothing trump human normal unless also racist ...


## Adding Unlabelled Data to Labelled

In [87]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['offensive'], stratify=df['offensive'], shuffle=0)

In [48]:
X_train = X_train.reset_index().drop(['index'], axis=1)
X_train

Unnamed: 0,tweet
0,hey one of your antifa bros was on here today ...
1,and when logic fails you arent a twitterer lik...
2,she is one big embarrassment to the american p...
3,worst place ive experienced that is the fuckin...
4,teach the children well maga we the people...
...,...
9925,happy birthday mrs aggers hope you are having...
9926,this is the kind of healthcare provision relig...
9927,every day her iq gets lower and lower more wan...
9928,so will this all be hidden like the liberals h...


In [49]:
X_train_unlab = X_train.append(df_unlab).reset_index().drop(['index'], axis=1)

In [50]:
X_train_unlab

Unnamed: 0,tweet
0,hey one of your antifa bros was on here today ...
1,and when logic fails you arent a twitterer lik...
2,she is one big embarrassment to the american p...
3,worst place ive experienced that is the fuckin...
4,teach the children well maga we the people...
...,...
20470,outright lie expose yet shameless liar
20471,okay im fuck nigga
20472,mean “ ” already know ’ massive fuck club
20473,nothing trump human normal unless also racist ...


In [51]:
X_train_unlab = X_train_unlab['tweet']
X_train_unlab

0        hey one of your antifa bros was on here today ...
1        and when logic fails you arent a twitterer lik...
2        she is one big embarrassment to the american p...
3        worst place ive experienced that is the fuckin...
4        teach the children well     maga we the people...
                               ...                        
20470               outright lie expose yet shameless liar
20471                                   okay im fuck nigga
20472            mean “ ” already know ’ massive fuck club
20473    nothing trump human normal unless also racist ...
20474    rump everything destroy fabric country order b...
Name: tweet, Length: 20475, dtype: object

In [52]:
y_train = y_train.reset_index().drop(['index'], axis=1)
y_train

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
9925,0
9926,0
9927,1
9928,0


In [53]:
values = [-1 for i in range(len(df_unlab))]

y_unlab = pd.DataFrame({'offensive':values})

In [54]:
y_train_unlab = y_train.append(y_unlab)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
10540,-1
10541,-1
10542,-1
10543,-1


In [55]:
y_train_unlab = y_train_unlab.reset_index().drop(['index'], axis = 1)
y_train_unlab

Unnamed: 0,offensive
0,0
1,0
2,0
3,1
4,0
...,...
20470,-1
20471,-1
20472,-1
20473,-1


In [56]:
tweets_unlab = [tokenizer.tokenize(tw) for tw in X_train_unlab]
tagged_unlab = [TaggedDocument(d, [i]) for i, d in enumerate(tweets_unlab)]

In [90]:
X_test = get_vectors(model1, tagged_test)

In [58]:
X_train_unlab = get_vectors(model1, tagged_unlab)

In [59]:
len(X_train_unlab)

20475

## Self Training

In [60]:
self_training_model = SelfTrainingClassifier(clf, max_iter=100, threshold=0.95)

In [61]:
self_training_model.fit(X_train_unlab, y_train_unlab)

  return f(*args, **kwargs)


SelfTrainingClassifier(base_estimator=SVC(C=1, gamma=1, probability=True),
                       max_iter=100, threshold=0.95)

In [91]:
self_training_model.score(X_test, y_test)

0.7688821752265861

## Label Propagation

In [77]:
label_prop_model = LabelPropagation(n_jobs = -1)

In [78]:
label_prop_model.fit(X_train_unlab, y_train_unlab)

  return f(*args, **kwargs)
  self.label_distributions_ /= normalizer


LabelPropagation(n_jobs=-1)

In [92]:
label_prop_model.score(X_test, y_test)

0.6676737160120846