In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import os

In [41]:
path = os.path.join('datasets', 'train.csv')

with open(path, 'r') as file: 
    stop = 0
    data = pd.read_csv(file, delimiter=',')

In [42]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [43]:
print(data.columns.tolist())

['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [46]:
data['comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

## Preprocessing

In [22]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
  """Removes HTML tags by replacing everywthing inside html tags with an empty space"""
  return TAG_RE.sub('', text)

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def preprocess_text(input):
  # we want everything to be lowercase
  input = input.lower()

  # remove any html tags
  input = remove_tags(input)

  # remove punctuations and numbers
  input = re.sub('[^a-zA-Z]', ' ', input)

  # remove single characters: Mark's -> Mark
  input = re.sub(r'\s+[a-zA-Z]\s', ' ', input)

  # remove multiple spaces
  input = re.sub(r'\s+', ' ', input)

  # remove stopwords
  pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  input = pattern.sub('', input)

  return input.strip()

In [47]:
# Applying the preprocessing function
X = []
sentences = list(data['comment_text'])
for s in sentences:
  X.append(preprocess_text(s))

In [59]:
X[:10]

['explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired',
 'aww matches background colour seemingly stuck thanks talk january utc',
 'hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 'make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport',
 'sir hero chance remember page',
 'congratulations well use tools well talk',
 'cocksucker piss around work',
 'vandalism matt shirvington article reverted please banned',
 'sorry word nonsense offensive anyway intending write anything a

In [49]:
tokenized_comments = [comment.split() for comment in X]

In [50]:
tokenized_comments

[['explanation',
  'edits',
  'made',
  'username',
  'hardcore',
  'metallica',
  'fan',
  'reverted',
  'vandalisms',
  'closure',
  'gas',
  'voted',
  'new',
  'york',
  'dolls',
  'fac',
  'please',
  'remove',
  'template',
  'talk',
  'page',
  'since',
  'retired'],
 ['aww',
  'matches',
  'background',
  'colour',
  'seemingly',
  'stuck',
  'thanks',
  'talk',
  'january',
  'utc'],
 ['hey',
  'man',
  'really',
  'trying',
  'edit',
  'war',
  'guy',
  'constantly',
  'removing',
  'relevant',
  'information',
  'talking',
  'edits',
  'instead',
  'talk',
  'page',
  'seems',
  'care',
  'formatting',
  'actual',
  'info'],
 ['make',
  'real',
  'suggestions',
  'improvement',
  'wondered',
  'section',
  'statistics',
  'later',
  'subsection',
  'types',
  'accidents',
  'think',
  'references',
  'may',
  'need',
  'tidying',
  'exact',
  'format',
  'ie',
  'date',
  'format',
  'etc',
  'later',
  'one',
  'else',
  'first',
  'preferences',
  'formatting',
  'style',


In [2]:
from gensim.models import Word2Vec
import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
model = Word2Vec(sentences=tokenized_comments, vector_size=100, window=5, min_count=5, sg=1, workers=4)

In [53]:
model.save("word2vec_test.model")

In [54]:
# Use the model to find similar words
print(model.wv.most_similar('black', topn=5))

[('white', 0.7524628043174744), ('caucasion', 0.664802610874176), ('mulatto', 0.6534956693649292), ('blond', 0.6479047536849976), ('straps', 0.6444939970970154)]


In [55]:
# Use the model to find similar words
print(model.wv.most_similar('white', topn=10))

[('black', 0.7524628639221191), ('straps', 0.7076944708824158), ('caucasion', 0.6970655918121338), ('mulatto', 0.6843364238739014), ('hispanics', 0.6822941899299622), ('skinned', 0.6678436398506165), ('supremacist', 0.6623958349227905), ('whites', 0.6612074375152588), ('whiter', 0.6587696671485901), ('blond', 0.6578318476676941)]


In [56]:
# Use the model to find similar words
print(model.wv.most_similar('gay', topn=10))

[('bunksteve', 0.7634089589118958), ('hellor', 0.7328298091888428), ('dashiel', 0.724945068359375), ('cody', 0.7227120995521545), ('lonnie', 0.676784873008728), ('sup', 0.6703212261199951), ('homosexual', 0.6700049042701721), ('kool', 0.6648108959197998), ('fattyjwoods', 0.6589751839637756), ('cline', 0.6388182044029236)]


In [57]:
# Use the model to find similar words
print(model.wv.most_similar('ally', topn=10))

[('waged', 0.8192621469497681), ('invaded', 0.8186164498329163), ('iraqis', 0.804930567741394), ('insurgency', 0.8047081828117371), ('georgians', 0.801575779914856), ('rhodesian', 0.7929945588111877), ('bolsheviks', 0.7891033291816711), ('afganistan', 0.7883941531181335), ('capitulation', 0.7879604697227478), ('waging', 0.7858719229698181)]


In [58]:
# Use the model to find similar words
print(model.wv.most_similar('antisemitism', topn=10))

[('zionism', 0.7802422642707825), ('antisemtism', 0.7637010216712952), ('semitism', 0.7418227791786194), ('jfj', 0.7286948561668396), ('sentiment', 0.7270519733428955), ('antisemites', 0.7229663133621216), ('mouthpiece', 0.7185529470443726), ('pederasty', 0.7165085673332214), ('virulent', 0.7142098546028137), ('protestantism', 0.7137781381607056)]


## Utilizing the Google News Dataset

In [6]:
import gensim.downloader as api
glove_vectors = api.load("glove-twitter-25")



In [12]:
# Assume 'glove_vectors' is your loaded GloVe model
word = 'ally'  # The word you want to find similar words for
n = 10  # The number of most similar words you want

# Find the n most similar words to the specified word
most_similar_words = glove_vectors.most_similar(word, topn=n)

# Print the most similar words and their similarity scores
for similar_word, similarity in most_similar_words:
    print(f"{similar_word}: {similarity}")

beth: 0.9089250564575195
rebecca: 0.8935709595680237
biebs: 0.8865821361541748
paige: 0.8861190676689148
cory: 0.8814960718154907
becky: 0.8803718686103821
dinah: 0.8781067728996277
chloe: 0.8779026865959167
lauren: 0.8775742053985596
emily: 0.8770910501480103
