In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import regex as re
import nltk
from nltk.corpus import stopwords

In [None]:
# Mount Google Drive into the Colab runtime (Python API call, not a shell command)
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Location of the training CSV on the mounted drive.
# NOTE(review): the trailing tags (ElLI / JOnaS) presumably mark whose Drive
# layout each path matches -- confirm, and activate the one that applies.
filepath = '/content/drive/MyDrive/train.csv' # ElLI
#filepath = '/content/drive/MyDrive/datasets/ruddit/train.csv' # JOnaS

# Load the data
data = pd.read_csv(filepath)

Mounted at /content/drive


In [None]:
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preprocessing

In [None]:
# Anything that looks like an HTML/XML tag: '<', one or more non-'>' chars, '>'
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
  """Replace every HTML tag in `text` with a single space.

  A space (rather than an empty string) is substituted so that words separated
  only by a tag, e.g. 'foo<br>bar', are not glued together into one token.
  Runs of whitespace produced this way are left for the caller to collapse.

  Args:
    text: the string to clean.

  Returns:
    `text` with each tag replaced by ' '.
  """
  return TAG_RE.sub(' ', text)

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is used during preprocessing
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
_STOPWORD_RE = None  # compiled lazily on first use, then reused for every comment


def _stopword_pattern():
  """Return the compiled English stop-word regex, building it only once."""
  global _STOPWORD_RE
  if _STOPWORD_RE is None:
    escaped = (re.escape(word) for word in stopwords.words('english'))
    # whole-word match plus any whitespace that follows the stop word
    _STOPWORD_RE = re.compile(r'\b(?:' + r'|'.join(escaped) + r')\b\s*')
  return _STOPWORD_RE


def preprocess_text(input):
  """Normalize one raw comment into a space-separated string of content words.

  Steps, in order: lowercase, strip HTML tags (via remove_tags), replace every
  non-letter with a space, drop isolated single letters, collapse whitespace,
  remove English stop words, and trim the ends.

  Args:
    input: the raw comment text (a str).

  Returns:
    The cleaned text; may be an empty string if nothing survives.
  """
  # we want everything to be lowercase
  text = input.lower()

  # remove any html tags
  text = remove_tags(text)

  # remove punctuation and numbers; only letters and spaces remain afterwards
  text = re.sub('[^a-zA-Z]', ' ', text)

  # remove single characters: Mark's -> Mark
  # \b...\b consumes no whitespace, so it also catches a letter at the very
  # start/end of the text and runs of consecutive single letters ("a b c")
  text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

  # remove multiple spaces
  text = re.sub(r'\s+', ' ', text)

  # remove stopwords (pattern is compiled once, not once per comment)
  text = _stopword_pattern().sub('', text)

  return text.strip()

In [None]:
# Clean every comment with preprocess_text, keeping the original row order
sentences = data['comment_text'].tolist()
X = [preprocess_text(comment) for comment in sentences]

In [None]:
# Spot-check the first ten cleaned comments
X[:10]

['explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired',
 'aww matches background colour seemingly stuck thanks talk january utc',
 'hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 'make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport',
 'sir hero chance remember page',
 'congratulations well use tools well talk',
 'cocksucker piss around work',
 'vandalism matt shirvington article reverted please banned',
 'sorry word nonsense offensive anyway intending write anything a

In [None]:
# Whitespace-tokenize each cleaned comment into a list of words
tokenized_comments = list(map(str.split, X))

In [None]:
# Inspect the token list of the first comment
tokenized_comments[0]

['explanation',
 'edits',
 'made',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'reverted',
 'vandalisms',
 'closure',
 'gas',
 'voted',
 'new',
 'york',
 'dolls',
 'fac',
 'please',
 'remove',
 'template',
 'talk',
 'page',
 'since',
 'retired']

In [None]:
# NOTE(review): `nltk` is already imported in the first cell, and neither
# sent_tokenize/word_tokenize nor the 'punkt' data is used by the cells visible
# here (tokenization above uses str.split) -- confirm whether later cells need them.
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK 'punkt' tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Train word embeddings on the tokenized comments.
# NOTE(review): per the gensim docs, training with more than one worker thread is
# not bit-for-bit reproducible between runs -- similarity scores may vary slightly.
model = Word2Vec(
    sentences=tokenized_comments,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore words seen fewer than 5 times
    sg=1,             # 1 = skip-gram (0 would be CBOW)
    workers=4,        # training threads
)

In [None]:
# Save the trained model to disk; the relative path resolves against the current working directory
model.save("word2vec_test.model")

In [None]:
# Sanity-check the embeddings: the 5 vocabulary words most similar to 'black'
print(model.wv.most_similar('black', topn=5))

[('white', 0.7688474655151367), ('mulatto', 0.7153080701828003), ('caucasion', 0.6823104619979858), ('blond', 0.6624426245689392), ('brazilian', 0.6351351141929626)]


In [None]:
# The 10 vocabulary words most similar to 'white'
print(model.wv.most_similar('white', topn=10))

[('black', 0.7688474655151367), ('mulatto', 0.7109106779098511), ('caucasion', 0.683744490146637), ('straps', 0.6784337162971497), ('hispanic', 0.6606832146644592), ('hispanics', 0.6548726558685303), ('skinned', 0.6544691920280457), ('supremacist', 0.6496246457099915), ('supremacists', 0.6418265104293823), ('stripes', 0.6372109651565552)]


In [None]:
# The 10 vocabulary words most similar to 'gay'
print(model.wv.most_similar('gay', topn=10))

[('bunksteve', 0.7444447875022888), ('hellor', 0.716431200504303), ('cody', 0.6959086656570435), ('sup', 0.6888766884803772), ('dashiel', 0.6868166327476501), ('gays', 0.6672086715698242), ('cline', 0.6517469882965088), ('lesbian', 0.6433513164520264), ('lonnie', 0.6416216492652893), ('homosexual', 0.6337783336639404)]


In [None]:
# The 10 vocabulary words most similar to 'ally'
print(model.wv.most_similar('ally', topn=10))

[('iraqis', 0.8212122321128845), ('waged', 0.8116226196289062), ('peacekeeping', 0.8104318380355835), ('kla', 0.8038443922996521), ('georgians', 0.8034337759017944), ('invaded', 0.7955858707427979), ('ossetia', 0.7933374047279358), ('recognises', 0.7916944622993469), ('insurgency', 0.7901435494422913), ('trotsky', 0.7899577617645264)]


In [None]:
# The 10 vocabulary words most similar to 'antisemitism'
print(model.wv.most_similar('antisemitism', topn=10))

[('zionism', 0.7773802280426025), ('semitism', 0.7541790008544922), ('antisemtism', 0.7303206324577332), ('revisionism', 0.7146024107933044), ('extremism', 0.7122368812561035), ('eugenics', 0.7117438316345215), ('protestantism', 0.711726725101471), ('pederasty', 0.7064185738563538), ('gamergate', 0.7049989104270935), ('racism', 0.7041177749633789)]


## Utilizing the Google News Dataset