# Trigram

In [76]:
from collections import Counter
import re
from typing import List

import nltk
import numpy as np
import pandas as pd

In [77]:
class TrigramTCG():
    '''
    Trigram (second-order Markov chain) comment generator.

    The model is a nested mapping ``self._trigrams[a][b][c]`` holding the
    probability that token ``c`` follows the token pair ``(a, b)`` in the
    training comments. Every comment is padded with two start markers and two
    end markers before the trigrams are counted.
    '''

    # Padding markers placed around every tokenized comment.
    _START2 = '<<START2>>'
    _START1 = '<<START1>>'
    _END1 = '<<END1>>'
    _END2 = '<<END2>>'

    def __init__(self,
                 path:str='./toxic/train.csv',
                 pos_flags:List[str]=['toxic'],
                 neg_flags:List[str]=['identity_hate']):
        '''
        Load the comments from a CSV file, filter them by flag and fit the trigram table.

        path: CSV file with a ``comment_text`` column and one column per flag
        pos_flags: Only use comments where pos_flags are 1
        neg_flags: Only use comments where neg_flags are 0
        Flags can be 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
        '''
        df = pd.read_csv(path)
        X = df[np.all(df[pos_flags]==1, 1)&np.all(df[neg_flags]==0, 1)].comment_text
        # Build the tokenizer once instead of once per comment: it is
        # loop-invariant, so re-creating it inside the lambda is wasted work.
        tokenizer = nltk.tokenize.ToktokTokenizer()
        X = X.apply(lambda x: tokenizer.tokenize(x.lower()))

        self._convert_text(X)

    def _convert_text(self, X):
        '''
        Build ``self._trigrams`` from tokenized comments.

        X: object exposing ``.values`` that iterates over lists of tokens
           (one list per comment), e.g. a pandas Series of token lists.
        '''
        self._trigrams = dict()

        # Pass 1: count how often token c follows the pair (a, b).
        for s in X.values:
            s = [self._START2, self._START1] + s + [self._END1, self._END2]
            for a, b, c in zip(s, s[1:], s[2:]):
                self._trigrams.setdefault(a, dict()).setdefault(b, Counter())[c] += 1

        # Pass 2: turn the raw counts of every (a, b) context into probabilities.
        for followers in self._trigrams.values():
            for counts in followers.values():
                total = sum(counts.values())
                # Only existing keys are reassigned, so iterating while updating is safe.
                for token in counts:
                    counts[token] /= total

    def _format_comment(self, comment:str):
        '''
        Undo tokenizer spacing: drop the space before closing punctuation and
        the space that follows an apostrophe.
        '''
        comment = re.sub(r" (?=[.\"'?!,)-])", "", comment)
        comment = re.sub(r"(?<=[']) ", "", comment)

        return comment

    def _sample_next(self, a, b):
        '''Draw the token following the pair (a, b) according to its fitted probabilities.'''
        distribution = self._trigrams[a][b]
        return np.random.choice(list(distribution.keys()), p=list(distribution.values()))

    def generate_comment(self, random_start:bool=False):
        '''
        Sample one comment from the trigram model.

        random_start: if False, the chain starts from the two start markers, so
            the comment begins like a training comment. If True, the initial
            pair (a, b) is drawn uniformly from the table keys (re-drawing
            while b is the end marker); the pair itself is not part of the
            output, only the tokens sampled after it.

        Returns the generated text after whitespace clean-up.
        Raises ValueError when the model holds no real tokens, because no
        non-empty comment could ever be produced (the retry loop below would
        otherwise never terminate, or fail with an unrelated KeyError).
        '''
        markers = (self._START2, self._START1)
        if all(token in markers for token in self._trigrams):
            raise ValueError('The trigram model contains no tokens; cannot generate a comment.')

        comment = []
        # Retry until at least one token is produced (a chain may end immediately).
        while comment == []:
            if not random_start:
                a, b = self._START2, self._START1
            else:
                a, b = "", self._END1
                while b == self._END1:
                    a = np.random.choice(list(self._trigrams.keys()))
                    b = np.random.choice(list(self._trigrams[a].keys()))

            c = self._sample_next(a, b)

            while c != self._END1:
                comment.append(c)
                # Slide the two-token context window forward by one token.
                a = b
                b = c
                c = self._sample_next(a, b)
        comment = ' '.join(comment)

        return self._format_comment(comment)

In [78]:
# Fit the trigram generator with its default arguments (default CSV path and flag filters).
tcg = TrigramTCG()

In [96]:
# Sample one comment, starting the chain from a randomly drawn token pair
# instead of the sentence-start markers.
tcg.generate_comment(random_start=True)

'is about a battle by personal attack on you to make disappear any kind of horse-shit-for-brains dumbass would write on my dickle you can fuck my momma'

# LSTM

In [1]:
import sys

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd

Using TensorFlow backend.


In [2]:
# Read the labelled comments and keep only the text of rows flagged as toxic.
df = pd.read_csv("./toxic/train.csv")
toxic_comments = df.loc[df.toxic == 1, "comment_text"]
# The character-level corpus is every toxic comment, lower-cased and joined by single spaces.
training = ' '.join(toxic_comments.str.lower().values)

# Characters that are not given their own vocabulary entry: the statement that
# follows this set replaces every occurrence of them in `training` with one
# shared placeholder character.
ignore_chars = {'<', '^', '`', '\x93', '\x94', '¢', '£', '¤', '¦', '§', '¨', '©',
               '\xad', '®', '¯', '°', '±', '²', '´', '·', '¸', '½', '¿', 'ß', 'à',
               'á', 'ä', 'å', 'æ', 'ç', 'è', 'ê', 'í', 'ï', 'ñ', 'ó', 'ö', 'ù',
               'ú', 'ü', 'þ', 'ą', 'ć', 'đ', 'ė', 'ě', 'ģ', 'ĥ', 'ħ', 'ı', 'ń',
               'ņ', 'ō', 'œ', 'ś', 'ş', 'š', 'ţ', 'ũ', 'ŵ', 'ŷ', 'ż', 'ƒ', 'ǔ',
               'ȳ', '̇', 'ά', 'ί', 'α', 'γ', 'δ', 'ε', 'η', 'θ', 'ι', 'κ', 'λ',
               'μ', 'ν', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'ω', 'ό', 'ύ',
               'ώ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'и', 'й', 'к', 'л', 'м',
               'н', 'о', 'п', 'р', 'с', 'т', 'у', 'х', 'ц', 'ч', 'щ', 'ъ', 'ы',
               'ь', 'я', 'љ', 'ּ', 'א', 'ב', 'ו', 'י', 'כ', 'ל', 'מ', 'ا', 'ت',
               'س', 'ط', 'ع', 'ف', 'ك', 'ل', 'م', 'ن', 'و', 'ي', 'چ', 'ڜ', 'ڬ',
               'ڰ', 'ڵ', '\u06dd', '۞', '۬', '۵', '۸', 'ۻ', '۾', 'ݓ', 'ݗ', 'ݜ',
               'ݟ', 'ݡ', 'ݣ', 'ݭ', 'ක', 'ත', 'ඳ', 'ර', 'ව', '්', 'ු', 'ᛏ', 'ᵽ',
               'ḟ', 'ḻ', 'ṃ', 'ṗ', 'ṣ', 'ṯ', '–', '‘', '“', '”', '„', '†', '•',
               '…', '\u2060', '₡', '₨', '₩', '₪', '€', '₭', '₳', '₵', '№', '™',
               'ℳ', '⅞', '←', '↑', '→', '↔', '↨', '⇒', '⇔', '∂', '∆', '∇', '−',
               '√', '∞', '∫', '≈', '≠', '≤', '⊕', '─', '╟', '╢', '╦', '►', '◄',
               '★', '☎', '☏', '☥', '☭', '☺', '☻', '☼', '♠', '♣', '♥', '♦', '♪',
               '♫', '✄', '✉', '✋', '✍', '✎', '✽', '❝', '❞', '➨', '⟲', 'ツ', '妈',
               '学', '影', '惑', '武', '永', '烂', '的', '絡', '者', '臭', '見', '訣', '迷',
               '連', '\ufeff', '．', 'ａ', 'ｃ', 'ｋ', 'ｌ', 'ｍ', 'ｎ', 'ｏ', 'ｔ', 'ｗ',
               '🏼', '👍', '💩', '😂', '😄', '😊'}

# Collapse every ignored character into the single placeholder U+2600 so the
# vocabulary stays small.
training = ''.join('\u2600' if ch in ignore_chars else ch for ch in training)

# Vocabulary: the sorted distinct characters, with lookup tables in both directions.
chars = sorted(set(training))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))

n_chars, n_vocab = len(training), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  4530787
Total Vocab:  73


In [3]:
# Length (in characters) of the context window fed to the network.
seq_length = 10
dataX = []
dataY = []
# Slide a window over the corpus: each run of `seq_length` characters is an
# input, and the character right after it is the target.
for i in range(0, n_chars - seq_length, 1):
    seq_in = training[i:i + seq_length]
    seq_out = training[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# Shape the inputs as (samples, time steps, features) and scale the integer
# codes by the vocabulary size.
X = np.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(n_vocab)
# Pin the one-hot width to the vocabulary size. Without `num_classes`,
# `to_categorical` infers it from the largest label present in `dataY`, which
# would silently shrink the output layer if the highest-indexed character never
# occurs as a target, and desynchronise predictions from `int_to_char`.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
# One softmax output per vocabulary entry.
model.add(Dense(n_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

Total Patterns:  4530777


In [6]:
# Restore previously saved weights into the model defined above.
# NOTE(review): presumably this checkpoint was written by a training cell that
# is not shown in this notebook — confirm the file exists before running.
filename = "./models/weights-improvement-02-2.2415.hdf5"
model.load_weights(filename)
# Compile again after loading the weights (same loss and optimizer as at model definition).
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [7]:
# Pick a random training window as the seed. `np.random.randint` excludes its
# upper bound, so passing `len(dataX)` makes every window (including the last
# one) eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy: the seed must not alias `dataX[start]`, otherwise extending
# the pattern would modify the training data in place.
pattern = list(dataX[start])
out = ''
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # Same shaping and scaling as the training inputs: (1, time steps, 1), divided by vocab size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    out += result
    # Slide the window: drop the oldest character and add the new one,
    # building a fresh list rather than mutating the current one.
    pattern = pattern[1:] + [index]

print(out)

Seed:
" elete my c "
amt dad mo the aasie toe toat and ia to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe

In [8]:
# Draw a fresh random seed window. The upper bound of `np.random.randint` is
# exclusive, so `len(dataX)` lets the last window be chosen too.
start = np.random.randint(0, len(dataX))
# Copy the window so later edits to `pattern` can never touch `dataX`.
pattern = list(dataX[start])

In [12]:
# Generate 100 characters greedily, printing each one as soon as it is predicted.
for _ in range(100):
    # Shape the current window as (1, time steps, 1) and scale it by the vocabulary size.
    net_input = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
    prediction = model.predict(net_input).argmax()
    pred_char = int_to_char[prediction]
    print(pred_char, end='')
    # Advance the window by one character.
    pattern = pattern[1:] + [prediction]

toe toee and the aatie to toe toee and the aatie to toe toee and the aatie to toe toee and the aatie