In [60]:
from collections import Counter
import re

import nltk
import numpy as np
import pandas as pd

In [72]:
class TrigramTCG():
    '''
    Trigram (second-order Markov chain) text generator trained on the
    `comment_text` column of a CSV file.

    The model is stored in `self._trigrams` as a nested mapping
    `first word -> second word -> Counter(third word -> probability)`.
    Every comment is padded with two start and two end sentinel tokens so
    that sentence beginnings and endings are learned like ordinary words.
    '''

    # Sentinel tokens wrapped around every tokenized comment.
    _START2 = '<<START2>>'
    _START1 = '<<START1>>'
    _END1 = '<<END1>>'
    _END2 = '<<END2>>'

    def __init__(self, path='./toxic/train.csv', pos_flags=('toxic',), neg_flags=('identity_hate',)):
        '''
        Read the CSV at `path`, keep the selected comments and fit the model.

        path: CSV file with a `comment_text` column and one column per flag
        pos_flags: Only use comments where pos_flags are 1
        neg_flags: Only use comments where neg_flags are 0
        Flags can be 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
        '''
        # pandas treats a tuple as a single column key, so always index with lists.
        pos_flags, neg_flags = list(pos_flags), list(neg_flags)

        df = pd.read_csv(path)
        keep = (df[pos_flags] == 1).all(axis=1) & (df[neg_flags] == 0).all(axis=1)

        # Build the tokenizer once instead of once per row.
        tokenizer = nltk.tokenize.ToktokTokenizer()
        X = df.loc[keep, 'comment_text'].apply(lambda text: tokenizer.tokenize(text.lower()))

        self._convert_text(X)

    def _convert_text(self, X):
        '''
        Fit `self._trigrams` from `X`, an iterable of token sequences.

        Counts every consecutive (a, b, c) triple of the padded sequences and
        then normalizes each Counter so that its values sum to 1.
        '''
        trigrams = dict()

        for tokens in X:
            padded = [self._START2, self._START1] + list(tokens) + [self._END1, self._END2]
            for a, b, c in zip(padded, padded[1:], padded[2:]):
                # A missing Counter entry counts as 0, so += 1 also handles new triples.
                trigrams.setdefault(a, dict()).setdefault(b, Counter())[c] += 1

        # Turn raw counts into conditional probabilities P(c | a, b).
        for by_second in trigrams.values():
            for followers in by_second.values():
                total = sum(followers.values())
                for word in followers:
                    followers[word] /= total

        self._trigrams = trigrams

    def _format_comment(self, comment):
        '''
        Undo tokenizer spacing: drop a space that precedes one of
        . " ' ? ! , ) - and a space that follows an apostrophe.
        '''
        comment = re.sub(r" (?=[.\"'?!,)-])", "", comment)
        comment = re.sub(r"(?<=[']) ", "", comment)

        return comment

    def _next_word(self, a, b, rng):
        '''Sample the word following the context (a, b) from its learned distribution.'''
        followers = self._trigrams[a][b]
        return rng.choice(list(followers.keys()), p=list(followers.values()))

    def _random_context(self, rng):
        '''
        Pick a uniformly random first word, then a uniformly random second
        word seen after it, retrying while the second word is the end marker.
        '''
        first_words = list(self._trigrams.keys())
        a, b = "", self._END1
        while b == self._END1:
            a = rng.choice(first_words)
            b = rng.choice(list(self._trigrams[a].keys()))
        return a, b

    def generate_comment(self, random_start=False, rng=None):
        '''
        Generate one non-empty comment by walking the trigram chain until the
        end marker is drawn.

        random_start: If False the walk starts at the beginning-of-comment
            context. If True it starts from a randomly chosen context; the
            two words of that context are not part of the returned text.
        rng: Optional object with a numpy-style `choice(a, p=None)` method
            (e.g. `np.random.default_rng(seed)`) for reproducible sampling.
            Defaults to the global `np.random` state.

        Returns the generated comment as a formatted string.
        Raises ValueError if the training data contained no tokens, because
        no non-empty comment could ever be produced.
        '''
        if rng is None:
            rng = np.random

        start_context = (self._START2, self._START1)

        # Every real token is also a first-level key. If only the start
        # sentinels (or nothing) are present, the retry loop below could
        # never terminate, so fail loudly instead.
        if not any(word not in start_context for word in self._trigrams):
            raise ValueError('cannot generate a comment: the model was trained on no tokens')

        comment = []
        while not comment:
            if not random_start:
                a, b = start_context
            else:
                a, b = self._random_context(rng)

            c = self._next_word(a, b, rng)

            while c != self._END1:
                comment.append(c)
                a, b = b, c
                c = self._next_word(a, b, rng)

        return self._format_comment(' '.join(comment))

In [73]:
# Fit the trigram model with the constructor defaults: reads './toxic/train.csv'
# and keeps comments where 'toxic' is 1 and 'identity_hate' is 0.
tcg = TrigramTCG()

In [75]:
# Sample one comment, starting the chain from a randomly chosen context
# rather than from the beginning-of-comment markers.
tcg.generate_comment(random_start=True)

"would vote for the death is long and this site to take this mp shit and die you pathetic fool! 3rr please refrain from posting a legitimate topic... create a user-id for wikipedia. rip wikipedia died 2009 cause of death- malfeasance and arrogance is what you type'oh that's"