In [6]:
import ast
import re
import string

import emoji
import numpy as np
import pandas as pd
from spacy.lang.en import English


In [7]:
def pre_process_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    The input is lowercased and every run of characters other than ``a``-``z``
    (digits, punctuation, symbols, whitespace, non-ASCII characters) is
    replaced by exactly one space. The result is not stripped, so it may begin
    and/or end with a single space.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string; ``""`` for empty input.

    Notes:
        This function used to contain additional steps that were meant to
        substitute placeholder tokens for @mentions, URLs, emoji, a list of
        ASCII emoticons, numbers and ``#``. Those steps ran *after* a filter
        that had already replaced every character they match on (``@``, ``:``,
        ``/``, digits, ``#``, uppercase letters, ...) with a space, so they
        could never alter the text. Their only observable effect was the
        ``emoji.UNICODE_EMOJI`` lookup, which raises ``AttributeError`` on
        emoji>=2.0 for any non-empty input. They were removed; the returned
        string is unchanged for every input.

        NOTE(review): the output format is deliberately kept as-is because the
        ids looked up in ``vocab2index`` presumably come from a vocabulary
        built with this same normalization -- confirm before re-introducing
        placeholder tokens.
    """
    # Lowercase first so that characters whose lowercase form is an ASCII
    # letter survive the filter, exactly as in the previous implementation.
    return re.sub(r"[^a-z]+", " ", text.lower())

In [8]:
from spacy.lang.en import English

# Blank English pipeline; only its rule-based tokenizer is used below.
tok = English()


def tokenize(text):
    """Normalize ``text`` with ``pre_process_text`` and split it into tokens.

    Args:
        text: Raw input string.

    Returns:
        A list with the ``.text`` of every token the spaCy English tokenizer
        produces for the normalized string.
    """
    normalized = pre_process_text(text)
    doc = tok.tokenizer(normalized)
    return [piece.text for piece in doc]

In [9]:
# Load the vocabulary used by encode_sentence. The file is parsed as a Python
# literal; it is looked up with ``.get(token, vocab2index["UNK"])`` later, so
# it is expected to be a dict that contains an "UNK" key.
# The context manager closes the handle as soon as the text has been read
# (the previous bare ``open`` left it open for the life of the kernel).
with open("../input/vocab2index.txt", "r") as file:
    contents = file.read()
# literal_eval only accepts literals, so the file content cannot execute code.
vocab2index = ast.literal_eval(contents)

In [10]:
def encode_sentence(text, vocab2index, N=70):
    """Encode ``text`` as a fixed-length, space-separated string of token ids.

    Args:
        text: Raw input string; it is tokenized with ``tokenize``.
        vocab2index: Mapping from token to id. Tokens that are missing from it
            fall back to ``vocab2index["UNK"]``.
        N: Number of ids in the output.

    Returns:
        A string of exactly ``N`` integers joined by single spaces. Sequences
        longer than ``N`` are truncated; shorter ones are right-padded with 0.
    """
    token_ids = np.array(
        [vocab2index.get(piece, vocab2index["UNK"]) for piece in tokenize(text)]
    )
    n_used = min(N, len(token_ids))
    # Zero-initialised buffer: positions past n_used stay 0 (padding).
    padded = np.zeros(N, dtype=int)
    padded[:n_used] = token_ids[:n_used]
    return " ".join(str(value) for value in padded)

In [11]:
# Load the validation pairs; the cells below read the "more_toxic" and
# "less_toxic" columns of this frame.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Two independent (deep, the default) copies so each side gets its own
# "encoded" column without touching `df`.
df1 = df.copy()
df2 = df.copy()

In [13]:
# Encode every "more_toxic" comment into its fixed-length id string.
df1["encoded"] = df1["more_toxic"].apply(encode_sentence, args=(vocab2index,))


In [14]:
# Encode every "less_toxic" comment into its fixed-length id string.
df2["encoded"] = df2["less_toxic"].apply(encode_sentence, args=(vocab2index,))

In [15]:
# Keep only the "encoded" column, as a one-column DataFrame, in each frame.
df1 = df1.loc[:, ["encoded"]]
df2 = df2.loc[:, ["encoded"]]

In [16]:
# Persist both encoded frames as CSV (the DataFrame index is written as the
# first column, which is the to_csv default).
# NOTE(review): the targets live under ../input; on hosted platforms where
# that directory is read-only these writes will fail -- confirm the intended
# output location.
df1.to_csv(
    path_or_buf='../input/jigsaw-toxic-severity-rating/validation_data_more_toxic.csv'
)
df2.to_csv(
    path_or_buf='../input/jigsaw-toxic-severity-rating/validation_data_less_toxic.csv'
)