In [1]:
import os
import gc
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

!pip install langdetect
from langdetect import detect

import markdown
import json
import requests
import warnings
import time

!pip install colorama
from colorama import Fore, Back, Style, init

  import pandas.util.testing as tm


Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 2.7MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.8-cp36-none-any.whl size=993193 sha256=ec0670d05fa225eaef2d3739725743d4192a0b1ea7110c0661aee90cddc392fa
  Stored in directory: /root/.cache/pip/wheels/8d/b3/aa/6d99de9f3841d7d3d40a60ea06e6d669e8e5012e6c8b947a57
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.8
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Installing collected packages: colorama
Successfully installed colorama-0.4.3


In [0]:
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

# ISO 639-1 two-letter language codes.
# Source: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# Built once at definition time (not on every call) and stored as a frozenset
# so membership tests are O(1).
_ISO_639_1_CODES = frozenset([
    "ab", "aa", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay",
    "az", "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs",
    "ca", "ce", "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de",
    "dv", "dz", "ee", "el", "en", "eo", "es", "et", "eu", "fa", "ff",
    "fi", "fj", "fo", "fr", "fy", "ga", "gd", "gl", "gn", "gu", "gv",
    "ha", "he", "hi", "ho", "hr", "ht", "hu", "hy", "hz", "ia", "id",
    "ie", "ig", "ii", "ik", "io", "is", "it", "iu", "ja", "jv", "ka",
    "kg", "ki", "kj", "kk", "kl", "km", "kn", "ko", "kr", "ks", "ku",
    "kv", "kw", "ky", "la", "lb", "lg", "li", "ln", "lo", "lt", "lu",
    "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my",
    "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv", "ny",
    "oc", "oj", "om", "or", "os", "pa", "pi",
    "pl",  # Polish: was missing from the original table
    "ps", "pt", "qu", "rm",
    "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
    "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw",
    "ta", "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts",
    "tt", "tw", "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa",
    "wo", "xh", "yi", "yo", "za", "zh", "zu",
])


def validate_language(language):
    """Return True when ``language`` is an ISO 639-1 two-letter code.

    The comparison is case-insensitive. Anything that is not a string
    (for example ``None``) is reported as invalid instead of raising.

    Args:
        language: candidate language code, e.g. ``"en"`` or ``"PL"``.

    Returns:
        bool: True if the lower-cased value is a known ISO 639-1 code.
    """
    if not isinstance(language, str):
        return False
    return language.lower() in _ISO_639_1_CODES


In [0]:
def remove_html(text, md=False):
    """Strip markup from ``text`` and return only its textual content.

    Args:
        text: HTML source, or Markdown source when ``md`` is true.
        md: when true, ``text`` is first rendered to HTML with
            ``markdown.markdown`` and the resulting HTML is then stripped.

    Returns:
        str: the concatenation of every text node, with character
        references such as ``&amp;`` decoded.
    """
    if md:
        text = markdown.markdown(text)

    # credit: stackoverflow
    class MLStripper(HTMLParser):
        """HTMLParser that keeps text nodes and discards every tag."""

        def __init__(self):
            # HTMLParser.__init__ already calls reset(); convert_charrefs=True
            # makes the parser decode entities before calling handle_data().
            super().__init__(convert_charrefs=True)
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    s = MLStripper()
    s.feed(text)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains a bare '&' (e.g. "AT&T") in case more input completes a
    # character reference. close() tells it the input is finished so that
    # buffered text is delivered instead of being silently dropped.
    s.close()
    return s.get_data()

In [0]:
# Attribute names known to this notebook. Perspective.score() emits a warning
# for any requested attribute that is not listed here, and Comment.__getitem__
# uses the list when validating lookup keys.
# Reference: Perspective API "Attributes & Languages" documentation.
allowed = ["TOXICITY",
           "SEVERE_TOXICITY",
           "TOXICITY_FAST",
           "ATTACK_ON_AUTHOR",
           "ATTACK_ON_COMMENTER",
           "INCOHERENT",
           "INFLAMMATORY",
           "OBSCENE",
           "OFF_TOPIC",
           "UNSUBSTANTIAL",
           "LIKELY_TO_REJECT",
           # Documented attributes that were missing from the original list.
           "IDENTITY_ATTACK",
           "INSULT",
           "PROFANITY",
           "THREAT",
           "SEXUALLY_EXPLICIT",
           "FLIRTATION",
           "SPAM"]


In [0]:
class Perspective(object):
    """Small client for the Perspective Comment Analyzer ``comments:analyze`` call.

    Attributes:
        base_url: root URL of the API version being used.
        request_timeout: seconds to wait for the HTTP response.
        max_text_length: longest text sent to the API; longer input is cut.
        key: API key sent as the ``key`` query parameter.
    """

    base_url = "https://commentanalyzer.googleapis.com/v1alpha1"

    # Without a timeout requests.post() can block forever on a stalled
    # connection; override on the class or instance if needed.
    request_timeout = 30

    # The API will only grade text up to this many characters.
    max_text_length = 3000

    # Per-attribute options accepted in "requestedAttributes".
    _attribute_options = ("scoreType", "scoreThreshold")
    # Earlier versions of this class only accepted this misspelling; it is
    # still accepted and translated to the name the API actually uses.
    _legacy_option_names = {"scoreThreshhold": "scoreThreshold"}

    def __init__(self, key):
        self.key = key

    @classmethod
    def _normalize_options(cls, test, options):
        """Validate one attribute's options and return a copy with API spelling."""
        normalized = {}
        for key, value in options.items():
            key = cls._legacy_option_names.get(key, key)
            if key not in cls._attribute_options:
                raise ValueError("{0} is not a valid sub-property for {1}".format(key, test))
            normalized[key] = value
        return normalized

    # NOTE: the list default for ``tests`` is never mutated below, so sharing
    # it between calls is safe; it is kept to leave the signature unchanged.
    def score(self, text, tests=["TOXICITY"], context=None, languages=None, do_not_store=False, token=None, text_type=None):
        """Score ``text`` against one or more Perspective attributes.

        Args:
            text: the comment to analyse. Text longer than
                ``max_text_length`` characters is truncated (with a warning).
            tests: attribute name, list of attribute names, or dict mapping
                attribute name to an options dict (``scoreType`` /
                ``scoreThreshold``).
            context: accepted but currently not sent to the API.
            languages: ISO 639-1 code or iterable of codes; invalid codes are
                skipped with a warning. When none remain the ``languages``
                field is omitted from the request.
            do_not_store: when truthy, sets ``doNotStore`` in the request.
            token: opaque value stored on the returned Comment; not sent.
            text_type: ``"html"`` or ``"md"`` to strip markup before scoring.

        Returns:
            Comment: the analysed text with one Attribute per attribute
            present in the response.

        Raises:
            ValueError: for an invalid ``tests`` value, an unknown option
                name, or an unknown ``text_type``.
            PerspectiveAPIException: when the response carries an ``error``.
        """
        # data validation
        # make sure it's a valid test
        # TODO: see if an endpoint that has valid types exists
        if isinstance(tests, str):
            tests = [tests]
        if not isinstance(tests, (list, dict)):
            raise ValueError("Invalid list/dictionary provided for tests")
        if isinstance(tests, list):
            tests = {test: {} for test in tests}

        if text_type:
            kind = text_type.lower()
            if kind == "html":
                text = remove_html(text)
            elif kind == "md":
                text = remove_html(text, md=True)
            else:
                raise ValueError("{0} is not a valid text_type. Valid options are 'html' or 'md'".format(str(text_type)))

        # Build a fresh dict so the caller's option dicts are never modified.
        requested = {}
        for test, options in tests.items():
            if test not in allowed:
                warnings.warn("{0} might not be accepted as a valid test.".format(str(test)))
            requested[test] = self._normalize_options(test, options)

        if len(text) > self.max_text_length:
            # TODO: allow disassembly/reassembly of >3000char comments
            warnings.warn("Perspective only allows {0} character strings. Only the first {0} characters will be sent for processing".format(self.max_text_length))
            text = text[:self.max_text_length]

        # A bare string would otherwise be iterated character by character.
        if isinstance(languages, str):
            languages = [languages]
        new_langs = []
        for language in languages or []:
            language = language.lower()
            if validate_language(language):
                new_langs.append(language)
            else:
                warnings.warn("{0} is not a valid ISO 639-1 language code and will be ignored.".format(language))

        # packaging data
        url = self.base_url + "/comments:analyze"
        payload_data = {"comment": {"text": text}, "requestedAttributes": requested}
        if new_langs:
            payload_data["languages"] = new_langs
        if do_not_store:
            payload_data["doNotStore"] = True
        response = requests.post(url,
                                 data=json.dumps(payload_data),
                                 headers={'content-type': "application/json"},
                                 params={"key": self.key},
                                 timeout=self.request_timeout)
        data = response.json()
        if "error" in data.keys():
            raise PerspectiveAPIException(data["error"]["message"])

        c = Comment(text, [], token)
        # The API documents that attributes scoring below a requested
        # scoreThreshold are left out of the response, so a requested
        # attribute (or the whole "attributeScores" map) may be absent.
        base = data.get("attributeScores", {})
        for test in requested:
            if test not in base:
                continue
            summary = base[test]["summaryScore"]
            a = Attribute(test, [], summary["value"], summary["type"])
            # "spanScores" is optional in the response.
            for span in base[test].get("spanScores", []):
                s = Span(span["begin"], span["end"],
                         span["score"]["value"], span["score"]["type"], c)
                a.spans.append(s)
            c.attributes.append(a)
        return c

class Comment(object):
    """A piece of text together with the attributes it was scored on.

    Attributes:
        text: the text this comment wraps.
        attributes: list of attribute objects; each is read through its
            ``name`` and ``score``.
        token: opaque caller-supplied value carried alongside the text.
    """

    def __init__(self, text, attributes, token):
        self.text = text
        self.attributes = attributes
        self.token = token

    def __getitem__(self, key):
        """Return the attribute named ``key`` (case-insensitive).

        The stored attributes are searched first, so an attribute that is
        present is always returned even when its name is not in the
        module-level ``allowed`` list.

        Raises:
            ValueError: when no stored attribute has that name.
        """
        wanted = key.lower()
        for attr in self.attributes:
            if attr.name.lower() == wanted:
                return attr
        if key.upper() not in allowed:
            raise ValueError("value {0} does not exist".format(key))
        raise ValueError("value {0} not found".format(key))

    def __str__(self):
        return self.text

    def __repr__(self):
        """Show the mean attribute score followed by the text."""
        if not self.attributes:
            # No scores to average; avoid dividing by zero.
            return "<(n/a) {0}>".format(self.text)
        total = sum(attr.score for attr in self.attributes)
        return "<({0}) {1}>".format(str(total / len(self.attributes)), self.text)

    def __iter__(self):
        return iter(self.attributes)

    def __len__(self):
        # Length of the text, not the number of attributes.
        return len(self.text)

class Attribute(object):
    """Named score plus the list of spans it was computed over.

    Indexing and iteration are delegated to ``spans``.
    """

    def __init__(self, name, spans, score, score_type):
        self.name, self.spans = name, spans
        self.score, self.score_type = score, score_type

    def __getitem__(self, index):
        """Return ``spans[index]`` (integer or slice)."""
        spans = self.spans
        return spans[index]

    def __iter__(self):
        """Iterate over the spans in order."""
        spans = self.spans
        return iter(spans)

class Span(object):
    """Score attached to the slice ``comment.text[begin:end]``."""

    def __init__(self, begin, end, score, score_type, comment):
        self.begin, self.end = begin, end
        self.score, self.score_type = score, score_type
        self.comment = comment

    def __str__(self):
        """Return the part of the comment text this span covers."""
        full_text = self.comment.text
        return full_text[self.begin:self.end]

    def __repr__(self):
        """Return the score followed by the covered text."""
        covered = self.comment.text[self.begin:self.end]
        return "<({0}) {1}>".format(self.score, covered)

class PerspectiveAPIException(Exception):
    """Raised when a Perspective API response contains an ``error`` entry."""

In [6]:
# Mount Google Drive inside the Colab VM at /content/gdrive so the notebook
# can read the files stored there. The call prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [7]:
%cd './gdrive/My Drive/Colab Notebooks/CS263/'     
!pip install cache-magic
import cache_magic
!mkdir .cache
!ln -s './gdrive/My Drive/Colab Notebooks/CS263/.cache' /content/.cache

/content/gdrive/My Drive/Colab Notebooks/CS263
Collecting cache-magic
  Downloading https://files.pythonhosted.org/packages/03/94/4cbb25895b80704027453fca297825c0b5924b4ba7533329e0b32a4905a2/cache-magic-1.0.4.tar.gz
Building wheels for collected packages: cache-magic
  Building wheel for cache-magic (setup.py) ... [?25l[?25hdone
  Created wheel for cache-magic: filename=cache_magic-1.0.4-cp36-none-any.whl size=6698 sha256=b9a57977b4e4380cba88376aea0c659f3eac9b90b599e16e79fb528b652803ec
  Stored in directory: /root/.cache/pip/wheels/84/3a/44/00b6aea43fe9fcd0c86bbcf33b7e45d167a6b6a1803983325e
Successfully built cache-magic
Installing collected packages: cache-magic
Successfully installed cache-magic-1.0.4
mkdir: cannot create directory ‘.cache’: File exists


In [8]:
# Only four columns are used below, so ask pandas to parse just those.
# NOTE(review): presumably the CSV carries more columns than these; with
# ~1.8M rows, skipping them saves parse time and peak memory. Confirm.
train_df = pd.read_csv('./toxic_dataset/train1.csv',
                       usecols=['comment_text', 'target', 'severe_toxicity', 'obscene'])
comments = train_df['comment_text']
targets = train_df['target']
severe_toxicities = train_df['severe_toxicity']
obscenities = train_df['obscene']
# Drop the frame itself; the four Series extracted above are kept.
del train_df
gc.collect()

32

In [17]:
# Display the 'obscene' label column loaded from the training CSV.
obscenities

0          0.000000
1          0.000000
2          0.000000
3          0.000000
4          0.000000
             ...   
1804869    0.000000
1804870    0.000000
1804871    0.000000
1804872    0.030303
1804873    0.000000
Name: obscene, Length: 1804874, dtype: float64

In [0]:
# Never store credentials in the notebook: anyone with access to the file (or
# its history) can use them. The key that used to be hard-coded here should be
# revoked. Read the key from the environment, or prompt for it without echo.
from getpass import getpass

google_api_key = os.environ.get("PERSPECTIVE_API_KEY") or getpass("Perspective API key: ")
client = Perspective(google_api_key)

In [112]:
# One accumulator per attribute for the scores returned by the API.
toxicity_scores = []
severe_toxicity_scores = []
obscenity_scores = []


start = time.time()
for _banner_line in ("                         EXAMPLE WORKING OF PERSPECTIVE API                          ",
                     "                         ----------------------------------                          ",
                     ""):
    print(_banner_line)
comment = 'Quiet! you i.diot' 
toxicity = client.score(comment, tests=["TOXICITY", "SEVERE_TOXICITY", "OBSCENE"])

# Record each attribute's summary score in its accumulator.
for _bucket, _attribute in ((toxicity_scores, "TOXICITY"),
                            (severe_toxicity_scores, "SEVERE_TOXICITY"),
                            (obscenity_scores, "OBSCENE")):
    _bucket.append(toxicity[_attribute].score)

print("COMMENT :\n" + comment)
print("")
# The green "CORRECT" marker is printed unconditionally; it is not derived
# from the score.
for _label, _attribute in (("TOXICITY SCORE : ", "TOXICITY"),
                           ("SEVERE TOXICITY SCORE : ", "SEVERE_TOXICITY"),
                           ("OBSCENITY SCORE : ", "OBSCENE")):
    print(_label + str(toxicity[_attribute].score) +
          f' {Fore.GREEN}CORRECT \u2714{Style.RESET_ALL}')
print(("*********************************************************************"+\
        "***********************").replace('*', '-'))
print("")

                         EXAMPLE WORKING OF PERSPECTIVE API                          
                         ----------------------------------                          

COMMENT :
Quiet! you i.diot

TOXICITY SCORE : 0.4323854 [32mCORRECT ✔[0m
SEVERE TOXICITY SCORE : 0.28388166 [32mCORRECT ✔[0m
OBSCENITY SCORE : 0.17134532 [32mCORRECT ✔[0m
--------------------------------------------------------------------------------------------

