In [1]:
# import os
# print(os.listdir("../input/tinyversions"))

import numpy as np 
import pandas as pd 
import string

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer



In [9]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('data/tiny_train.csv', encoding="utf-8").fillna(' ')
test = pd.read_csv('data/test.csv', encoding="utf-8").fillna(' ')
badwords = pd.read_csv('data/bad-words.csv', header=None).iloc[:,0].tolist()
eng_stopwords = set(stopwords.words("english"))

# train = pd.read_csv('../input/tinyversions/tiny_train.csv', encoding="utf-8").fillna(' ')
# train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv', encoding="utf-8").fillna(' ')
# test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv', encoding="utf-8").fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [10]:
# Word Vectorizer
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)

# N-gram Character Vectorizer
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=30000)


In [11]:
# Parts of Speech Tag Count 
class PoS_TagFeatures(TransformerMixin):
    
    def tag_PoS(self, text):
        text_splited = text.split(' ')
        text_splited = [''.join(c for c in s if c not in string.punctuation) for s in text_splited]
        text_splited = [s for s in text_splited if s]
        pos_list = pos_tag(text_splited)
        noun_count = len([w for w in pos_list if w[1] in ('NN','NNP','NNPS','NNS')])
        adjective_count = len([w for w in pos_list if w[1] in ('JJ','JJR','JJS')])
        verb_count = len([w for w in pos_list if w[1] in ('VB','VBD','VBG','VBN','VBP','VBZ')])
        
        words = len(text.split())
        length = len(text)
        return [noun_count, noun_count/words, noun_count/length, 
                adjective_count, adjective_count/words, adjective_count/length, 
                verb_count, verb_count/words, verb_count/length]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, posts):
        return [{'nouns': counts[0],
                 'nounNormByWords': counts[1],
                 'nounNormByLength': counts[2],
                 'adjectives': counts[3],
                 'adjectiveNormByWords': counts[4],
                 'adjectiveNormByLength': counts[5],
                 'verbs': counts[6],
                 'verbNormByWords': counts[7],
                 'verbNormByLength': counts[8]}
                for counts in map(self.tag_PoS, posts)]
    
# Pipelining Parts of Speech Tag Features with DictVectorizer for processing
posTag_vectorizer = Pipeline([
    ('parts_of_speech', PoS_TagFeatures()),
    ('dictVect', DictVectorizer(sparse = False))
])

In [12]:
# Bad Words Occurance Count
class BadWords_Features(TransformerMixin):
    
    def badWordCount(self, text):
        badCount = sum(text.count(w) for w in badwords)
        return [badCount, badCount/len(text.split()), badCount/len(text)]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, posts):
        return [{'badwordcount': badCounts[0],
                 'badNormByWords': badCounts[1],
                 'badNormByLength': badCounts[2]}
                for badCounts in map(self.badWordCount, posts)]
    
# Pipelining Bad Word Features with DictVectorizer for processing
badWord_vectorizer = Pipeline([
    ('bad_words', BadWords_Features()),
    ('dictVect', DictVectorizer(sparse = False))
])


In [13]:
# Symbol Occurance Count
class Symbol_Features(TransformerMixin):
    
    def symbolCount(self, text):
        foul_filler = sum(text.count(w) for w in '*&$%@#!')
        userMentions = text.count("User:")
        smileys = sum(text.count(w) for w in (':-)', ':)', ';-)', ';)'))
        exclamation = text.count("!")
        question = text.count("User:")
        punctuation = sum(text.count(w) for w in '.,;:')
        all_symbol = 1 #sum(text.count(w) for w in '*&#$%“”¨«»®´·º½¾¿¡§£₤‘’'.decode('utf-8'))
        
        words = len(text.split())
        length = len(text)
        return [foul_filler, foul_filler/words, foul_filler/length, 
                userMentions, userMentions/words, userMentions/length, 
                smileys, smileys/words, smileys/length, 
                exclamation, exclamation/words, exclamation/length, 
                question, question/words, question/length, 
                punctuation, punctuation/words, punctuation/length, 
                all_symbol, all_symbol/words, all_symbol/length]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, posts):
        return [{'foul_fillerCount': symCounts[0],
                 'foul_fillerNormByWords': symCounts[1],
                 'foul_fillerNormByLength': symCounts[2],
                 'userMentionsCount': symCounts[3],
                 'userMentionsNormByWords': symCounts[4],
                 'userMentionsNormByLength': symCounts[5],
                 'smileysCount': symCounts[6],
                 'smileysNormByWords': symCounts[7],
                 'smileysNormByLength': symCounts[8],
                 'exclamationCount': symCounts[9],
                 'exclamationNormByWords': symCounts[10],
                 'exclamationNormByLength': symCounts[11],
                 'questionCount': symCounts[12],
                 'questionNormByWords': symCounts[13],
                 'questionNormByLength': symCounts[14],
                 'punctuationCount': symCounts[15],
                 'punctuationNormByWords': symCounts[16],
                 'punctuationNormByLength': symCounts[17],
                 'all_symbolCount': symCounts[18],
                 'all_symbolNormByWords': symCounts[19],
                 'all_symbolNormByLength': symCounts[20]}
                for symCounts in map(self.symbolCount, posts)]
    
# Pipelining Symbol based Features with DictVectorizer for processing
symbol_vectorizer = Pipeline([
    ('symbols', Symbol_Features()),
    ('dictVect', DictVectorizer(sparse = False))
])


In [14]:
# General Text Based Features
class TextFeatures(TransformerMixin):
    
    def featureCount(self, text):        
        words = len(text.split())
        length = len(text)
        capitals = sum(1 for c in text if c.isupper())
        paragraphs = text.count('\n')
        stopwords = sum(text.count(w) for w in eng_stopwords)
        unique = len(set(w for w in text.split()))
        word_counts = Counter(text.split())
        repeat = sum(count for word, count in sorted(word_counts.items()) if count > 10)
        
        return [words, length, words/length, 
                capitals, capitals/words, capitals/length, 
                paragraphs, paragraphs/words, paragraphs/length, 
                stopwords, stopwords/words, stopwords/length, 
                unique, unique/words, unique/length,  
                repeat, repeat/words, repeat/length]
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, posts):
        return [{'words': counts[0],
                 'length': counts[1],
                 'wordNormByLength': counts[2],
                 'capitalsCount': counts[3],
                 'capitalsNormByWords': counts[4],
                 'capitalsNormByLength': counts[5],
                 'paragraphsCount': counts[6],
                 'paragraphsNormByWords': counts[7],
                 'paragraphsNormByLength': counts[8],
                 'stopwordsCount': counts[9],
                 'stopwordsNormByWords': counts[10],
                 'stopwordsNormByLength': counts[11],
                 'uniqueCount': counts[12],
                 'uniqueNormByWords': counts[13],
                 'uniqueNormByLength': counts[14],
                 'repeatCount': counts[15],
                 'repeatNormByWords': counts[16],
                 'repeatNormByLength': counts[17]}
                for counts in map(self.featureCount, posts)]
    
# Pipelining Text Features with DictVectorizer for processing
text_vectorizer = Pipeline([
    ('texts', TextFeatures()),
    ('dictVect', DictVectorizer(sparse = False))
])

In [17]:
combined_features = FeatureUnion([("word", word_vectorizer), ("char", char_vectorizer), ("pos_tags", posTag_vectorizer), 
                                  ("bad_word", badWord_vectorizer), ("symbol", symbol_vectorizer), ("text", text_vectorizer)])
train_features = combined_features.fit(train_text, train["toxic"]).transform(train_text)

print(train_features.shape)

(20, 6851)


In [18]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for clas in classes:
    train_target = train[clas]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(clas, cv_score))

    classifier.fit(train_features, train_target)
    submission[clas] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))
submission.head()

CV score for class toxic is 0.766666666667




NameError: name 'test_features' is not defined

In [None]:
submission.to_csv('submission.csv', index=False)