## Comment Toxicity Classification

+ This folder contains 3 files:
    - train.csv
    - test.csv
    - sample_submission.csv

In [None]:
%ls -l

## Import required packages

+ Basics
+ Visualization
+ Natural language Processing tool
+ Feature Engineering
+ Setting

In [None]:
import pandas as pd 
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image

### About NLP Libraries

+ Spacy
    - [Spacy Tutorial by Analytics Vidhya](https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/)
+ NLTK 
    - [NLTK book](http://www.nltk.org/book/)
+ RE (Regular Expression libraries)
    - [RE tutorial](https://docs.python.org/2/howto/regex.html)

In [None]:
import string
import re    
import nltk
from nltk.corpus import stopwords

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD

# English stop words from the NLTK stopwords corpus, held as a set.
stoplist = set(stopwords.words("english"))
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

### Starting Feature Engineering

In [None]:
# Load the train and test CSV files from the ../input directory.
train = pd.read_csv('../input/WK7525train.csv')
test = pd.read_csv('../input/WK7525test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the 'X_input' value at index label 1 (presumably one raw comment — verify).
train['X_input'][1]

## Replacement Algorithm or Modules

In [None]:
# Contraction-expansion rules as (regex, replacement template) pairs.
# They are applied in list order, so the irregular forms (won't, can't, i'm,
# ain't) must stay ahead of the generic "<word>n't" / "<word>'ll" rules.
# Every default pattern carries the inline (?i) flag because the replacer is
# run on the raw text, before it is lowercased; without it "I'm" or "Won't"
# would be left untouched. The fixed replacements are lowercase.
# Both sides are raw strings so that \b, \w and \g<1> reach the regex engine
# instead of being treated as (invalid) Python string escapes.
replacement_patterns = [
    (r"(?i)\bwon't", 'will not'),
    (r"(?i)\bcan't", 'cannot'),
    (r"(?i)\bi'm", 'i am'),
    (r"(?i)\bain't", 'is not'),
    (r"(?i)(\w+)'ll", r'\g<1> will'),
    (r"(?i)(\w+)n't", r'\g<1> not'),
    (r"(?i)(\w+)'ve", r'\g<1> have'),
    (r"(?i)(\w+)'s", r'\g<1> is'),
    (r"(?i)(\w+)'re", r'\g<1> are'),
    (r"(?i)(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a string.

    Args:
        patterns: iterable of ``(regex, replacement)`` pairs. Each regex is
            compiled once at construction time. Defaults to the
            contraction-expansion rules in ``replacement_patterns``.
            Caller-supplied patterns are compiled exactly as given (no flags
            are added).
    """

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return *text* after applying every substitution in order."""
        s = text
        for (pattern, repl) in self.patterns:
            # Patterns are already compiled; call sub() on them directly.
            s = pattern.sub(repl, s)
        return s


## Replacing negations with antonyms

In [None]:
from nltk.corpus import wordnet

class AntonymReplacer(object):
    """Swap "not <word>" pairs for a single antonym looked up in WordNet."""

    def replace(self, word, pos=None):
        """Return the antonym of *word* if WordNet yields exactly one.

        All antonym names over every lemma of every synset of *word*
        (optionally limited to part of speech *pos*) are gathered into a set.
        The single member is returned when the set has exactly one element;
        otherwise the result is None.
        """
        candidates = {
            antonym.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for antonym in lemma.antonyms()
        }
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, sent):
        """Return a new token list with "not X" collapsed to X's antonym.

        *sent* is a sequence of tokens. Whenever the token 'not' is followed
        by a word for which ``replace`` finds an antonym, both tokens are
        replaced by that antonym; every other token is copied unchanged.
        """
        result = []
        last_index = len(sent) - 1
        skip_next = False
        for idx, token in enumerate(sent):
            if skip_next:
                # This token was consumed together with the preceding 'not'.
                skip_next = False
                continue
            if token == 'not' and idx < last_index:
                opposite = self.replace(sent[idx + 1])
                if opposite:
                    result.append(opposite)
                    skip_next = True
                    continue
            result.append(token)
        return result


# Noise Removal

In [None]:
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Rebuilds the English stop-word set (the same assignment also appears in the
# feature-extraction import cell earlier in the notebook).
stoplist = set(stopwords.words("english"))

class Remove_Noise(object):
    """Clean raw text, then tokenize, filter stop words and lemmatize.

    Args:
        stop_word: collection of tokens to drop in ``tokenize``. When omitted
            (None) the module-level ``stoplist`` is used.
    """

    # Symbols that noise_rm turns into a space. Note that '.', '!', '?', '"',
    # '-' and ';' are not in the class and are therefore left in the text.
    _SYMBOLS = re.compile(r"[#$%^&',:()*+/<=>@\[\]^_`{|}~]")
    _DIGITS = re.compile(r"[0-9]+")
    _SPACES = re.compile(r" +")

    # Created lazily on first use and then reused for every token.
    _lemmatizer = None

    def __init__(self, stop_word=None):
        # Honour the caller's stop words; previously the argument was ignored
        # and the global stoplist was always used.
        self.stop_word = stoplist if stop_word is None else stop_word

    def noise_rm(self, doc):
        """Return *doc* lowercased with symbols, digits and newlines blanked.

        Symbols, digit runs and newlines each become a space, and runs of
        spaces are then collapsed to one. Newlines are replaced by a space
        (not deleted) so that words on adjacent lines are not fused together.
        Leading/trailing spaces are not stripped.
        """
        doc = self._SYMBOLS.sub(' ', doc)
        doc = self._DIGITS.sub(' ', doc)
        doc = doc.replace('\n', ' ')
        doc = self._SPACES.sub(' ', doc)
        return doc.lower()

    def lemmatize(self, token, tag):
        """Return the WordNet lemma of *token* for the given POS *tag*.

        Only the first character of *tag* is inspected: N, V, R and J map to
        the WordNet noun, verb, adverb and adjective constants; anything else
        is treated as a noun.
        """
        wn_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._lemmatizer.lemmatize(token, wn_pos)

    def tokenize(self, document):
        """Return the list of lemmas of the non-stop-word tokens in *document*.

        The document is split into sentences, each sentence is tokenized with
        ``wordpunct_tokenize`` and POS-tagged, tokens found in
        ``self.stop_word`` are skipped, and the rest are lemmatized.

        NOTE(review): NLTK's English stop-word list appears to contain 'not';
        if so, 'not' never survives this step and the later
        AntonymReplacer.replace_negations pass has nothing to act on — confirm
        and, if intended to work, exclude 'not' from the stop words.
        """
        lemmy = []
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if token in self.stop_word:
                    continue
                lemmy.append(self.lemmatize(token, tag))
        return lemmy

In [None]:
def join_tokens(data):
    """Concatenate the strings in *data* into one space-separated string."""
    return ' '.join(data)



In [None]:
replacer = RegexpReplacer()
remover = Remove_Noise()
AntoRep = AntonymReplacer()

In [None]:
train['X_input'].fillna(' ', inplace=True)
test['X_input'].fillna(' ', inplace=True)

In [None]:
train['comment_full'] = train['X_input'].apply(replacer.replace)
test['comment_full'] = test['X_input'].apply(replacer.replace)

In [None]:
train['Remove_noise'] = train['comment_full'].apply(remover.noise_rm)
test['Remove_noise'] = test['comment_full'].apply(remover.noise_rm)

In [None]:
train['TokenandLemma'] = train['Remove_noise'].apply(remover.tokenize)
test['TokenandLemma'] = test['Remove_noise'].apply(remover.tokenize)

In [None]:
train["Processed"] = train['TokenandLemma'].apply(AntoRep.replace_negations)
test["Processed"] = test['TokenandLemma'].apply(AntoRep.replace_negations)

In [None]:
train["Sentence"] = train["Processed"].apply(join_tokens)
test["Sentence"] = test["Processed"].apply(join_tokens)

In [None]:
train.to_pickle('train_processed.pkl')
test.to_pickle('test_processed.pkl')

# Using the processed data to build a vector space model (VSM)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [None]:
# Reload the preprocessed frames written by the pickling cell above.
train = pd.read_pickle('train_processed.pkl')
test = pd.read_pickle('test_processed.pkl')

In [None]:
# Preview the first rows of the reloaded test frame.
test.head()

In [None]:
# The joined, preprocessed strings are the text fed to the vectorizers.
train_text = train['Sentence']
test_text = test['Sentence']

In [None]:
# Shell escape: list the working directory.
!ls -lah

In [None]:
# Word-level TF-IDF over 1- and 2-grams, limited to 20000 features.
word_vec = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='word',ngram_range=(1, 2),max_features=20000)

In [None]:
# Character-level TF-IDF over 1- to 6-grams, limited to 20000 features.
char_vec = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='char',ngram_range=(1, 6),max_features=20000)

In [None]:
# Fit the word vectorizer on the training text only; the test text is just transformed.
train_word_features = word_vec.fit_transform(train_text)
test_word_features = word_vec.transform(test_text)

In [None]:
# Same for the character vectorizer: fit on train, transform test.
train_char_features = char_vec.fit_transform(train_text)
test_char_features = char_vec.transform(test_text)

In [None]:
# Stack the feature matrices column-wise, character features first, then word
# features; train and test use the same order.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# Using Model

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score , precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve,precision_score,recall_score,classification_report

In [None]:
# Two models are trained on the same features: an extra-trees ensemble
# (200 trees, n_jobs=-1) and a logistic regression using the 'sag' solver.
classifierEtree = ExtraTreesClassifier(n_estimators=200,n_jobs=-1)
classifierLR = LogisticRegression(solver='sag')

In [None]:
# Labels are read from column 'Y' of both frames (the test frame is expected to carry labels too).
train_target = train['Y']
test_target = test['Y']

In [None]:
# Fit the extra-trees model on the stacked training features.
classifierEtree.fit(train_features, train_target)

In [None]:
# Fit the logistic regression on the same training features.
classifierLR.fit(train_features, train_target)

In [None]:
# Evaluate the extra-trees model on the test features.
test_predicted = classifierEtree.predict(test_features)
    
accuracy = accuracy_score(test_target,test_predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Classification report for the extra-trees predictions made in the previous cell.
print(classification_report(test_target,test_predicted))

In [None]:
# Evaluate the logistic regression; note this overwrites test_predicted.
test_predicted = classifierLR.predict(test_features)
    
accuracy = accuracy_score(test_target,test_predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Classification report for the logistic-regression predictions made in the previous cell.
print(classification_report(test_target,test_predicted))

# Saving model


In [None]:
from sklearn.externals import joblib

In [None]:
filename = 'final_modelET.pkl'
joblib.dump(classifierEtree,filename)

In [None]:
filename = 'final_modelLR.pkl'
joblib.dump(classifierLR,filename)

In [None]:
filename = 'char_vectorizer.pkl'
joblib.dump(char_vec,filename)

In [None]:
filename = 'word_vectorizer.pkl'
joblib.dump(word_vec,filename)

# Load Model

In [None]:
# Reload the persisted artefacts. Despite the variable names, only classifier1
# (logistic regression) and classifier2 (extra trees) are models; classifier3
# and classifier4 hold the character and word TF-IDF vectorizers respectively.
# joblib.load unpickles the files, so only load files from a trusted source.
classifier1 = joblib.load('final_modelLR.pkl')
classifier2 = joblib.load('final_modelET.pkl')
classifier3 = joblib.load('char_vectorizer.pkl')
classifier4 = joblib.load('word_vectorizer.pkl')