In [None]:
import pandas as pd
import numpy as np
import re, string
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, f1_score


In [None]:
# Load the two CSV files from the relative Data/ directory.
# NOTE(review): `test` is not referenced again in the cells visible here.
train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")
# Display the first rows of the training frame as the cell output.
train.head()


In [None]:
# Collapse the six per-category label columns into a single binary target:
# "toxicity_score" is their row-wise sum, and "harmful" is 1 whenever that
# sum is positive, 0 otherwise.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train["toxicity_score"] = train[label_columns].sum(axis=1)
is_flagged = train["toxicity_score"] > 0
train["harmful"] = np.where(is_flagged, 1, 0)
train.head(3)

In [None]:
# Class balance: print the raw counts per class, then show the share of
# harmful comments as a whole-number percentage.
harmful_counts = train["harmful"].value_counts()
print(harmful_counts)
harmful_rows = train[train["harmful"] == 1]
round(len(harmful_rows) / len(train) * 100)


In [None]:
# Hold out 30% of the labelled comments for evaluation (fixed seed for
# reproducibility). Stratifying on the binary target keeps the proportion of
# harmful comments identical in the train and test partitions, so the
# precision/recall figures are not affected by a skewed random split.
X_train, X_test, y_train, y_test = train_test_split(
    train["comment_text"],
    train["harmful"],
    test_size=0.3,
    random_state=0,
    stratify=train["harmful"],
)


In [None]:
# Peek at the first raw training comments before any cleaning is applied.
X_train.head()


In [None]:
# Clean text: lowercase, keep only letters and apostrophes, and normalise all
# whitespace to single spaces. The patterns are compiled once here instead of
# on every call.
_NON_LETTER_RE = re.compile(r"[^A-Za-z\s']")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Return a normalised, lowercase version of ``text``.

    Every character that is not an ASCII letter, an apostrophe or whitespace
    is replaced by a space. Each run of whitespace (spaces, tabs, newlines,
    form feeds, ...) is then collapsed to one space, and leading and trailing
    whitespace is removed.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Cleaned text containing only lowercase letters, apostrophes and
        single spaces, with no whitespace at either end.
    """
    text = _NON_LETTER_RE.sub(" ", text.lower())
    # `\s+` (rather than `\s{2,}`) also normalises a lone tab/newline/form
    # feed, and strip() trims both ends rather than only the right-hand side.
    return _WHITESPACE_RE.sub(" ", text).strip()

# Apply clean_text to every training comment; X_train is rebound to the
# cleaned Series, so the raw text is no longer reachable under this name.
X_train = X_train.map(clean_text)
X_train.head()


In [None]:
# Drop words shorter than 3 characters, remove stopwords, then lemmatize.
# The stopword collection must be one flat set of strings: wrapping the NLTK
# set inside a list makes `word not in eng_stopwords` compare each word with
# the set object itself, so none of the NLTK stopwords would ever be removed.
eng_stopwords = set(stopwords.words('english')) | {"i'm", "can't", "you"}
# One shared lemmatizer instead of constructing a new instance for every word.
_lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Filter and lemmatize the whitespace-separated words of ``text``.

    Words with fewer than three characters and words found in
    ``eng_stopwords`` are discarded; each remaining word is passed through
    the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Text to process; words are obtained with ``str.split()``.

    Returns
    -------
    str
        The surviving, lemmatized words joined by single spaces (an empty
        string when nothing survives).
    """
    kept = [word for word in text.split()
            if len(word) > 2 and word not in eng_stopwords]
    return " ".join(_lemmatizer.lemmatize(word) for word in kept)
# Apply preprocess_text to every training comment, rebinding X_train again.
# Re-running this cell feeds already-processed text back through the function.
X_train = X_train.map(preprocess_text)
X_train.head(3)


In [None]:
# Learn the TF-IDF vocabulary and build the training matrix in one pass:
# fit_transform gives the same matrix as fit(...) followed by transform(...)
# without tokenizing the whole corpus a second time.
tfidfVectClean = TfidfVectorizer(min_df = 50, strip_accents = "unicode")
X_train_dtm_tfidf = tfidfVectClean.fit_transform(X_train)

# Baseline multinomial naive Bayes classifier on the TF-IDF features.
nbModel = MultinomialNB(alpha = 0.1)
nbModel.fit(X_train_dtm_tfidf, y_train)


In [None]:
# Push the held-out comments through the same two text-preparation steps used
# for the training data, then score the fitted model on them.
X_test = X_test.map(clean_text).map(preprocess_text)
predictions = nbModel.predict(tfidfVectClean.transform(X_test))

# Each metric is rounded to three decimals and printed after its label.
scorers = (
    ("Accuracy score: ", accuracy_score),
    ("Precision score: ", precision_score),
    ("Recall score: ", recall_score),
    ("F1 score: ", f1_score),
)
for label, scorer in scorers:
    print(label, round(scorer(y_test, predictions), 3))


In [None]:
# Tune the TF-IDF + multinomial naive Bayes pipeline, selecting on recall.
pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB())])

# Settings explored regardless of whether IDF weighting is enabled.
shared_grid = {'clf__alpha': [0.1, 0.5, 1],
               'tfidf__min_df': [50, 100, 500, 1000],
               'tfidf__sublinear_tf': [True, False],
               'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}
# smooth_idf only changes how the IDF weights are computed, so varying it
# while use_idf is False would just refit identical models. Searching it only
# when IDF is enabled cuts the candidate count from 288 to 216.
parameters = [
    {**shared_grid, 'tfidf__use_idf': [True], 'tfidf__smooth_idf': [True, False]},
    {**shared_grid, 'tfidf__use_idf': [False]},
]
# n_jobs=-1 runs the cross-validated fits in parallel on all available cores;
# it does not change the scores.
grid_search = GridSearchCV(pipeline, parameters, scoring = "recall", n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated recall.
grid_search.best_params_

In [None]:
# Mean cross-validated recall achieved by the best parameter combination.
grid_search.best_score_

In [None]:
# Refit the vectorizer and classifier with hand-entered tuned settings.
# NOTE(review): these values are hardcoded, presumably copied from
# grid_search.best_params_ - confirm they still match after a re-run.
tfidfVectClean = TfidfVectorizer(strip_accents = "unicode", min_df = 50, ngram_range = (1,3),
                                 smooth_idf = False, sublinear_tf = True, use_idf = True)
# fit_transform learns the vocabulary and builds the matrix in a single pass
# instead of tokenizing the corpus twice with fit(...) + transform(...).
X_train_dtm_tfidf = tfidfVectClean.fit_transform(X_train)
nbModel = MultinomialNB(alpha = 0.1)
nbModel.fit(X_train_dtm_tfidf, y_train)


In [None]:
# Score the refitted model on the held-out comments. X_test is used as-is
# here; it is not cleaned or preprocessed again in this cell.
predictions = nbModel.predict(tfidfVectClean.transform(X_test))

# Each metric is rounded to three decimals and printed after its label.
scorers = (
    ("Accuracy score: ", accuracy_score),
    ("Precision score: ", precision_score),
    ("Recall score: ", recall_score),
    ("F1 score: ", f1_score),
)
for label, scorer in scorers:
    print(label, round(scorer(y_test, predictions), 3))
