In [0]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report, accuracy_score
from sklearn import svm

In [0]:
from google.colab import drive

# Mount Google Drive so the CSV files under base_path can be read.
drive.mount('/gdrive')

# Folder holding the dataset; adjust to wherever the files live in your own Drive.
base_path = "/gdrive/My Drive/data/"
training_file = "train.csv"
testing_file = "test.csv"

# Read both files, replacing every missing value with a single space.
trainDf = pd.read_csv(f"{base_path}{training_file}").fillna(' ')
testDf = pd.read_csv(f"{base_path}{testing_file}").fillna(' ')

In [0]:
def isToxic(row):
    """Return 1 if the row carries at least one toxicity label, otherwise 0.

    A comment counts as inappropriate when any of ``toxic``, ``severe_toxic``,
    ``obscene``, ``threat``, ``insult`` or ``identity_hate`` equals 1.
    """
    label_columns = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    # Columns are checked in this order and evaluation stops at the first hit.
    flagged = any(row[name] == 1 for name in label_columns)
    return 1 if flagged else 0
# Derive the binary target: axis=1 hands each row to isToxic in turn.
trainDf["inappropriate"] = trainDf.apply(isToxic, axis=1)

In [0]:
# Keep only the identifier, the raw text and the derived binary label.
columns = ["id", "comment_text", "inappropriate"]
trainDf = trainDf[columns]

# Word-level TF-IDF: unigrams only, English stop words removed, accents folded,
# sublinear term-frequency scaling, vocabulary capped at 10,000 terms.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_features=10000,
    strip_accents='unicode',
    analyzer='word',
    stop_words='english',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1)
    )

# Choose the hold-out rows BEFORE fitting the vectorizer. Fitting on the full
# corpus (hold-out rows plus the unlabeled test file) lets evaluation text shape
# the vocabulary and IDF weights, which makes the reported scores optimistic.
# Splitting row positions uses the same test_size and random_state as before,
# so the partition chosen by the seed is unchanged.
train_idx, test_idx = train_test_split(
    np.arange(len(trainDf)), test_size=0.2, random_state=42
)

# Learn vocabulary and IDF from the training rows only, then encode every row.
vectorizer.fit(trainDf['comment_text'].iloc[train_idx])
trainFeatures = vectorizer.transform(trainDf['comment_text'])

X_train, X_test = trainFeatures[train_idx], trainFeatures[test_idx]
Y_train = trainDf['inappropriate'].iloc[train_idx]
Y_test = trainDf['inappropriate'].iloc[test_idx]

In [0]:
# Fit a multinomial Naive Bayes model on the training split (fit returns the estimator itself).
naiveBayesClassifier = MultinomialNB().fit(X_train, Y_train)

# Predicted labels for the held-out rows; despite the name, these are predicted y values.
X_predicted = naiveBayesClassifier.predict(X_test)

In [0]:
# Confusion matrix for the held-out split as a labelled table:
# rows are the actual classes, columns are the predicted classes.
# labels=[0, 1] pins the matrix to 2x2 (0 = appropriate, 1 = inappropriate) so the
# two-level headers always fit, even if one class is absent from the data.
pd.DataFrame(confusion_matrix(Y_test, X_predicted, labels=[0, 1]),
   index = [['Actual', 'Actual'], ['Appropriate', 'Inappropriate']],
   columns = [['Predicted', 'Predicted'], ['Appropriate', 'Inappropriate']])



In [0]:
# Text report of per-class metrics for the current predictions on the held-out split.
print(classification_report(y_true=Y_test, y_pred=X_predicted))

In [0]:
# Fraction of held-out comments whose predicted label matches the true label.
print(f"Naive Bayes Accuracy = {accuracy_score(Y_test, X_predicted)!s}")

In [0]:
# Linear-kernel support vector classifier with C=0.1 and verbose output enabled.
# NOTE(review): SVC fit time is documented to scale at least quadratically with the
# number of samples; expect this cell to be slow on a large training split.
svmClassifier = svm.SVC(
    kernel='linear',
    C=0.1,
    verbose=True,
)
svmClassifier.fit(X_train, Y_train)

In [0]:
# Rebinds X_predicted: from here on it holds the SVM's predicted labels for X_test.
X_predicted = svmClassifier.predict(X=X_test)

In [0]:
# Confusion matrix for the held-out split as a labelled table:
# rows are the actual classes, columns are the predicted classes.
# labels=[0, 1] pins the matrix to 2x2 (0 = appropriate, 1 = inappropriate) so the
# two-level headers always fit, even if one class is absent from the data.
pd.DataFrame(confusion_matrix(Y_test, X_predicted, labels=[0, 1]),
   index = [['Actual', 'Actual'], ['Appropriate', 'Inappropriate']],
   columns = [['Predicted', 'Predicted'], ['Appropriate', 'Inappropriate']])

In [0]:
# Text report of per-class metrics for the current predictions on the held-out split.
print(classification_report(y_true=Y_test, y_pred=X_predicted))

In [0]:
# Fraction of held-out comments whose predicted label matches the true label.
print(f"SVM Accuracy = {accuracy_score(Y_test, X_predicted)!s}")