In [29]:
import re

import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

In [35]:
# helper functions
def sentence_tokenizer(text):
    """Split a raw comment string into a list of sentences via NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)

def polarity_sentence(sentences):
    """Score every sentence with TextBlob and summarise the scores.

    Parameters
    ----------
    sentences : iterable of str
        Sentences of a single comment.

    Returns
    -------
    tuple
        ``(minimum, maximum, mean, scores)`` where ``scores`` is the list of
        per-sentence polarities in input order. For an empty input the three
        summary values are the neutral score ``0.0`` and ``scores`` is ``[]``.
    """
    scores = [TextBlob(sentence).polarity for sentence in sentences]
    if not scores:
        # np.min / np.max raise ValueError on an empty sequence (and np.mean yields NaN),
        # so a comment without any sentence is reported as neutral instead of crashing.
        return 0.0, 0.0, 0.0, scores
    return np.min(scores), np.max(scores), np.mean(scores), scores

def polarity_comment(text):
    """Return the TextBlob polarity of a whole comment.

    ``text`` is an iterable of strings (the comment's sentences); they are
    joined with single spaces and scored as one document.
    """
    joined = " ".join(text)
    blob = TextBlob(joined)
    return blob.polarity

def token_clean(text):
    """Normalise a comment into a list of lower-case alphanumeric tokens.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Lower-cased tokens containing only ``A-Z``, ``a-z`` and ``0-9`` characters.
    """
    # Collapse every whitespace run (newlines, tabs, carriage returns, ...) to one space
    # first; otherwise the character filter below deletes them and fuses adjacent words.
    text = re.sub(r'\s+', ' ', text)
    # Drop everything that is not a letter, digit or space (punctuation is removed, not split on).
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return text.lower().split()

In [31]:
# Load the training data; copy `id` into an index named `idx` while keeping the `id` column.
df = (
    pd.read_csv('../data/train.csv')
    .assign(idx=lambda frame: frame['id'])
    .set_index('idx')
)
print(df.shape)
df.head()

(159571, 8)


Unnamed: 0_level_0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [32]:
# Column-wise totals of the numeric columns (axis=0 is the default for DataFrame.sum).
df.sum(numeric_only=True)

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [33]:
# `rating` = number of labels set on a comment (0-6); sort so the most-flagged comments come first.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['rating'] = df[label_cols].sum(axis=1)
df = df.sort_values(by='rating', ascending=False)
df.groupby('rating')['id'].nunique() # class imbalance issue

rating
0    143346
1      6360
2      3480
3      4209
4      1760
5       385
6        31
Name: id, dtype: int64

In [None]:
# Labels are not mutually exclusive: a comment may belong to any of the 6 classes.
# Each label may therefore need its own classifier, unless all can be tested jointly.
# Each step is (new column, source column, function applied element-wise), in dependency order.
feature_steps = [
    ('token_clean', 'comment_text', token_clean),
    ('sent_token', 'comment_text', sentence_tokenizer),
    ('polarity_sentence', 'sent_token', polarity_sentence),
    ('polarity_comment', 'sent_token', polarity_comment),
    ('word_count', 'token_clean', len),
]
for new_col, source_col, transform in feature_steps:
    df[new_col] = df[source_col].apply(transform)

In [None]:
# Unpack the (min, max, mean, scores) tuples returned by polarity_sentence() into scalar columns.
# The tuples live in the `polarity_sentence` column; `polarity_sent_token` is never created,
# so reading it raised KeyError.
polarity_stats = df['polarity_sentence']
df['polarity_min'] = [stats[0] for stats in polarity_stats]
df['polarity_max'] = [stats[1] for stats in polarity_stats]
df['polarity_mean'] = [stats[2] for stats in polarity_stats]
df.head()

In [None]:
# Sanity check: (rows, columns) after the feature columns were added.
df.shape

In [None]:
# Persist the frame (including the engineered columns) as a pickle file.
df.to_pickle('../data/toxictrain.pkl')

### Supplemental

In [None]:
# DOWNSAMPLING: NLP should be upsampled so do not do this
# df_t = df[df['rating']>0]
# df_nt = df[df['rating']==0]
# df_nt = shuffle(df_nt)
# df_nt = df_nt[-16225:]
# df = pd.concat([df_t,df_nt])
# df_0 = df[df['rating']==0]
# df_1 = df[df['rating']>0]
# print(df_0.shape,df_1.shape)

In [None]:
# downsample comments to number of toxic text
from sklearn.utils import resample
# Keep only the raw text (features) and the binary `toxic` label.
df1 = df.loc[:, ['comment_text', 'toxic']]
X = df1['comment_text']
y = df1['toxic']
print(X.shape, y.shape)
# Number of non-toxic comments (the majority class).
y.loc[y == 0].count()

In [None]:
# Draw as many non-toxic (y == 0) comments as there are toxic (y == 1) ones.
# The sample has to come from the majority class: resampling y == 1 against itself only
# bootstraps the toxic comments and leaves no non-toxic rows to balance against.
X_d, y_d = resample(X[y == 0], y[y == 0], replace=True, n_samples=X[y == 1].shape[0], random_state=42)
print(X_d.shape, y_d.shape)
# Every sampled label should be 0, and the count should equal the number of toxic comments.
y_d[y_d==0].count()

In [None]:
# Append the resampled rows (X_d, y_d) to the toxic rows.
# X is one-dimensional here, so np.vstack would stack the two parts as 2 rows of n values
# (shape (2, n)) instead of 2n samples; concatenating end-to-end keeps X_bal and y_bal
# the same length and preserves the pandas index and names.
X_bal = pd.concat([X[y == 1], X_d])
y_bal = pd.concat([y[y == 1], y_d])
print(X_bal.shape, y_bal.shape)

In [None]:
# Wrap the combined samples in a DataFrame and display it.
# NOTE(review): this rebinds X_bal, so re-running the cell wraps an existing DataFrame again.
X_bal = pd.DataFrame(X_bal)
X_bal

In [None]:
# Pair each balanced comment with its label.
# pd.concat only accepts pandas objects and raises TypeError for a NumPy array, so the
# frame is built from flattened values instead; this works whether X_bal / y_bal are
# pandas objects or arrays, and flattening keeps both in the same sample order.
df2 = pd.DataFrame({
    'comment_text': np.asarray(X_bal).ravel(),
    'toxic': np.asarray(y_bal).ravel(),
})

In [None]:
# Column-wise totals of the numeric columns of the balanced frame.
df2.sum(numeric_only=True)

In [None]:
# Bag-of-words counts for every comment in df1, paired with the binary `toxic` label.
y = df1['toxic']
count_vect = CountVectorizer()
X = count_vect.fit_transform(df1['comment_text'])
# Alternatives tried:
# X = X.toarray()
# count_vect.vocabulary_
# X = df[['word_count','polarity_min','polarity_max','polarity_mean']]
# y = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
print(X.shape, y.shape)

In [None]:
# Bag-of-words counts for every comment in df; the label is the binary `toxic` column.
count_vect = CountVectorizer()
X = count_vect.fit_transform(df['comment_text'])
print(X.shape)
y = df['toxic']
# Alternatives tried:
# X = X.toarray()
# count_vect.vocabulary_
# X = df[['word_count','polarity_min','polarity_max','polarity_mean']]
# y = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [None]:
# Convert the count matrix to a dense array.
# NOTE(review): X presumably has one row per comment and one column per vocabulary term;
# a dense copy of that may not fit in memory - confirm before running.
X = X.toarray()

In [None]:
# Shape of the feature matrix.
X.shape

In [None]:
# Shape of the label vector; its length should match the number of rows in X.
y.shape

In [None]:
# Sample (with replacement) as many y == 0 rows as there are y == 1 rows.
is_negative = (y == 0)
n_positive = X[y == 1].shape[0]
X_d, y_d = resample(
    X[is_negative],
    y[is_negative],
    replace=True,
    n_samples=n_positive,
    random_state=42,
)
print(X_d.shape, y_d.shape)

In [None]:
# downsample comments to number of toxic text
from sklearn.utils import resample
# Keep only the raw text (features) and the binary `toxic` label.
df1 = df.loc[:, ['comment_text', 'toxic']]
X = df1['comment_text']
y = df1['toxic']
print(X.shape, y.shape)
# Number of toxic comments (the minority class).
y.loc[y == 1].count()

In [None]:
# print(X[y==0].shape, X[y==1].shape)

In [None]:
# X_imb = np.vstack((X[y == 0], X[y == 1][:40]))

# y_imb = np.hstack((y[y == 0], y[y == 1][:40]))

# Shapes of the current feature and label containers.
print(X.shape, y.shape)

In [None]:
# Sample (with replacement) as many y == 0 rows as there are y == 1 rows.
is_negative = (y == 0)
n_positive = X[y == 1].shape[0]
X_d, y_d = resample(
    X[is_negative],
    y[is_negative],
    replace=True,
    n_samples=n_positive,
    random_state=42,
)
print(X_d.shape, y_d.shape)

In [None]:
# Vectorize the comments and pick the labels.
# NOTE(review): X is built from every row of df1.comment_text while y is set to the
# resampled y_d, so X and y presumably have different lengths - confirm the intended pairing
# (e.g. vectorizing the balanced text instead) before fitting a model on them.
# NOTE(review): print(X) prints the sparse matrix contents, not just its shape.
count_vect = CountVectorizer()
X = count_vect.fit_transform(df1.comment_text)
print(X)
y = y_d
# X = X.toarray()
# print(X.shape)
# count_vect.vocabulary_
# X = df[['word_count','polarity_min','polarity_max','polarity_mean']]
# y = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [None]:
# from sklearn.utils import resample

# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# X_train_up, y_train_up = resample(X_train[y_train == 1],y_train[y_train == 1],replace=True,
#                                     n_samples=X_train[y_train == 0].shape[0],random_state=123)

In [None]:
# print(X_train_up.shape, y_train_up.shape)

In [None]:
# df['stemmer'] = df['comment_text'].apply(stemmer)

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# df.groupby('rating').nunique()['id'] # class imbalance issue

In [None]:
# df = df.sort_values(['rating'],ascending=[False])
# df.head()

In [None]:
# pipeline
# pipe_lr = make_pipeline(StandardScaler(),
#                          PCA(n_components=2),
#                          LogisticRegression(random_state=1))
# pipe_lr.fit(X_train, y_train)
# y_pred = pipe_lr.predict(X_test)
# print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

In [None]:
# https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
# scaler = StandardScaler(with_mean=False).fit(X_train)
# X_train_s = scaler.transform(X_train)
# X_test_s = scaler.transform(X_test)

In [None]:
# class DenseTransformer(TransformerMixin):

#     def transform(self, X, y=None, **fit_params):
#         return X.todense()

#     def fit_transform(self, X, y=None, **fit_params):
#         self.fit(X, y, **fit_params)
#         return self.transform(X)

#     def fit(self, X, y=None, **fit_params):
#         return self

In [None]:
# pipeline = Pipeline([
#      ('vectorizer', CountVectorizer()), 
#      ('to_dense', DenseTransformer()), 
#      ('classifier', RandomForestClassifier())
# ])

In [None]:
# from sklearn.svm import LinearSVC
# # pipeline = Pipeline([('vectorizer', CountVectorizer()), ('classifier', LinearSVC())])
# predicted = pipeline.predict(X_test)


In [None]:
# Compare several classifiers with 5-fold cross-validation.
# Expects `X` (n_samples x n_features; NumPy array, SciPy sparse matrix or pandas object)
# and `y` (n_samples binary labels) from earlier cells, with matching lengths.
# Produces `roc` (per-model ROC curve of the last fold + mean AUC) and `results_dict`
# (per-model mean metrics), and prints a short report per model.

# Define models to test as [display name, unfitted estimator] pairs.
# NOTE(review): XGBClassifier is not imported anywhere in the visible notebook; it presumably
# needs `from xgboost import XGBClassifier` before this cell can run.
# NOTE(review): newer scikit-learn releases rename AdaBoost's `base_estimator` to `estimator`
# and drop `algorithm='SAMME.R'` - confirm against the installed version.
# NOTE(review): GaussianNB needs dense input, so it will presumably fail if X is sparse.
model_list = [['GaussianNB', GaussianNB()],
                ['BernoulliNB', BernoulliNB()],
#                 ['MultinomialNB', MultinomialNB()],
                ['DecisionTree', DecisionTreeClassifier(class_weight='balanced')],
                ['KNN', KNeighborsClassifier(10)],
                ['RandomForest', RandomForestClassifier(class_weight='balanced')],
                ['GradientBoost', GradientBoostingClassifier()],
                ['AdaBoost', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier(class_weight='balanced'))],
                ['XGBoost', XGBClassifier()],
                ['LogisticRegression', LogisticRegression(class_weight='balanced')],
                ['SVM', SVC(probability=True, class_weight='balanced')]] # scale data; F1 0.57

model_list_s = ['KNN','LogisticRegression','SVM'] # standardize/normalize data


def _take_rows(data, idx):
    """Return the rows of `data` at integer positions `idx` (pandas, NumPy or SciPy sparse)."""
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


y_all = np.asarray(y)
# Same shuffled splits for every model, so their scores are comparable.
kf = KFold(5, random_state=42, shuffle=True)
metric_names = ['accuracy', 'precision_s', 'precision_f', 'recall_s', 'recall_f', 'f1_s', 'f1_f', 'auc']

# Calculate metrics for each model
roc = {}
results_dict = {}
for model_name, model in model_list:
    needs_scaling = model_name in model_list_s
    fold_scores = {name: [] for name in metric_names}

    # Perform K-Fold CV and calculate metrics for each fold
    for train_idx, test_idx in kf.split(X, y=y_all):
        # Slice by the fold indices; previously they were ignored, so every "fold"
        # trained and scored on the same data.
        X_train, X_test = _take_rows(X, train_idx), _take_rows(X, test_idx)
        y_train, y_test = y_all[train_idx], y_all[test_idx]

        if needs_scaling:
            # Fit the scaler on the training fold only (no leakage) and keep the scaled data
            # local to this model. with_mean=False keeps sparse input sparse.
            scaler = StandardScaler(with_mean=False).fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Probability of the positive class: AUC and the ROC curve need scores, not hard labels.
        y_score = model.predict_proba(X_test)[:,1]

        fold_scores['accuracy'].append(accuracy_score(y_test, y_pred))
        fold_scores['precision_s'].append(precision_score(y_test, y_pred, pos_label=1))
        fold_scores['precision_f'].append(precision_score(y_test, y_pred, pos_label=0))
        fold_scores['recall_s'].append(recall_score(y_test, y_pred, pos_label=1))
        fold_scores['recall_f'].append(recall_score(y_test, y_pred, pos_label=0))
        fold_scores['f1_s'].append(f1_score(y_test, y_pred, pos_label=1))
        fold_scores['f1_f'].append(f1_score(y_test, y_pred, pos_label=0))
        fold_scores['auc'].append(roc_auc_score(y_test, y_score))

    # Calculate mean metric across K-folds
    mean_scores = {name: np.mean(values) for name, values in fold_scores.items()}
    mean_accuracy = mean_scores['accuracy']
    mean_auc = mean_scores['auc']

    # Capture TPR and FPR from last fold for plotting
    roc[model_name] = roc_curve(y_test, y_score), mean_auc
    results_dict[model_name] = mean_scores

    # Print formatted results (confusion matrix and report describe the last fold only)
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))

In [None]:
# Plot the ROC curve from the last K-Fold split
# NOTE(review): `plt` is not imported anywhere in the visible notebook; it presumably needs
# `import matplotlib.pyplot as plt` before this cell can run.
fig, ax = plt.subplots(figsize=(10, 10))

# Chance-level diagonal for reference
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')

# One curve per classifier; each `roc` entry is ((fpr, tpr, thresholds), mean AUC)
for model_name, (curve, model_auc) in roc.items():
    fpr, tpr = curve[0], curve[1]
    ax.plot(fpr, tpr, label='{}, AUC: {}%'.format(model_name, round(100*model_auc,1)))

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curve - All Models',fontweight='bold',fontsize=15)
ax.legend(loc='best')
plt.savefig('../charts/toxic_roc.png')

In [None]:
# Tabulate the mean cross-validation metrics: one row per model, best AUC first.
rd = pd.DataFrame(results_dict).T
# Sort while the values are still numeric. Sorting the formatted strings is lexicographic,
# so e.g. "100.0%" would rank below "95.0%" and "9.5%" above "85.0%".
rd = rd.sort_values(['auc'],ascending=[False])
# Format every metric as a percentage string with one decimal.
rd = rd.apply(lambda x: round(100*x,1).astype(str) + "%")
rd

In [None]:
# cv = CountVectorizer()

# r = pd.SparseDataFrame(cv.fit_transform(text), 
#                        df.index,
#                        cv.get_feature_names(), 
#                        default_fill_value=0)

In [None]:
# text = df['comment_text'].iloc[0]
# x_back = count_vectorizer(text)
# df1 = pd.DataFrame(x_back,columns=vectorizer.get_feature_names())

In [None]:
# stop = stopwords.words('english')
# stop += ['.', ',', '(', ')', "'", '"']
# stop = set(stop)

# counter = Counter()

# n = 2
# for doc in df['comment_text']:
#     words = TextBlob(doc).words
#     words = [w for w in words if w not in stop]
#     bigrams = ngrams(words, n)
#     counter += Counter(bigrams)

# for phrase, count in counter.most_common(30):
#     print('%20s %i' % (" ".join(phrase), count))

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# Three-document toy corpus for trying out the vectorizers.
text = ['That is should come to this!', 'This above all: to thine own self be true.', 'Something is rotten in the state of Denmark.']

# # CountVectorizer is a class; so `vectorizer` below represents an instance of that object.
# vectorizer = CountVectorizer(ngram_range=(1,2)) # selects uni and bigrams

# # call `fit` to build the vocabulary
# vectorizer.fit(text)

# # then, use `get_feature_names` to return the tokens
# print(vectorizer.get_feature_names())

# # finally, call `transform` to convert text to a bag of words
# x = vectorizer.transform(text)

In [None]:
# print('Sparse Matrix')
# # A compressed version; the "sparse" matrix.
# print(type(x))
# print(x)

# print ('Matrix')
# x_back = x.toarray()
# print(type(x_back))
# print(x_back)

In [None]:
# x_back = df['token_clean'].apply(count_vectorizer)
# Bag-of-words term counts for every comment in df; print the matrix shape, then display it.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['comment_text'])
print(X_train_counts.shape)
X_train_counts

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts in X_train_counts with TF-IDF; the shape is displayed as a check.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# X_train_tfidf is derived from df['comment_text'], so the labels must come from the same frame
# in the same row order. `twenty_train` is never defined in this notebook (NameError).
clf = MultinomialNB().fit(X_train_tfidf, df['toxic'])

In [None]:
# Show a dense document-term matrix with one column per vocabulary term.
# NOTE(review): `x_back` is only assigned in commented-out code and `vectorizer` is first
# assigned in a later cell, so this cell presumably fails on a fresh top-to-bottom run.
# NOTE(review): recent scikit-learn versions use `get_feature_names_out()` - confirm.
pd.DataFrame(x_back, columns=vectorizer.get_feature_names())

In [None]:
# x_back = count_vectorizer(text)
# pd.DataFrame(x_back, columns=vectorizer.get_feature_names())

In [None]:
#### TF: frequency in this document
#### IDF: inverse frequency in the corpus

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features over unigrams and bigrams of `text`, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
doc_vectors = vectorizer.fit_transform(text)

# 50 'pos' labels followed by 50 'neg' labels.
# NOTE(review): `text` as assigned earlier in this notebook holds 3 documents, while `classes`
# has 100 entries, so the fit below presumably fails on mismatched lengths - confirm
# which corpus these labels were meant for.
classes = np.array(['pos']*50 + ['neg']*50)


# NOTE(review): this rebinds the name `model`, which the model-comparison cell also uses.
model = MultinomialNB().fit(doc_vectors, classes)

In [None]:
# sentences = df['sent_token'].iloc[0]
# whitespace_tokenizer(sentences)

In [None]:
# tokenizer = TreebankWordTokenizer()
# tokenizer.tokenize(sentences[2])

In [None]:
# tokenizer = WhitespaceTokenizer()
# tokenizer.tokenize(sentences[2])

In [None]:
# df.head()