In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the labelled training comments and the unlabelled test comments.
train = pd.read_csv("Files/train.csv")
test = pd.read_csv("Files/test.csv")

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Derived column 'none': 1 minus the row-wise maximum over the label columns, i.e. 1 when
# no toxicity label is set and 0 otherwise (assumes the labels are 0/1 — TODO confirm).
train['none'] = 1 - train[label_cols].max(axis=1)

# Replace missing comments with the placeholder 'unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on a column selection:
# that form is chained assignment, which warns on recent pandas and does not modify the
# parent frame once Copy-on-Write is enabled.
train['comment_text'] = train['comment_text'].fillna('unknown')
test['comment_text'] = test['comment_text'].fillna('unknown')

In [3]:
# Dump both frames, train first and then test, each followed by a newline.
print(train, test, sep="\n")

                      id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
5       00025465d4725e87  "\n\nCongratulations from me as well, use the ...   
6       0002bcb3da6cb337       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7       00031b1e95af7921  Your vandalism to the Matt Shirvington article...   
8       00037261f536c51d  Sorry if the word 'nonsense' was offensive to ...   
9       00040093b2687caa  alignment on this subject and which are contra...   
10      0005300084f90edc  "\nFair use rationale for Image:Wonju.jpg\n\nT...   
11      00054a5e18b50dd4  bbq \n\nbe a man and lets 

In [4]:
import re, string

# Character class covering ASCII punctuation plus a handful of typographic symbols.
# re.escape keeps ']', '\', '^' and '-' literal inside the class; when the punctuation
# string is interpolated raw, its backslash only escapes the following ']' and is
# therefore never matched as a punctuation character itself.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split text into a list of tokens.

    Every character matched by ``re_tok`` is padded with spaces so that it becomes
    a token of its own; the padded string is then split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or blank input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [5]:
# Number of rows in the training frame.
n = len(train)
print(n)

159571


In [6]:
# TF-IDF features over unigrams and bigrams produced by the custom tokenizer.
# The three flags are passed as real booleans: newer scikit-learn releases validate
# constructor parameter types and no longer accept the integer 1 for them.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Fit on the training comments only, then reuse the fitted vectorizer for the test comments.
train_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

In [7]:
# Display the fitted vocabulary: a dict mapping each token / bigram to an integer index.
vec.vocabulary_

{'explanation': 155788,
 'why': 409552,
 'the': 359013,
 'edits': 146170,
 'made': 231608,
 'under': 389638,
 'my': 247352,
 'username': 394106,
 'hardcore': 181782,
 'metallica': 239747,
 'fan': 158369,
 'were': 404947,
 'reverted': 313199,
 '?': 41026,
 'they': 372640,
 'weren': 405772,
 "'": 6837,
 't': 350338,
 'vandalisms': 395665,
 ',': 11880,
 'just': 216481,
 'closure': 115362,
 'on': 268787,
 'some': 335723,
 'gas': 172411,
 'after': 55748,
 'i': 193972,
 'voted': 399511,
 'at': 82724,
 'new': 252275,
 'york': 419031,
 'dolls': 141447,
 'fac': 156826,
 '.': 22383,
 'and': 64078,
 'please': 288402,
 'don': 141575,
 'remove': 309308,
 'template': 354198,
 'from': 168981,
 'talk': 352461,
 'page': 279728,
 'since': 331671,
 'm': 230844,
 'retired': 312707,
 'now': 259003,
 '89': 37316,
 '205': 33937,
 '38': 35380,
 '27': 34568,
 'explanation why': 155833,
 'why the': 409695,
 'the edits': 362169,
 'edits made': 146265,
 'made under': 231793,
 'under my': 389753,
 'my username': 2

In [8]:
# Short aliases for the train / test term-document matrices used by the modelling cells.
x, test_x = train_term_doc, test_term_doc

In [15]:
def pr(y_i, y, features=None):
    """Smoothed per-column mean of the feature rows whose label equals ``y_i``.

    Parameters
    ----------
    y_i : int
        Label value selecting the rows to average.
    y : array-like
        One label per row of ``features``; compared element-wise with ``y_i``.
    features : matrix, optional
        Feature matrix to aggregate. Defaults to the notebook-level training
        matrix ``x``, so existing two-argument calls keep working.

    Returns
    -------
    Column sums of the selected rows plus one, divided by the number of selected
    rows plus one (the +1 terms keep the later ratio and logarithm finite).
    """
    if features is None:
        features = x  # notebook-level training matrix
    selected = y == y_i
    p = features[selected].sum(0)
    return (p + 1) / (selected.sum() + 1)


def get_mdl(y, features=None, n_neighbors=3):
    """Fit a distance-weighted KNN classifier on log-ratio-scaled features.

    Parameters
    ----------
    y : pandas.Series or array-like
        Labels for one target column, one entry per row of ``features``.
    features : sparse matrix, optional
        Training feature matrix; must provide ``.multiply``. Defaults to the
        notebook-level training matrix ``x``.
    n_neighbors : int, default 3
        Number of neighbours used by the classifier.

    Returns
    -------
    tuple
        ``(fitted_classifier, r)`` where ``r`` holds, per feature column, the log of
        the ratio between the smoothed means for label 1 and label 0. The same
        ``r`` must be applied to any matrix passed to the classifier later.
    """
    # Imported here so the function does not rely on an import executed in another cell.
    from sklearn.neighbors import KNeighborsClassifier

    if features is None:
        features = x  # notebook-level training matrix
    y = np.asarray(y)
    # pr(1, ...): smoothed mean feature value over rows labelled 1
    # pr(0, ...): smoothed mean feature value over rows labelled 0
    # The log ratio is positive for features that are heavier in label-1 rows
    # and negative for features that are heavier in label-0 rows.
    r = np.log(pr(1, y, features) / pr(0, y, features))
    print(r)
    my_knn_for_toxic_comments = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    x_nb = features.multiply(r)
    return my_knn_for_toxic_comments.fit(x_nb, y), r

In [16]:
# Prediction buffer: one row per test row, one column per entry of label_cols
# ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate').
preds = np.zeros(shape=(test.shape[0], len(label_cols)))
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit one model per label column and store column 1 of its predict_proba output
# in the matching column of preds.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    print(col_idx)
    model, log_ratio = get_mdl(train[label])
    print("Done training model")
    # The test matrix is scaled with the same ratio that was used for training.
    scaled_test = test_x.multiply(log_ratio)
    preds[:, col_idx] = model.predict_proba(scaled_test)[:, 1]

    print(preds)
    print(len(preds))

fit toxic
0
[[1.17381793 2.09428535 0.21757157 ... 2.03104827 0.8448213  0.8448213 ]]
Done training model


In [None]:
# Wrap the prediction array in a frame whose columns are named after the labels.
y_predict = pd.DataFrame(data=preds, columns=label_cols)
print(type(y_predict))

In [None]:
# Read the reference labels for the test set and keep only the label columns.
test_labels = pd.read_csv("Files/test_labels.csv")
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_test = test_labels.loc[:, label_cols]
print(y_test)


In [None]:
from sklearn.metrics import accuracy_score

# Step 5: evaluate the accuracy.
# y_predict holds predict_proba scores; accuracy_score rejects continuous targets,
# so the scores are first turned into hard 0/1 labels.
threshold = 0.5
y_pred_labels = (y_predict.values > threshold).astype(int)
y_true = y_test.values

# NOTE(review): the test_labels file is understood to mark unscored rows with -1 — confirm.
# Rows containing a negative label are excluded; if there are none, every row is kept.
# Assumes test_labels.csv rows are in the same order as test.csv — TODO confirm.
scored_rows = (y_true >= 0).all(axis=1)

# With 2-D indicator inputs accuracy_score reports subset accuracy:
# a row counts as correct only when every label in it matches.
accuracy = accuracy_score(y_true[scored_rows], y_pred_labels[scored_rows])

print("Accuracy KNN: ", accuracy)

