# Importing needed libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Importing "train.csv" and "test.csv" to data frame

In [2]:
train = pd.read_csv('Files/train.csv')
test = pd.read_csv('Files/test.csv')

# Adding the new feature called 'none' if all toxic_kind features value is 0. so if it is nice comment, 'none'=1, else =0

In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1 - train[label_cols].max(axis=1)

# Fill the null cell with "unknown" values for both "train" and "test" data frame

In [4]:
train['comment_text'].fillna('unknown', inplace=True) 
test['comment_text'].fillna('unknown', inplace=True)

# build the TF-IDF model.

In [5]:
import re, string

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
# this will split s by symbols create a list of string

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

# Tfid Text Vectorization
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, min_df=3, max_df=0.9, strip_accents='unicode',
                     use_idf=1, smooth_idf=1, sublinear_tf=1)

# train_term_doc is the crs_matrix -> compress row matrix with 1 row and n columns.
train_term_doc = vec.fit_transform(train['comment_text'])


# the same as train_term_doc (X_test crs_matrix)
test_term_doc = vec.transform(test['comment_text'])


# Checking type and values of train_term_doc and test_term_doc

In [6]:
print(type(train_term_doc))
print(train_term_doc[0])

print(type(test_term_doc))
print(test_term_doc[0])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 22653)	0.11225676787445103
  (0, 35387)	0.11974557970427763
  (0, 22697)	0.11607450613653243
  (0, 33939)	0.11986525556711788
  (0, 22587)	0.11805863895962679
  (0, 37321)	0.11570798175128982
  (0, 22797)	0.11408215406028954
  (0, 259012)	0.07853618304798789
  (0, 312720)	0.16402564821709473
  (0, 231196)	0.16669852085274084
  (0, 7513)	0.050569258726441545
  (0, 193977)	0.04236000551960604
  (0, 331780)	0.0907489579883015
  (0, 280170)	0.13515323124110395
  (0, 352591)	0.05515959489565422
  (0, 368268)	0.07465956912509807
  (0, 169974)	0.05914892960828189
  (0, 354240)	0.1326518250254369
  (0, 368343)	0.09379933899140203
  (0, 309415)	0.08032576894305525
  (0, 351141)	0.11683624691357157
  (0, 141577)	0.046644903958262866
  (0, 288508)	0.09175569723097404
  (0, 68485)	0.10174180819931931
  (0, 23017)	0.06149535138131129
  :	:
  (0, 172411)	0.11169554291171112
  (0, 335723)	0.04728814119210447
  (0, 268787)	0.031841956504032994
  (0, 115362)	

# Checking vocabulary after extracting words from comment_text

In [7]:
vec.vocabulary_

{'explanation': 155788,
 'why': 409552,
 'the': 359013,
 'edits': 146170,
 'made': 231608,
 'under': 389638,
 'my': 247352,
 'username': 394106,
 'hardcore': 181782,
 'metallica': 239747,
 'fan': 158369,
 'were': 404947,
 'reverted': 313199,
 '?': 41026,
 'they': 372640,
 'weren': 405772,
 "'": 6837,
 't': 350338,
 'vandalisms': 395665,
 ',': 11880,
 'just': 216481,
 'closure': 115362,
 'on': 268787,
 'some': 335723,
 'gas': 172411,
 'after': 55748,
 'i': 193972,
 'voted': 399511,
 'at': 82724,
 'new': 252275,
 'york': 419031,
 'dolls': 141447,
 'fac': 156826,
 '.': 22383,
 'and': 64078,
 'please': 288402,
 'don': 141575,
 'remove': 309308,
 'template': 354198,
 'from': 168981,
 'talk': 352461,
 'page': 279728,
 'since': 331671,
 'm': 230844,
 'retired': 312707,
 'now': 259003,
 '89': 37316,
 '205': 33937,
 '38': 35380,
 '27': 34568,
 'explanation why': 155833,
 'why the': 409695,
 'the edits': 362169,
 'edits made': 146265,
 'made under': 231793,
 'under my': 389753,
 'my username': 2

# Method to calculate the avg of TFIDF (Naives Bayes Model)

In [8]:
def pr(y_i, y):
    p = train_term_doc[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

# Method to get trainning model 

In [9]:
def get_mdl(y):
    
    # y_train
    y = y.values
    
    # pr(1,y): calculate te avg of TFIDF of toxic sentences
    # pr(0,y): calculate te avg of TFIDF of nontoxic sentences
    # get log helps to increase the weight of the word appears in toxic sentence
    r = np.log(pr(1,y) / pr(0,y))
    
    # initilize Logistic Regression
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    
    # get the X_train
    x_nb = train_term_doc.multiply(r)
    
    # return training model from X_train and y_train
    # also return the weight of the word appreas in toxic sentence too
    return m.fit(x_nb, y), r

# Create an probability array of the sample for each class in the model. 

In [10]:
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((len(test), len(label_cols)))

# Returns the probability of the sample for each class in the model. 

In [11]:
# i represents for the index
# j represent for the column (class) we are using to calculate
for i, j in enumerate(label_cols):
    
    # get training model and weight from get_mdl method
    m,r = get_mdl(train[j])
    
    # transfering to the probability array by predict_proba method from Logistic Regression 
    
    # we would like to get the predicted probabilities for the positive label only 
    
    # that's why we use logistic_model.predict_proba(data)[:,1]
    
    preds[:,i] = m.predict_proba(test_term_doc.multiply(r))[:,1]



# Extract data from preds array to actual submission.csv to test in Kaggle

In [12]:
subm = pd.read_csv('Files/sample_submission.csv')
sub = pd.DataFrame({'id': subm['id']})
submission = pd.concat([sub, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission.csv', index=False)

# Calculate mean_absolute_error between the actual (test) label and the pred_lable by using two ways: MAE (Mean Absolute Error) and RMSE (Root Mean Square Error)

In [14]:
from sklearn import metrics

actual_label = pd.read_csv('Files/test_labels.csv')
pred_label = pd.read_csv('submission.csv')

actual_label[label_cols] = actual_label[label_cols].replace(-1, 1)

print("MAE: ")
for col in label_cols:
    actual = actual_label[col].values
    pred = pred_label[col].values
    print(col, metrics.mean_absolute_error(actual, pred))
    
from sklearn import metrics
import numpy as np

print("\nRMSE")
for col in label_cols:
    actual = actual_label[col].values
    pred = pred_label[col].values
    print(col, np.sqrt(metrics.mean_squared_error(actual, pred)))


MAE: 
toxic 0.47286738472184975
severe_toxic 0.5767147192523256
obscene 0.5136402669890279
threat 0.5821299790243356
insult 0.5389356486819584
identity_hate 0.5789006890911239

RMSE
toxic 0.6605789476649551
severe_toxic 0.7553369891125109
obscene 0.7009065074997879
threat 0.7623409231058985
insult 0.7163783359083947
identity_hate 0.7577892207453808
