# Importing files, packages and setting up data frame

In [1]:
import pandas as pd
import numpy as np

# Load the training and test comments.
train = pd.read_csv('Files/train.csv')
test = pd.read_csv('Files/test.csv')

# Fill empty comments with the placeholder "unknown" in both frames.
# The filled column is assigned back instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place
# form is deprecated in recent pandas and, with Copy-on-Write enabled, no
# longer updates the parent frame, which would leave NaN in `comment_text`.
train['comment_text'] = train['comment_text'].fillna('unknown')
test['comment_text'] = test['comment_text'].fillna('unknown')

test_label = pd.read_csv('Files/test_labels.csv')

# Label columns: one binary target per column.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Every -1 in the label columns is rewritten to 1.
# NOTE(review): if -1 marks rows that were not scored, mapping them to the
# positive class will skew any evaluation against y_test -- confirm intent.
test_label[label_cols] = test_label[label_cols].replace(-1, 1)

# Test labels, restricted to the label columns.
y_test = test_label[label_cols]

# Initializing the TF-IDF model

In [2]:
import re, string
# Matches one punctuation mark or special symbol; the capture group lets the
# substitution below re-insert the matched character.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into a list of tokens, with each symbol as its own token.

    Every character matched by ``re_tok`` is surrounded with spaces, then the
    string is split on whitespace, so ``"Hi, you!"`` becomes
    ``['Hi', ',', 'you', '!']``.
    """
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()



from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF text vectorization over unigrams and bigrams, using the custom tokenizer.
# The on/off options are passed as real booleans rather than the integer 1:
# recent scikit-learn releases validate parameter types and reject non-bool
# values for `use_idf`, `smooth_idf` and `sublinear_tf`.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize, min_df=3, max_df=0.9,
                      strip_accents='unicode', use_idf=True, smooth_idf=True,
                      sublinear_tf=True)


# Function to build the per-label lists of X_train and X_test matrices from the TF-IDF model

In [3]:
# vec: TF-IDF model

def produceTrainAndTestmodel(vec):
    """Build one re-weighted train/test feature matrix per label.

    ``vec`` is fitted on ``train['comment_text']`` and then applied, without
    refitting, to ``test['comment_text']``. For every label in ``label_cols``
    a Naive-Bayes-style log ratio is computed per feature column and both
    matrices are multiplied element-wise by it.

    Parameters
    ----------
    vec : vectorizer exposing ``fit_transform`` and ``transform``
        (the TF-IDF model in this notebook; its outputs must support
        boolean row indexing, ``.sum(0)`` and ``.multiply`` -- presumably
        scipy sparse matrices).

    Returns
    -------
    (X_trains, X_tests) : tuple of two lists
        Each list holds one matrix per entry of ``label_cols``, in that order.
    """
    # Fit on the training comments, then reuse the fitted model on the test set.
    train_features = vec.fit_transform(train['comment_text'])
    test_features = vec.transform(test['comment_text'])

    def smoothed_mean(row_mask):
        # Column sums over the selected rows divided by the number of selected
        # rows, with +1 added to both numerator and denominator.
        column_totals = train_features[row_mask].sum(0)
        return (column_totals + 1) / (row_mask.sum() + 1)

    X_trains, X_tests = [], []

    for label in label_cols:
        targets = train[label].values

        # Log of (smoothed mean in rows labelled 1) over (smoothed mean in rows
        # labelled 0): positive for features that weigh more in label-1 rows.
        mean_positive = smoothed_mean(targets == 1)
        mean_negative = smoothed_mean(targets == 0)
        log_ratio = np.log(mean_positive / mean_negative)

        # Scale every feature column of both matrices by its log ratio.
        X_trains.append(train_features.multiply(log_ratio))
        X_tests.append(test_features.multiply(log_ratio))

    return X_trains, X_tests

# Function to calculate predictions when we apply a model (algorithm)

In [4]:
# Parameter: model - an estimator exposing fit() and predict_proba()

# Return: array of predictions

def FindPredictions(model):
    """Fit ``model`` once per label and collect its class-1 probabilities.

    The per-label feature matrices are obtained by calling
    ``produceTrainAndTestmodel(vec)`` on every invocation. The same ``model``
    object is then re-fitted for each label in ``label_cols`` against
    ``train[label]``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.

    Returns
    -------
    numpy.ndarray of shape ``(len(test), len(label_cols))``
        Column ``k`` holds ``predict_proba(...)[:, 1]`` for ``label_cols[k]``.
    """
    X_trains, X_tests = produceTrainAndTestmodel(vec)

    # One row per test comment, one column per label.
    y_predict = np.zeros((len(test), len(label_cols)))

    per_label = zip(label_cols, X_trains, X_tests)
    for column, (label, features_train, features_test) in enumerate(per_label):
        # Train on this label's re-weighted features.
        model.fit(features_train, train[label].values)

        # Column 1 of predict_proba is the probability of output 1.
        y_predict[:, column] = model.predict_proba(features_test)[:, 1]

    return y_predict