# CS 230 Content Moderation Project #
*by: Guy Wuollete, Martin Amethier, Isabella Garcia-Camargo*

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import string
import datetime
import re
%load_ext autoreload
%autoreload 2

#modeling imports
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
plt.style.use('fivethirtyeight')

#tf imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse.linalg import svds



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Load the competition data: labelled training comments, unlabelled test
# comments, and the sample submission file.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
subm = pd.read_csv('./data/sample_submission.csv')
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Indicator column: 1 when a comment carries none of the labels, 0 otherwise.
train['none'] = 1 - train[labels].max(axis=1)

# Missing comments are replaced with a placeholder string so every row holds text.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: with pandas Copy-on-Write that in-place call operates on a
# temporary object and leaves the DataFrame unchanged.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


In [None]:
#tokenizing stuff
# Character class of every symbol that should become its own token.
# string.punctuation is passed through re.escape because it contains a
# backslash followed by ']': interpolated raw, the backslash would merely
# escape the bracket and would never be matched as punctuation itself.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize_vals(s):
    """Split text into tokens, treating each punctuation mark as a token.

    Every character matched by ``re_tok`` is padded with a space on both
    sides, then the string is split on whitespace.

    Args:
        s: the text to tokenize.

    Returns:
        A list of token strings (empty for an empty or whitespace-only input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [49]:
n = train.shape[0]  # number of rows in the training frame

# TF-IDF features over unigrams and bigrams produced by the custom tokenizer.
# The on/off options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the int 1 for boolean parameters.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize_vals,
    min_df=3,            # ignore terms that appear in fewer than 3 comments
    max_df=0.9,          # ignore terms that appear in more than 90% of comments
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,   # use 1 + log(tf) instead of the raw term frequency
)

In [51]:
# Fit the vectorizer on the training comments only, then reuse the fitted
# vectorizer to encode the test comments with the same feature columns.
train_comments, test_comments = train['comment_text'], test['comment_text']
train_terms = vec.fit_transform(train_comments)
test_terms = vec.transform(test_comments)
train_terms

<159571x426005 sparse matrix of type '<class 'numpy.float64'>'
	with 17775104 stored elements in Compressed Sparse Row format>

In [53]:
#BAYESIAN feature equation
# TODO (original author's note): delete this cell and afterwards
def pr(y_i, y, features=None):
    """Smoothed per-feature sum for the rows whose label equals ``y_i``.

    Sums the feature rows where ``y == y_i`` column-wise, adds 1 to each
    column sum, and divides by the number of such rows plus 1.

    Args:
        y_i: the label value to select (the callers in this notebook pass 0 or 1).
        y: array of labels, one per row of the feature matrix.
        features: feature matrix to aggregate. Defaults to the notebook-level
            ``x`` so existing two-argument calls keep working; pass it
            explicitly to avoid depending on cell execution order.

    Returns:
        One value per feature column, in whatever array/matrix type the
        feature matrix's ``sum(0)`` produces.
    """
    if features is None:
        # Backward-compatible fallback to the global assigned in another cell.
        features = x
    mask = (y == y_i)
    p = features[mask].sum(0)
    return (p + 1) / (mask.sum() + 1)

In [52]:
# Short aliases for the TF-IDF matrices; pr() and get_mdl() read `x` as a
# global, and the prediction loop reads `test_x`.
x, test_x = train_terms, test_terms

In [42]:
def get_mdl(y):
    """Fit a logistic regression on log-count-ratio-scaled features for one label.

    Computes ``r = log(pr(1, y) / pr(0, y))``, scales every column of the
    global feature matrix ``x`` by ``r``, and fits a logistic regression on
    the scaled matrix.

    Args:
        y: pandas Series of labels for the rows of ``x`` (``.values`` is
            taken; the notebook's label columns hold 0/1).

    Returns:
        Tuple ``(model, r)``: the fitted ``LogisticRegression`` and the ratio
        vector, which must also be applied to any matrix passed to the model.
    """
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver, so pin
    # it explicitly; the current default solver (lbfgs) raises on dual=True.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [54]:
# One row per test comment, one column per label.
preds = np.zeros((len(test), len(labels)))

for col, label in enumerate(labels):
    print('fit', label)
    m, r = get_mdl(train[label])
    # Scale the test features by the same ratio vector used for training, then
    # keep column 1 of predict_proba (the class-1 probability for 0/1 labels).
    scaled_test = test_x.multiply(r)
    preds[:, col] = m.predict_proba(scaled_test)[:, 1]

fit toxic
fit severe_toxic




fit obscene




fit threat




fit insult




fit identity_hate




In [45]:
# Display the prediction matrix (rows: test comments, columns: entries of `labels`).
preds

array([[9.99987829e-01, 1.06258963e-01, 9.99986693e-01, 2.36852684e-03,
        9.62577858e-01, 9.49549878e-02],
       [2.87289643e-03, 6.03642947e-04, 1.89337915e-03, 1.00365667e-04,
        2.22683811e-03, 3.42314203e-04],
       [1.17549952e-02, 8.63809127e-04, 5.58791370e-03, 1.01695598e-04,
        3.20967629e-03, 2.96720508e-04],
       ...,
       [1.38471792e-03, 1.54785328e-04, 2.67019265e-03, 7.52030524e-05,
        9.37269508e-04, 1.88660524e-04],
       [7.98765333e-03, 3.33015177e-04, 2.25503371e-03, 9.38784835e-05,
        2.19463597e-03, 8.70897338e-04],
       [9.45721897e-01, 1.14472731e-04, 4.28269839e-01, 3.39328398e-04,
        4.47160399e-02, 8.05470041e-04]])