In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [3]:
dftrain=pd.read_csv('train.csv')

In [4]:
print('is the index unique :' ,dftrain.id.is_unique)
dftrain.set_index('id', inplace=True)

is the index unique : True


In [5]:
dftrain.severe_toxic.unique()

array([0, 1])

In [6]:
dftest=pd.read_csv('test.csv')
dftest_labels=pd.read_csv('test_labels.csv')

In [7]:
dftest.set_index('id', inplace = True)
dftest_labels.set_index('id', inplace=True)

In [8]:
labes_validity = (dftest_labels == -1).sum(axis=1)
valid_labels_id =labes_validity[labes_validity ==0].index

In [9]:
dftest_valid=dftest[dftest.index.isin(valid_labels_id)]
dftest_labels_valid =  dftest_labels[dftest_labels.index.isin(valid_labels_id)]

In [10]:
print('labels shape', dftest_labels_valid.shape)
print('test data shape ', dftest_valid.shape)

labels shape (63978, 6)
test data shape  (63978, 1)


In [11]:
dftrain.severe_toxic.unique()

array([0, 1])

## Bag of words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer_bow=CountVectorizer(max_features= 8000)

In [14]:
vectorizer_bow.fit(dftrain.comment_text)

CountVectorizer(max_features=8000)

In [16]:
len(vectorizer_bow.vocabulary_)

8000

In [17]:
#Transform documents to document-term matrix.
vector_bow=vectorizer_bow.transform(dftrain.comment_text)

In [18]:
vector_test_bow = vectorizer_bow.transform(dftest_valid.comment_text)

In [19]:
print(vector_bow.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
vector_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## TFIDF

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
vectorizer_tfidf=TfidfVectorizer()

In [24]:
# tokenize and build vocab
vectorizer_tfidf.fit(dftrain.comment_text)

TfidfVectorizer()

In [25]:
vector_tdfidf=vectorizer_tfidf.transform(dftrain.comment_text)

In [26]:
vector_test_tfdfidf=vectorizer_tfidf.transform(dftest_valid.comment_text)

## Classification

In [27]:
Y=dftrain.loc[:, ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
clf=RandomForestClassifier()

In [30]:
clf.fit(vector_bow, Y)

RandomForestClassifier()

In [31]:
clf.score(vector_test_bow, dftest_labels_valid)

0.8926349682703429

### Training with tf-idf

In [32]:
%%time
clf=RandomForestClassifier()

CPU times: user 31 µs, sys: 1e+03 ns, total: 32 µs
Wall time: 43.2 µs


In [33]:
%%time
clf.fit(vector_tdfidf, Y)

CPU times: user 22min 15s, sys: 5.6 s, total: 22min 21s
Wall time: 43min 31s


RandomForestClassifier()

In [34]:
clf.score(vector_test_tfdfidf, dftest_labels_valid)

0.8817718590765575

In [35]:
%%time
y_pred_tf_idf = clf.predict(vector_test_tfdfidf)

CPU times: user 11 s, sys: 44.3 ms, total: 11 s
Wall time: 11.1 s


In [109]:
pred_toxic=pd.Series(y_pred_tf_idf[:,0], index=dftest_labels_valid.index)
pred_toxic.name = 'pred_toxic'
results_vs_pred=pd.concat([pred_toxic,dftest_labels_valid.toxic], axis=1)

results_vs_pred['is_correct']=results_vs_pred.apply(lambda row: True if row['pred_toxic'] ==  row['toxic'] 
                                                    else False, axis=1)
print('actual toxic', results_vs_pred.toxic.sum())
print('predicted as toxic', results_vs_pred[results_vs_pred.toxic == True].is_correct.sum())
print('recall ', results_vs_pred[results_vs_pred.toxic == True].is_correct.sum()/results_vs_pred.toxic.sum())

actual toxic 6090
predicted as toxic 3049
recall  0.5006568144499179


In [131]:
pred_severe_toxic=pd.Series(y_pred_tf_idf[:,1], index=dftest_labels_valid.index)
pred_severe_toxic.name = 'pred_severe_toxic'
results_vs_pred_severetoxic=pd.concat([pred_severe_toxic, dftest_labels_valid.severe_toxic], axis=1)
results_vs_pred_severetoxic['is_correct']=results_vs_pred_severetoxic.apply(lambda row: True if row['severe_toxic'] ==row['pred_severe_toxic'] else False,
                                 axis=1)

print('actual severe toxic ', (results_vs_pred_severetoxic.severe_toxic==1).sum())
print('predicted as toxic ', pred_severe_toxic.sum())
print('recall ', ((results_vs_pred_severetoxic.severe_toxic==1) & (results_vs_pred_severetoxic.is_correct ==
                                                  True)).sum()/(results_vs_pred_severetoxic.severe_toxic==1).sum())

actual severe toxic  367
predicted as toxic  66
recall  0.035422343324250684


In [70]:
df_ypred_tf_idf=pd.DataFrame(y_pred_tf_idf, columns=list(dftest_labels_valid.columns))
df_ypred_tf_idf.apply(lambda x: x.value_counts())

In [91]:
print(classification_report(dftest_labels_valid.to_numpy(), y_pred_tf_idf,
                            target_names=list(dftest_labels_valid.columns)))

               precision    recall  f1-score   support

        toxic       0.55      0.50      0.53      6090
 severe_toxic       0.20      0.04      0.06       367
      obscene       0.48      0.45      0.47      3691
       threat       0.42      0.02      0.04       211
       insult       0.81      0.29      0.43      3427
identity_hate       0.74      0.04      0.08       712

    micro avg       0.56      0.40      0.46     14498
    macro avg       0.53      0.22      0.27     14498
 weighted avg       0.59      0.40      0.45     14498
  samples avg       0.05      0.04      0.04     14498



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [133]:
pwd

'/Users/mayracervantes/Documentsmac/DataScience/projects/toxic_comments/data'