In [1]:
import re
import time

import numpy as np
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the labelled sample from a CSV file in the current working directory.
train = pd.read_csv(filepath_or_buffer="train_small.csv")

In [3]:
# Columns kept for modelling: the raw text and its boolean toxicity label.
columns = ["Text", "IsToxic"]

In [4]:
# Keep only the columns listed in `columns`; every row is retained.
train = train.loc[:, columns]

In [5]:
# Preview the first five rows of the reduced frame.
train.head(n=5)

Unnamed: 0,Text,IsToxic
0,If only people would just take a step back and...,False
1,Law enforcement is not trained to shoot to app...,True
2,\nDont you reckon them 'black lives matter' ba...,True
3,There are a very large number of people who do...,False
4,"The Arab dude is absolutely right, he should h...",False


In [6]:
# Number of rows whose IsToxic label equals 0 (i.e. False).
sum(train["IsToxic"].eq(0))

538

In [7]:
# Number of rows whose IsToxic label equals 1 (i.e. True).
sum(train["IsToxic"].eq(1))

462

In [8]:
# Per-column flag: True if that column contains at least one missing value.
train.isna().any(axis=0)

Text       False
IsToxic    False
dtype: bool

In [9]:
# Target vector: the IsToxic label of every row.
y = train["IsToxic"].values

# Hold out 30% of the rows for testing and train on the remaining 70%.
# The split is shuffled, stratified on the label so both parts keep the
# same class proportions, and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train["Text"].values,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=1,
)

In [10]:
# Bag-of-words features for the model: binary=True records only whether a
# token is present (not how often), and stop_words='english' drops
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(binary=True, stop_words='english')

# Learn the vocabulary from the TRAINING texts only and encode them in one step.
# Fitting on train + test would let the held-out data shape the feature space
# (data leakage), so the test split must stay unseen at this point.
x_train_vec = vectorizer.fit_transform(x_train)

# Encode the test texts with the training vocabulary; tokens that never
# occurred in the training texts are simply ignored.
x_test_vec = vectorizer.transform(x_test)

In [11]:
# Linear-kernel support vector classifier with probability estimates enabled.
# The class is referenced as `SVC` rather than `svm.SVC`: this cell binds the
# name `svm` to the model, so going through the module attribute would raise
# AttributeError the second time the cell is run.
# random_state seeds the internal shuffling used for the probability estimates
# (probability=True), which makes `prob` reproducible across runs.
svm = SVC(kernel='linear', probability=True, random_state=1)

# Time only the fit, so the reported "training time" excludes inference.
start = time.time()
svm.fit(x_train_vec, y_train)
fit_seconds = time.time() - start

# Class-membership probabilities for the test texts (columns follow svm.classes_).
prob = svm.predict_proba(x_test_vec)

# Hard class predictions for the test texts.
y_pred_svm = svm.predict(x_test_vec)
print(f"training time: {fit_seconds:.2f}s")

training time: 0.20s


In [12]:
# Share of test texts whose predicted label matches the true label.
# `accuracy_score` is imported in the notebook's import cell at the top.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy score for SVC is: {svm_accuracy*100:0.2f}%")

Accuracy score for SVC is: 68.67%
