In [1]:
import pandas as pd
import re
import warnings 

# Silence every warning for the rest of the session (keeps the cell outputs short).
warnings.filterwarnings('ignore')

# Light cleaning: load only the label and text columns, lower-case each tweet,
# and collapse every run of non-letter characters into a single space.
df = pd.read_csv("./data/labeled_data.csv", usecols=['class', 'tweet'])
# Vectorised string methods replace the per-row Python lambda; regex=True is
# passed explicitly because the default for str.replace differs across pandas versions.
df['tweet'] = df['tweet'].str.lower().str.replace('[^A-Za-z]+', ' ', regex=True)


In [2]:
# Keep a random 25% of the rows. The seed is fixed so that a fresh
# "Restart & Run All" trains and evaluates on the same subsample every time.
# NOTE: this rebinds `df`, so re-running the cell alone shrinks the frame again.
df = df.sample(frac=.25, random_state=42)
# DataFrame.size is rows * columns (two columns are loaded), not the row count.
print(df.size)

12392


In [3]:
# Class labels (values of the `class` column):
# 0 - hate speech, 1 - offensive language, 2 - neither
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from stop_words import get_stop_words

In [4]:
# Pipeline: TF-IDF features (English stop words removed) fed to one linear
# SVM per class via a one-vs-rest wrapper.
clf = make_pipeline(
    TfidfVectorizer(stop_words=get_stop_words('en')),
    # probability=True enables predict_proba. SVC calibrates those probabilities
    # with an internal randomised cross-validation, so the seed is pinned to make
    # the predicted probabilities reproducible between runs.
    OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=42))
)

In [5]:
# Train the whole pipeline on the sampled tweets and their class labels.
# fit() hands back the fitted estimator, so `clf` keeps pointing at it.
clf = clf.fit(df['tweet'], df['class'])

In [6]:
# Show the pipeline's repr (its steps and their hyper-parameters) as the cell output.
clf

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['a', 'about', 'above', 'after',
                                             'again', 'against', '...
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False,
                                                   cache_size=200,
                      

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation over consecutive, unshuffled folds
# (KFold leaves shuffle off and random_state at None by default).
kfold = KFold(n_splits=10)
# One score per fold, using the estimator's default scorer; report the average.
results = cross_val_score(estimator=clf, X=df['tweet'], y=df['class'], cv=kfold)
print(results.mean())

0.8905750690499765


In [8]:
# Per-class probabilities for the same tweets the pipeline was fitted on
# (in-sample predictions); one row per tweet, one column per entry of clf.classes_.
predicted = clf.predict_proba(X=df['tweet'])
print(predicted)

[[0.05114464 0.93093741 0.01791795]
 [0.09934612 0.83281083 0.06784305]
 [0.01833816 0.9708667  0.01079514]
 ...
 [0.02348552 0.96996874 0.00654574]
 [0.0567784  0.92336518 0.01985642]
 [0.02068061 0.96087242 0.01844696]]


In [9]:
from joblib import dump
# Persist the fitted pipeline to disk; the returned list of written file names
# is shown as the cell output.
dump(value=clf, filename='algo.joblib')

['algo.joblib']