In [23]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [24]:
data = pd.read_csv('./data/crime_data_main.csv')

In [25]:

X_train, X_test, y_train, y_test = train_test_split(data['Preprocessed'], data['class'], test_size=0.20, random_state=1, stratify=data['class'])

In [26]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(X_train)
#applying tf idf to training data
X_train_tf = tf_idf.transform(X_train)

In [27]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 19826, n_features: 26231


In [28]:
X_test_tf = tf_idf.transform(X_test)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 4957, n_features: 26231


In [29]:
#naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)
#predicted y
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [33]:
print(metrics.classification_report(y_test, y_pred, target_names=['Hate', 'Offense', 'Neutral'], zero_division=0))

              precision    recall  f1-score   support

        Hate       0.44      0.17      0.24       286
     Offense       0.82      0.97      0.89      3838
     Neutral       0.78      0.28      0.42       833

    accuracy                           0.81      4957
   macro avg       0.68      0.47      0.52      4957
weighted avg       0.79      0.81      0.77      4957



In [31]:
from sklearn import svm
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X_train_tf, y_train)
y_pred = svm_classifier.predict(X_test_tf)
print(metrics.classification_report(y_test, y_pred, target_names=['Hate', 'Offense', 'Neutral']))


              precision    recall  f1-score   support

        Hate       0.59      0.26      0.36       286
     Offense       0.94      0.96      0.95      3838
     Neutral       0.86      0.92      0.89       833

    accuracy                           0.91      4957
   macro avg       0.80      0.71      0.73      4957
weighted avg       0.90      0.91      0.90      4957



In [32]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_tf, y_train)
y_pred = knn_classifier.predict(X_test_tf)
print(metrics.classification_report(y_test, y_pred, target_names=['Hate', 'Offense', 'Neutral']))


              precision    recall  f1-score   support

        Hate       0.44      0.17      0.24       286
     Offense       0.82      0.97      0.89      3838
     Neutral       0.78      0.28      0.42       833

    accuracy                           0.81      4957
   macro avg       0.68      0.47      0.52      4957
weighted avg       0.79      0.81      0.77      4957



In [34]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(n_estimators=100)
random_forest_classifier.fit(X_train_tf, y_train)
y_pred = random_forest_classifier.predict(X_test_tf)
print(metrics.classification_report(y_test, y_pred, target_names=['Hate', 'Offense', 'Neutral']))


              precision    recall  f1-score   support

        Hate       0.62      0.12      0.20       286
     Offense       0.90      0.97      0.94      3838
     Neutral       0.87      0.79      0.83       833

    accuracy                           0.89      4957
   macro avg       0.80      0.63      0.65      4957
weighted avg       0.88      0.89      0.88      4957



In [35]:
from sklearn.svm import SVC
cost_sensitive_svm_classifier = SVC(kernel='linear', class_weight='balanced')
cost_sensitive_svm_classifier.fit(X_train_tf, y_train)
y_pred = cost_sensitive_svm_classifier.predict(X_test_tf)
print(metrics.classification_report(y_test, y_pred, target_names=['Hate', 'Offense', 'Neutral']))

              precision    recall  f1-score   support

        Hate       0.34      0.59      0.44       286
     Offense       0.97      0.88      0.92      3838
     Neutral       0.81      0.94      0.87       833

    accuracy                           0.88      4957
   macro avg       0.71      0.81      0.74      4957
weighted avg       0.91      0.88      0.89      4957



In [36]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lm = WordNetLemmatizer()

def preprocess(text):

    #remove non alphabetic characters
    text = re.sub('[^A-Za-z]', ' ', text)

    #lowercase
    text = text.lower()

    #tokenization
    words = nltk.word_tokenize(text)

    #punctuation mark removal
    words = [word for word in words if word.isalnum()]

    #stopwords removal
    words_stop = []
    for word in words:
        if word not in stopwords.words('english'):
            words_stop.append(word)

    #stemming
    words_stem = []
    for word in words_stop:
        words_stem.append(ps.stem(word))

    #lemmatization
    words_lemmatized = []
    for word in words_stem:
        words_lemmatized.append(lm.lemmatize(word))

    #join words
    text = ' '.join(words_lemmatized)

    return text


In [45]:
test_text = ["I hate you", "Its a great day"]
for i in range(len(test_text)):
    test_text[i] = preprocess(test_text[i])
    test_text[i] = tf_idf.transform([test_text[i]])
    print(svm_classifier.predict(test_text[i]))

[1]
[2]


In [47]:
# save all models in pickle file
import pickle
pickle.dump(tf_idf, open('./models/tf_idf.pkl', 'wb'))
pickle.dump(naive_bayes_classifier, open('./models/naive_bayes_classifier.pkl', 'wb'))
pickle.dump(svm_classifier, open('./models/svm_classifier.pkl', 'wb'))
pickle.dump(knn_classifier, open('./models/knn_classifier.pkl', 'wb'))
pickle.dump(random_forest_classifier, open('./models/random_forest_classifier.pkl', 'wb'))
pickle.dump(cost_sensitive_svm_classifier, open('./models/cost_sensitive_svm_classifier.pkl', 'wb'))