In [1]:
import pandas as pd

from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv("tech_test_data.csv")
df = pd.concat([df.message, df.case_type], axis=1)
df = df.sample(frac=1).reset_index(drop=True)

In [3]:
def clean_non_ascii(text):
    return ''.join(i for i in text if ord(i) < 128)

In [4]:
df.message = df.message.apply(clean_non_ascii)

In [5]:
x_train, x_test, y_train, y_test = tts(df.message, df.case_type)

In [6]:
def get_model(
    feature, 
    label, 
    model_name, 
    logreg_jobs=1, 
    logreg_c=1e5, 
    lsvm_loss='hinge', 
    lsvm_penalty='l2', 
    lsvm_alpha=1e-3, 
    lsvm_random_state=42, 
    lsvm_max_iter=5, 
    lsvm_tol=None
):
    classifier = None
    if model_name == 'naive bayes':
        classifier = MultinomialNB()
    elif model_name == 'logistic regression':
        classifier = LogisticRegression(n_jobs=logreg_jobs, C=logreg_c)
    elif model_name == 'lsvm':
        classifier = SGDClassifier(
            loss=lsvm_loss, 
            penalty=lsvm_penalty, 
            alpha=lsvm_alpha, 
            random_state=lsvm_random_state, 
            max_iter=lsvm_max_iter, 
            tol=lsvm_tol
        )
    model = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('transformer', TfidfTransformer()),
        ('classifier', classifier),
    ]).fit(feature, label)
    return model

In [7]:
nb_classifier = get_model(x_train, y_train, 'naive bayes')
logreg_classifier = get_model(x_train, y_train, 'logistic regression')
lsvm_classifier = get_model(x_train, y_train, 'lsvm')

In [8]:
nb_y_pred = nb_classifier.predict(x_test)
logreg_y_pred = logreg_classifier.predict(x_test)
lsvm_y_pred = lsvm_classifier.predict(x_test)

In [9]:
accuracies = {
    'Naive Bayes': accuracy_score(nb_y_pred, y_test),
    'Logitic Regression': accuracy_score(logreg_y_pred, y_test),
    'Support Vector Machine': accuracy_score(lsvm_y_pred, y_test)
}
print('Models Accuracies')
accuracies

Models Accuracies


{'Naive Bayes': 0.6818181818181818,
 'Logitic Regression': 0.8181818181818182,
 'Support Vector Machine': 0.7727272727272727}

In [10]:
target = df['case_type'].unique()
reports = {
    'Naive Bayes': classification_report(y_test, nb_y_pred, target_names=target),
    'Logitic Regression': classification_report(y_test, logreg_y_pred, target_names=target),
    'Support Vector Machine': classification_report(y_test, lsvm_y_pred, target_names=target)
}
print('Models Reports\n')
for name, report in reports.items():
    print(f'{name}\n-----------------------------------------------------')
    print(f'{report}\n')

Models Reports

Naive Bayes
-----------------------------------------------------
              precision    recall  f1-score   support

cancel_order       0.82      0.64      0.72        14
order_status       0.55      0.75      0.63         8

   micro avg       0.68      0.68      0.68        22
   macro avg       0.68      0.70      0.68        22
weighted avg       0.72      0.68      0.69        22


Logitic Regression
-----------------------------------------------------
              precision    recall  f1-score   support

cancel_order       0.92      0.79      0.85        14
order_status       0.70      0.88      0.78         8

   micro avg       0.82      0.82      0.82        22
   macro avg       0.81      0.83      0.81        22
weighted avg       0.84      0.82      0.82        22


Support Vector Machine
-----------------------------------------------------
              precision    recall  f1-score   support

cancel_order       0.91      0.71      0.80        14
ord

In [1]:
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)

ModuleNotFoundError: No module named 'gensim'

In [2]:
!pip3 install gensim

/bin/sh: 1: pip3: not found
