# BagOfWords approach for classification

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
stop_words = stop_words.ENGLISH_STOP_WORDS
#Function for basic cleaning/preprocessing texts
def clean(doc):
    # Removal of punctuation marks (.,/\][{} etc) and numbers
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    # Removal of stopwords
    doc = " ".join([token for token in doc.split() if token not in stop_words])
    return doc.lower()

NameError: name 'stop_words' is not defined

In [4]:
# MODELS for text classification

models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
}


In [27]:
csvFile = "E:\\mladhd\\datasets\\text\\data_clean_large.csv"

In [28]:
df = pd.read_csv(csvFile, sep=";")

In [29]:
df

Unnamed: 0,class,text
0,0,© From your Google Drive Interview-Mode BK99 S...
1,1,S Cc @ amazon.com/gp/bestsellers/?ref_=nav_cs_...


In [18]:
df.iloc[0]

text     NaN
class      0
Name: 0, dtype: object

In [31]:
vectorizer = CountVectorizer()#(preprocessor=clean)
# Create sample set of documents
docs = df.text.to_numpy()

In [32]:
# Fit the bag-of-words model
bag = vectorizer.fit_transform(docs)
bag.shape

(2, 363)

In [33]:
# Get unique words / tokens found in all the documents. The unique words / tokens represents
# the features
print(vectorizer.get_feature_names())

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [None]:
# Associate the indices with each unique word
print(vectorizer.vocabulary_)

In [None]:
# Print the numerical feature vector
print(bag.toarray())

In [None]:
tfidf_transformer = TfidfTransformer()
bag_tfidf = tfidf_transformer.fit_transform(bag)
bag_tfidf.shape

In [None]:
# Creating training data
X = bag_tfidf.to_numpy()
y = df["class"].to_numpy()

In [None]:
# Create training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Choose a model

In [None]:
model_name = "Logistic Regression" # "Logistic Regression" or "Multinomial Naive Bayes"
model = models[model_name]

In [None]:
# Fit the model
model.fit(X_train, y_train)

In [None]:
# Create the predictions
y_predict = model.predict(X_test)

In [None]:
# Compute the accuracy, precision, recall and f1-score
print("Model accuracy: ", metrics.accuracy_score(y_test, y_predict))
print("Model precision: ", metrics.precision_score(y_test, y_predict))
print("Model recall: ", metrics.recall_score(y_test, y_predict))
print("Model f1-score: ", metrics.f1_score(y_test, y_predict))

# Visualize the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predict)
print(cm)

## Interpretability

In [None]:
# Assigning the feature names to an empty list
feat_impts = [vectorizer.get_feature_names()]
#For all the models save the feature importances in the #list.estimators_ would give the internal models used by the #multioutput regressor
for clf in model.estimators_:
    feat_impts.append(clf.coef_.flatten())

In [None]:
#Saving the coefficients in a dataframe
df_feats_impts = pd.DataFrame(np.transpose(np.array(feat_impts)), columns = ["focused","distracted"])

In [None]:
#Creating Individual Feature Importance table by sorting on specific toxic-type column and selecting top 5 words
focused_fi = df_feats_impts[["word","focused"]].sort_values(by = "focused", ascending = False).head()

In [None]:
#Creating Individual Feature Importance table by sorting on specific toxic-type column and selecting top 5 words
distracted = df_feats_impts[["word","focused"]].sort_values(by = "focused", ascending = False).head()