# Text Representation – Bag of Words (BoW)



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled movie reviews (columns used downstream: `review`, `sentiment`).
# - index_col=0: the first CSV column is the row index that was written when the
#   file was saved; reading it as the index avoids a spurious "Unnamed: 0" column.
# - on_bad_lines='skip': malformed rows are dropped silently, so the number of
#   rows loaded may be smaller than the number of lines in the file.
data = pd.read_csv(
    'movies_sentiment_data.csv',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
    index_col=0,
)
data.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Train/test split: hold out 20% of the reviews for evaluation.
# random_state is fixed so the split -- and therefore the metrics printed below --
# is identical on every Restart & Run All, and so that any other model evaluated
# with the same seed is scored on exactly the same held-out reviews.
x_train, x_test, y_train, y_test = train_test_split(
    data.review, data.sentiment, test_size=0.2, random_state=42
)

# Pipeline: bag-of-words token counts fed into a Multinomial Naive Bayes classifier.
# Keeping the vectorizer inside the pipeline means it is fitted on the training
# reviews only and then re-applied unchanged to the test reviews.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Fit the vectorizer and the classifier on the training portion
clf.fit(x_train, y_train)

# Predict sentiment labels for the held-out reviews
y_pred = clf.predict(x_test)

# Per-class precision / recall / F1 on the held-out reviews
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.74      0.81      0.77        67
    positive       0.86      0.80      0.83        96

    accuracy                           0.80       163
   macro avg       0.80      0.80      0.80       163
weighted avg       0.81      0.80      0.80       163



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Train/test split: hold out 20% of the reviews for evaluation.
# random_state is fixed so the split -- and therefore the metrics printed below --
# is identical on every Restart & Run All, and so that any other model evaluated
# with the same seed is scored on exactly the same held-out reviews.
x_train, x_test, y_train, y_test = train_test_split(
    data.review, data.sentiment, test_size=0.2, random_state=42
)

# Pipeline: bag-of-words token counts fed into a 10-nearest-neighbours classifier
# that compares reviews by Euclidean distance between their count vectors.
# NOTE(review): Euclidean distance on raw counts is presumably sensitive to review
# length; consider a normalised representation or cosine distance -- TODO confirm.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])

# Fit the vectorizer and the classifier on the training portion
clf.fit(x_train, y_train)

# Predict sentiment labels for the held-out reviews
y_pred = clf.predict(x_test)

# Per-class precision / recall / F1 on the held-out reviews
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.56      0.45      0.50        73
    positive       0.62      0.71      0.66        90

    accuracy                           0.60       163
   macro avg       0.59      0.58      0.58       163
weighted avg       0.59      0.60      0.59       163

