In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

In [3]:
def preprocess_text(text):
    """Normalize a piece of text by converting it to lower case.

    Args:
        text: The raw string to normalize.

    Returns:
        A copy of ``text`` with every cased character lower-cased. Nothing
        else (punctuation, whitespace) is altered.
    """
    return text.lower()

In [4]:
def generate_ngrams(text, n):
    """Return the contiguous word n-grams of a whitespace-tokenized text.

    Args:
        text: String to tokenize. Tokens are whatever ``str.split()`` yields,
            so punctuation stays attached to the neighbouring word.
        n: Number of consecutive words per n-gram.

    Returns:
        A list of n-grams in order of appearance, each one being ``n`` words
        joined by a single space. The list is empty when ``n`` is less than 1
        or when the text contains fewer than ``n`` words.
    """
    tokens = text.split()
    if n < 1:
        return []
    # One window per valid starting position; a negative count yields no windows.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

In [5]:
def vectorize_text(text, n, unique_ngrams):
    """Encode text as a binary presence vector over a fixed n-gram vocabulary.

    Args:
        text: Text to encode; it is tokenized by ``generate_ngrams``.
        n: N-gram size passed through to ``generate_ngrams``.
        unique_ngrams: Sequence of vocabulary n-grams. Position ``i`` of the
            result corresponds to ``unique_ngrams[i]``.

    Returns:
        A 1-D float numpy array of length ``len(unique_ngrams)`` holding 1.0
        where the vocabulary n-gram occurs in ``text`` and 0.0 elsewhere.
        N-grams of ``text`` that are not in the vocabulary are ignored, and
        repeated occurrences are not counted more than once.
    """
    # Build the set once: probing a list for every vocabulary entry would cost
    # O(len(unique_ngrams) * len(ngrams)) per encoded text.
    present = set(generate_ngrams(text, n))
    vectorized_text = np.zeros(len(unique_ngrams))
    for i, ngram in enumerate(unique_ngrams):
        if ngram in present:
            vectorized_text[i] = 1.0
    return vectorized_text

In [6]:
# Load the dataset: column 6 supplies the input texts and column 5 the labels.
# Note: x, y and the four split arrays are all reassigned by the next cell.
df = pd.read_csv("prg3.csv")
x, y = df.iloc[:, 6].values, df.iloc[:, 5].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
import pandas as pd
import numpy as np
from sklearn.utils import resample


# Per-class sample sizes for rebalancing: labels 4 and 5 are down-sampled to
# 500 rows each, every other label to 217 rows.
# NOTE(review): resample(..., replace=False) raises ValueError when a class has
# fewer rows than requested -- presumably 217 is the size of the smallest
# class; confirm against the data.
MAJORITY_LABELS = (4, 5)
MAJORITY_SAMPLES = 500
MINORITY_SAMPLES = 217

df = pd.read_csv("prg3.csv")

# Column 6 holds the texts, column 5 the labels.
x = df.iloc[:, 6].values
y = df.iloc[:, 5].values

data = pd.DataFrame({'x': x, 'y': y})

# Draw a fixed-size sample (without replacement) from every class, collect the
# pieces in a list and concatenate once. Concatenating inside the loop copies
# the accumulated frame on every iteration, and seeding it with an empty
# DataFrame() triggers a dtype FutureWarning in recent pandas.
resampled_subsets = []
for value in np.unique(y):
    subset = data[data['y'] == value]
    n_samples = MAJORITY_SAMPLES if value in MAJORITY_LABELS else MINORITY_SAMPLES
    resampled_subset = resample(subset, replace=False, n_samples=n_samples, random_state=42)
    resampled_subsets.append(resampled_subset)
balanced_data = pd.concat(resampled_subsets)

# Rebind x / y to the balanced data and redo the 80/20 split with a fixed seed.
x = balanced_data['x'].values
y = balanced_data['y'].values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
# Build the unigram vocabulary from the training texts only, so that nothing
# from the test split influences the feature set.
all_ngrams = [generate_ngrams(preprocess_text(text), 1) for text in X_train]
# Sort the de-duplicated vocabulary: the iteration order of a set of strings
# changes between interpreter runs (hash randomization), which would otherwise
# shuffle the feature columns on every kernel restart.
unique_ngrams = sorted({item for sublist in all_ngrams for item in sublist})
# One binary presence row per training text, columns ordered as unique_ngrams.
X_train_vectorized = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams) for text in X_train])

In [10]:
# Import the multinomial Naive Bayes classifier from scikit-learn.
from sklearn.naive_bayes import MultinomialNB

# Fit the classifier, with its default settings, on the training n-gram
# presence vectors (X_train_vectorized) and their labels (y_train).
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_vectorized, y_train)

In [11]:
# Encode the test texts against the vocabulary built from the training split so
# the columns line up with X_train_vectorized. N-grams that are not in
# unique_ngrams have no column and are therefore ignored.
X_test_vectorized = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams) for text in X_test])

In [12]:
# Predict a label for every encoded test sample with the fitted Naive Bayes model.
y_pred_nb = naive_bayes_model.predict(X_test_vectorized)

# Score the predictions against the held-out labels with accuracy_score and
# print the resulting value.
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy using Naive Bayes: {accuracy_nb}")

Accuracy using Naive Bayes: 0.3957703927492447


In [13]:
# Spot-check one raw test text (index 10).
X_test[10]

'pretty good'

In [39]:
# Display the raw training texts.
X_train

array(['CRAP', "BEWARE:!!! It's not a hard plastic Mic Clip it's Rubber",
       'box power adaptor', ..., 'Good for the price',
       'This was a great deal.', 'Great price and a real space saver'],
      dtype=object)

In [15]:
# Display the current contents of y_pred_nb (the model's predicted labels).
y_pred_nb

array([4., 4., 5., 2., 5., 4., 3., 4., 5., 4., 4., 5., 4., 4., 5., 4., 4.,
       1., 5., 3., 4., 4., 5., 5., 4., 3., 5., 4., 5., 1., 5., 4., 4., 5.,
       2., 4., 5., 5., 5., 4., 5., 4., 5., 4., 5., 4., 4., 4., 4., 4., 4.,
       5., 5., 2., 1., 1., 4., 5., 4., 5., 4., 5., 4., 5., 4., 3., 3., 4.,
       4., 4., 4., 4., 5., 4., 2., 2., 5., 5., 4., 3., 4., 4., 5., 5., 4.,
       1., 5., 4., 5., 5., 5., 5., 3., 5., 2., 4., 5., 3., 5., 4., 5., 5.,
       5., 4., 4., 4., 4., 4., 4., 5., 5., 4., 4., 4., 4., 5., 5., 3., 5.,
       5., 1., 4., 4., 4., 5., 4., 5., 5., 4., 5., 4., 4., 1., 2., 1., 4.,
       4., 5., 4., 1., 3., 4., 4., 5., 1., 5., 2., 4., 4., 5., 2., 1., 4.,
       1., 5., 5., 2., 4., 2., 4., 5., 4., 5., 5., 4., 5., 4., 4., 4., 5.,
       4., 4., 5., 4., 5., 4., 5., 5., 4., 5., 4., 4., 4., 5., 3., 4., 1.,
       4., 4., 4., 4., 5., 4., 5., 4., 4., 3., 5., 5., 5., 3., 4., 4., 5.,
       5., 4., 4., 3., 3., 4., 4., 5., 4., 5., 5., 2., 5., 5., 5., 4., 3.,
       4., 1., 4., 4., 5.

In [16]:
# Display the raw test texts.
X_test

array(['Not Bad Picks...', "Doesn't cut it for me.",
       'Excellent 808 Clone',
       'So far so good, but may have found something much better.',
       'terible cheap material product I saw',
       'For the guys in the back row it is a must', 'Nothing special',
       'for what it is....', "D'Addario makes the best",
       'Use a smartphone', 'pretty good',
       'Five stars for doing what it does at this price point',
       "Yo'd better to buy a big one.", 'great for the price', 'Stands',
       "Does the job, doesn't cost much!", 'Does its job',
       "Not worth the few dollars you'll spend on it.",
       'Great simple looper for rehearsing.', 'DR Pure Blues Pure Nickel',
       'Mighty Bright Duet Music Stand Light', 'Good effect, one problem',
       'No drip humidifier', "I'm new to a capo",
       'Useful but hard to dial in', 'ok pick holder', 'Patch cords',
       'Humidipak', 'Great all round pick', "Doesn't work", 'These Rock!',
       'Hooray...it fits me !',
   

In [17]:
import numpy as np
# Count how many samples in y carry each label from 1 to 5, then print the
# counts one per line in ascending label order.
count1, count2, count3, count4, count5 = (
    np.count_nonzero(y == label) for label in (1, 2, 3, 4, 5)
)
print(count1, count2, count3, count4, count5, sep="\n")

217
217
217
500
500


In [18]:
# Map each numeric label (1-5) to a human-readable sentiment name.
sentiment_text = {5: "Positive", 4: "Good", 3: "Neutral", 2: "Bad", 1: "Negative"}
# Show the sentiment name for the first prediction. The predictions are stored
# in `y_pred_nb`; the previously used name `y_pred` is not assigned by any
# visible cell and raised NameError on a fresh kernel.
sentiment_text[y_pred_nb[0]]

'Good'

In [21]:
# Ad-hoc check on a single sentence. Wrapping the encoded vector in a list
# makes `v` a one-row 2-D array, the same layout as the matrices given to the
# model earlier.
text = "5 stars, but I still prefer my Fender tuner"
v = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams)])
# The prediction must be its own statement; it was previously fused onto the
# end of the line above, which is a SyntaxError.
# Note: this rebinds `y_pred_nb`, which also holds the test-split predictions.
y_pred_nb = naive_bayes_model.predict(v)
y_pred_nb

array([4.])

In [23]:
# Ad-hoc check on a single sentence. Wrapping the encoded vector in a list
# makes `v` a one-row 2-D array. Tokens are split on whitespace only, so the
# "!" here is a token of its own.
text = "piano is not bad !"
v = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams)])

In [24]:
# Predict the label for the single encoded sample in `v` and display it.
# Note: this rebinds `y_pred_nb`, which also holds the test-split predictions.
y_pred_nb = naive_bayes_model.predict(v)
y_pred_nb

array([3.])

In [40]:
# Ad-hoc check on a single word. Only lower-casing and whitespace splitting are
# applied, so the token keeps its trailing period and matches a vocabulary
# entry only if one is spelled exactly "crap.".
text = "crap."
v = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams)])

In [41]:
# Predict the label for the single encoded sample in `v` and display it.
# Note: this rebinds `y_pred_nb`, which also holds the test-split predictions.
y_pred_nb = naive_bayes_model.predict(v)
y_pred_nb

array([1.])

In [35]:
# Ad-hoc check on a short mixed sentence. After lower-casing and whitespace
# splitting the tokens are "not", "bad.", "not", "good."; the encoding records
# presence only, so the repeated "not" counts once.
text = "Not bad. Not good."
v = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams)])

In [36]:
# Predict the label for the single encoded sample in `v` and display it.
# Note: this rebinds `y_pred_nb`, which also holds the test-split predictions.
y_pred_nb = naive_bayes_model.predict(v)
y_pred_nb

array([3.])