In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hienle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hienle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hienle/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [51]:
# Load the training data
with open('review_training.txt', 'r') as file:
    lines = file.readlines()
    
# Extract text and labels
train_data = {'label': [], 'review': []}
for line in lines:
    parts = line.split(': ', 1)
    if len(parts) == 2:
        label, review = parts
        train_data['review'].append(review.strip())
        train_data['label'].append(label.lower())

In [52]:
# Create a DataFrame for training
df_train = pd.DataFrame(train_data)

# Preprocess the training data
stop_words = set(stopwords.words('english'))

In [53]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(words)

df_train['cleaned_text'] = df_train['review'].apply(preprocess_text)

In [54]:
print(df_train['cleaned_text'])

0     really loved time amount research volunteers h...
1     campus huge beautiful need work professors man...
2     good place get degree social life meh food bad...
3     currently year york found complaints york true...
4     york horrible university gone strike many time...
5     overall okay school safety remains concern man...
6     found york terrible school learning really lik...
7     make many people like commute really show clas...
8                      school safe school always strike
9        school good nothing incredible bad love campus
10    value self human please stay away years life w...
11    good school learning reputation sucks social l...
12    york university toronto faculties offering div...
13    york popular law business liberal arts fine ar...
14    honestly great time encounter good bad situati...
15    going cuz close home program good profs usuall...
16    york horrible university gone strike many time...
17    good overall ultimately make location bit 

In [55]:
# Load the testing data
with open('review_testing.txt', 'r') as file:
    lines = file.readlines()

# Extract text and labels
test_data = {'label': [], 'review': []}
for line in lines:
    parts = line.split(': ', 1)
    if len(parts) == 2:
        label, review = parts
        test_data['review'].append(review.strip())
        test_data['label'].append(label.lower())

In [56]:
# Create a DataFrame for testing
df_test = pd.DataFrame(test_data)

# Preprocess the testing data
df_test['cleaned_text'] = df_test['review'].apply(preprocess_text)

In [57]:
# Create a Bag-of-Words model
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(df_train['cleaned_text'])
X_test = vectorizer.transform(df_test['cleaned_text'])

In [58]:
# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, df_train['label'])

MultinomialNB()

In [59]:
# Make predictions on the testing set
predictions = clf.predict(X_test)

In [60]:
# Evaluate the model
accuracy = accuracy_score(df_test['label'], predictions)
print(f'Accuracy: {accuracy:.2%}')

print("\nClassification Report:")
print(classification_report(df_test['label'], predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(df_test['label'], predictions))

Accuracy: 22.22%

Classification Report:
              precision    recall  f1-score   support

    negative       0.18      1.00      0.30         3
     neutral       1.00      0.14      0.25         7
    positive       0.00      0.00      0.00         8

    accuracy                           0.22        18
   macro avg       0.39      0.38      0.18        18
weighted avg       0.42      0.22      0.15        18


Confusion Matrix:
[[3 0 0]
 [6 1 0]
 [8 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
# Display the results
result_df = pd.DataFrame({
    'Cleaned Text': df_test['cleaned_text'],
    'True Label': df_test['label'],
    'Predicted Label': predictions,
})

print(result_df)

                                         Cleaned Text True Label  \
0   safety isnt always amazing expect school heart...   positive   
1   tmu good diverse school heart toronto incredib...    neutral   
2   overall great school heart toronto lots opport...    neutral   
3   want tools change world equitable improve huma...   positive   
4   student location convenient unfortunately hear...   positive   
5   good vibes competitive needs residence space c...    neutral   
6   overall would give university solid good diver...   positive   
7   pros offers lots online courses close subway c...    neutral   
8   location university inviting homeless populati...   negative   
9   university regard students professors lecture ...   negative   
10  solid first year staying dcc lots areas chill ...    neutral   
11  location pretty great ttc mall restaurants clo...    neutral   
12  overall great school positive atmosphere vibin...   positive   
13  social skills fit right place meet people ex

In [62]:
# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Apply SentimentIntensityAnalyzer to obtain compound scores
df_test['compound'] = df_test['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Convert compound scores to predicted labels
df_test['predicted_label'] = df_test['compound'].apply(lambda score: 'positive' if score >= 0 else 'negative')

In [63]:
# Display the results
result_df = pd.DataFrame({
    'Cleaned Text': df_test['cleaned_text'],
    'True Label': df_test['label'],
    'Predicted Label': df_test['predicted_label'],
})

print(result_df)

                                         Cleaned Text True Label  \
0   safety isnt always amazing expect school heart...   positive   
1   tmu good diverse school heart toronto incredib...    neutral   
2   overall great school heart toronto lots opport...    neutral   
3   want tools change world equitable improve huma...   positive   
4   student location convenient unfortunately hear...   positive   
5   good vibes competitive needs residence space c...    neutral   
6   overall would give university solid good diver...   positive   
7   pros offers lots online courses close subway c...    neutral   
8   location university inviting homeless populati...   negative   
9   university regard students professors lecture ...   negative   
10  solid first year staying dcc lots areas chill ...    neutral   
11  location pretty great ttc mall restaurants clo...    neutral   
12  overall great school positive atmosphere vibin...   positive   
13  social skills fit right place meet people ex

In [64]:
# Evaluate the model
accuracy = accuracy_score(df_test['label'], df_test['predicted_label'])
print(f'\nAccuracy: {accuracy:.2%}')

print("\nClassification Report:")
print(classification_report(df_test['label'], df_test['predicted_label']))

print("\nConfusion Matrix:")
print(confusion_matrix(df_test['label'], df_test['predicted_label']))


Accuracy: 38.89%

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
     neutral       0.00      0.00      0.00         7
    positive       0.44      0.88      0.58         8

    accuracy                           0.39        18
   macro avg       0.15      0.29      0.19        18
weighted avg       0.19      0.39      0.26        18


Confusion Matrix:
[[0 0 3]
 [1 0 6]
 [1 0 7]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Apply SentimentIntensityAnalyzer to obtain compound scores
df_test['compound'] = df_test['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Adjust the threshold for considering a text as neutral
neutral_threshold = 0.5
df_test['predicted_label'] = df_test['compound'].apply(lambda score: 'positive' if score > neutral_threshold else 'negative' if score < -neutral_threshold else 'neutral')


In [69]:
# Display the results
result_df = pd.DataFrame({
    'Cleaned Text': df_test['cleaned_text'],
    'True Label': df_test['label'],
    'Predicted Label': df_test['predicted_label'],
    'Compound Score': df_test['compound'],
})

print(result_df)

                                         Cleaned Text True Label  \
0   safety isnt always amazing expect school heart...   positive   
1   tmu good diverse school heart toronto incredib...    neutral   
2   overall great school heart toronto lots opport...    neutral   
3   want tools change world equitable improve huma...   positive   
4   student location convenient unfortunately hear...   positive   
5   good vibes competitive needs residence space c...    neutral   
6   overall would give university solid good diver...   positive   
7   pros offers lots online courses close subway c...    neutral   
8   location university inviting homeless populati...   negative   
9   university regard students professors lecture ...   negative   
10  solid first year staying dcc lots areas chill ...    neutral   
11  location pretty great ttc mall restaurants clo...    neutral   
12  overall great school positive atmosphere vibin...   positive   
13  social skills fit right place meet people ex

In [70]:
# Evaluate the model
accuracy = accuracy_score(df_test['label'], df_test['predicted_label'])
print(f'\nAccuracy: {accuracy:.2%}')

print("\nClassification Report:")
print(classification_report(df_test['label'], df_test['predicted_label']))

print("\nConfusion Matrix:")
print(confusion_matrix(df_test['label'], df_test['predicted_label']))


Accuracy: 38.89%

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
     neutral       0.00      0.00      0.00         7
    positive       0.47      0.88      0.61         8

    accuracy                           0.39        18
   macro avg       0.16      0.29      0.20        18
weighted avg       0.21      0.39      0.27        18


Confusion Matrix:
[[0 1 2]
 [1 0 6]
 [1 0 7]]
