In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

## Split the Data into Training and Testing Sets

In [10]:
# Load the sentiment-labelled restaurant reviews into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
reviews_csv = Path("restaurants_with_sentiment.csv")
df = pd.read_csv(reviews_csv)

# Preview the first five rows
df.head()

Unnamed: 0,Restaurant_Name,Category,Address,Province,Latitude,Longitude,Rating,Review,Sentiment_score,Sentiment
0,Wendy's (10365 111th Street),Fast food,"10365 111th Street, Edmonton, AB T5K 2V3",AB,53.545878,-113.510914,5,the food met my expectation the seating area w...,0.9607,Good
1,Wendy's (10365 111th Street),Fast food,"10365 111th Street, Edmonton, AB T5K 2V3",AB,53.545878,-113.510914,2,its a wendys their food is good better than mc...,0.1966,Bad
2,Wendy's (10365 111th Street),Fast food,"10365 111th Street, Edmonton, AB T5K 2V3",AB,53.545878,-113.510914,3,i love the taste of this food as it tastes lik...,0.8198,Good
3,Wendy's (10365 111th Street),Fast food,"10365 111th Street, Edmonton, AB T5K 2V3",AB,53.545878,-113.510914,5,this is the best wendys ive ever been to incre...,0.9287,Good
4,Wendy's (10365 111th Street),Fast food,"10365 111th Street, Edmonton, AB T5K 2V3",AB,53.545878,-113.510914,5,i been here so many times i never had a proble...,0.5613,Good


In [18]:
# Split the DataFrame into the target (y) and a single predictor (X)

# Target: the sentiment label of each review
y = df.loc[:, "Sentiment"]

# Predictor: the restaurant category of each review
X = df.loc[:, "Category"]

# List the distinct category values.
# NOTE(review): the listing contains case/spelling variants of the same category
# (e.g. 'Chinese' vs 'chinese', 'Asian fusion' vs 'Asian Fusion'); each variant
# becomes its own column when one-hot encoded - consider normalizing first.
X.unique()

array(['Fast food', 'Burgers', 'American', 'Bakery',
       'Breakfast and brunch', 'Comfort food', 'African', 'Ethiopian',
       'Indian', 'Halal', 'Vietnamese', 'Filipino', 'Korean', 'Japanese',
       'Pizza', 'Asian fusion', 'Sushi', 'Vegetarian', 'Asian', 'Thai',
       'Mexican', 'Sandwiches', 'Greek', 'Coffee & tea', 'Sandwich',
       'Desserts', 'Healthy', 'Seafood', 'Convenience', 'Chicken Strips',
       'Chinese', 'Bubble Tea', 'Italian', 'European', 'Wings', 'Steak',
       'Everyday Essentials', 'Snacks', 'Grocery', 'Breakfast & Brunch',
       'Dessert: Other', 'Vegan', 'Mediterranean', 'Canadian', 'French',
       'Ice cream & frozen yogurt', 'Japanese: Sushi', 'Salads',
       'Chicken', 'Bubble tea', 'Diner', 'Cheese', 'Bagels',
       'Middle Eastern', 'Comfort Food', 'Pastry', 'Asian: Other',
       'Salad / Sandwiches', 'Bowls', 'Cupcakes', 'Kosher',
       'Ice Cream & Frozen Yogurt', 'Asian Fusion', 'chinese', 'Deli',
       'Juice & Smoothies', 'Bar food', 'win

In [19]:
# Peek at the first five category values
X.head(5)

0    Fast food
1    Fast food
2    Fast food
3    Fast food
4    Fast food
Name: Category, dtype: object

In [5]:
# Peek at the first five sentiment labels
y.head(5)

0    Good
1     Bad
2    Good
3    Good
4    Good
Name: Sentiment, dtype: object

In [21]:
# One-hot encode the category strings: every distinct category value
# becomes its own indicator column
X = pd.get_dummies(X)

# Display the encoded feature matrix; without this trailing expression the
# cell ends in an assignment and produces no table on a fresh run
X


Unnamed: 0,African,Alcohol,American,Asian,Asian Fusion,Asian fusion,Asian: Other,BBQ,Bagels,Bakery,...,Turkish,Vegan,Vegan friendly,Vegetarian,Vietnamese,Wings,chinese,pasta,pizza,wings
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5015,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5016,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5017,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5018,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Check the dimensions of the encoded feature matrix as (rows, columns)
X.shape

(5020, 96)

In [23]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Split features and labels into training and testing sets.
# random_state is fixed at 78 so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

## Create a Logistic Regression Model with the Original Data

In [24]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the baseline classifier with a fixed seed (random_state=1)
logistic_regression_model = LogisticRegression(random_state=1)

# Train on the original (non-resampled) training split.
# fit() returns the estimator itself, so lr_model refers to the same fitted object.
lr_model = logistic_regression_model.fit(X=X_train, y=y_train)

In [25]:
# Predict labels for both the training and the testing features.
# Both calls go through lr_model (the fitted estimator returned by fit())
# so the cell uses a single, consistent handle to the model.
training_predictions = lr_model.predict(X_train)
testing_predictions = lr_model.predict(X_test)

 Evaluate the model’s performance by doing the following:

* Calculate the balanced accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [26]:
# Balanced accuracy of the baseline model on the test split
balanced_accuracy_score(y_true=y_test, y_pred=testing_predictions)

0.5

In [27]:
# Confusion matrix of the baseline model on the training split
# (kept under its original name)
training_matrix = confusion_matrix(y_train, training_predictions)

# Confusion matrix on the test split, matching the data used for the
# balanced accuracy score and the classification report
testing_matrix = confusion_matrix(y_test, testing_predictions)

# Display the test-set matrix; an assignment alone produces no cell output
testing_matrix

In [28]:
# Print the classification report for the baseline model on the test split.
# The baseline never predicts 'Bad', so that class's precision is undefined
# (0/0); zero_division=0 reports it as 0.00 without emitting a warning for
# each averaged metric.
print(classification_report(y_test, testing_predictions, zero_division=0))

              precision    recall  f1-score   support

         Bad       0.00      0.00      0.00       323
        Good       0.74      1.00      0.85       932

    accuracy                           0.74      1255
   macro avg       0.37      0.50      0.43      1255
weighted avg       0.55      0.74      0.63      1255



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Imports for this section: Counter (stdlib), make_classification (scikit-learn),
# and RandomOverSampler from imbalanced-learn
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# Build the random oversampler with a fixed seed (random_state=1)
ros = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is not passed in and
# therefore stays untouched
X_res, y_res = ros.fit_resample(X_train, y_train)

In [30]:
# Tally how many samples each class has after oversampling
resampled_label_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_label_counts)

Resampled dataset shape Counter({'Bad': 2750, 'Good': 2750})


In [31]:
# Create a second logistic regression classifier with a fixed seed (random_state=1)
logistic_regression_cl = LogisticRegression(random_state=1)

# Train it on the oversampled training data
fit_model = logistic_regression_cl.fit(X=X_res, y=y_res)

# Predict labels for the test features
testing_data_predicton = logistic_regression_cl.predict(X=X_test)

In [32]:
# Balanced accuracy of the oversampled-data model on the test split
balanced_accuracy_score(y_true=y_test, y_pred=testing_data_predicton)

0.5868185200441143

In [33]:
# Confusion matrix of the oversampled-data model.
# NOTE(review): the variable is called training_matrix, but it is computed
# from the test labels and test predictions.
training_matrix = confusion_matrix(y_true=y_test, y_pred=testing_data_predicton)

# Show the matrix as the cell output
training_matrix

array([[211, 112],
       [447, 485]])

In [35]:
# Build the classification report for the oversampled-data model from the
# test labels and test predictions, then print it
training_report = classification_report(
    y_true=y_test,
    y_pred=testing_data_predicton,
)
print(training_report)

              precision    recall  f1-score   support

         Bad       0.32      0.65      0.43       323
        Good       0.81      0.52      0.63       932

    accuracy                           0.55      1255
   macro avg       0.57      0.59      0.53      1255
weighted avg       0.69      0.55      0.58      1255

