In [None]:
#loading the necessary libraries

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import regex as re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#The three imports below are not used; they are left commented out to demonstrate the different methods of analysis that can be used

#from nltk.sentiment.vader import SentimentIntensityAnalyzer
#from nltk import sent_tokenize, word_tokenize, pos_tag
#from sklearn.svm import SVC

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
import osmnx as ox
import geopandas as gpd

In [None]:
# Read the Calgary review data into a DataFrame.
# The path is relative, so Calgary.csv has to be in the notebook's working directory.
cal = pd.read_csv(filepath_or_buffer='Calgary.csv')

In [None]:
# Creating a function for ease of use

# Resources shared by every clean_text call. They are filled lazily on first
# use so the stop-word corpus is read once instead of once per review.
_CLEAN_TEXT_CACHE = {}


def _text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['lemmatizer']


def clean_text(text):
    """Normalise one piece of raw text into a space-separated string of lemmas.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space
         (this removes digits and punctuation),
      2. lowercase and split on whitespace,
      3. drop words of two characters or fewer,
      4. drop English stop words,
      5. lemmatize the remaining words.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The cleaned words joined by single spaces; '' when nothing survives
        the filters or the input was not a string.
    """
    # A missing cell arrives as a float NaN; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return ''

    # Removing numbers and punctuation
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Converting words to lowercase, splitting them and removing short words
    tokens = [token for token in letters_only.lower().split() if len(token) > 2]
    if not tokens:
        # Nothing left to filter or lemmatize.
        return ''

    stop_words, lemmatizer = _text_resources()

    # Removing stopwords, then lemmatizing what remains and joining the text
    # in a manner that allows it to be added as a dataframe column
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Run clean_text over every entry of the 'text' column and store the result
# next to the raw text in a new 'clean_text' column
cal['clean_text'] = cal['text'].map(clean_text)


In [None]:
#sia = SentimentIntensityAnalyzer()
#cal['polarity_score'] = cal['clean_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

#cal['polarity'] = cal['polarity_score'].apply(lambda x: 'positive' if x >= 0 else 'negative')


In [None]:
#appending a column called polarity
def _polarity_from_stars(stars):
    """Label a star rating 'positive' when it is 4 or more, otherwise 'negative'."""
    if stars >= 4:
        return 'positive'
    return 'negative'


cal['polarity'] = cal['stars_y'].apply(_polarity_from_stars)

In [None]:
#Splitting the data into train and test sets:
# the cleaned text is the input, the polarity label is the target,
# 20% of the rows are held out for testing and the seed is fixed so the
# same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    cal['clean_text'],
    cal['polarity'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Tokenising the text and extracting features using TF-IDF Vectorizer
#
# The vectorizer is fitted on the training text only and then reused, unchanged,
# on the test text, so the vocabulary and IDF weights never see the test set.
# max_features=1000 keeps only the 1000 most frequent terms.
#
# The matrices are kept in the sparse format the vectorizer returns. Calling
# .toarray() would allocate rows x 1000 dense floats (mostly zeros), and the
# estimators fitted on these features in this notebook accept sparse input
# directly.
#
# NOTE: this cell rebinds X_train / X_test from raw text to feature matrices,
# so it must be run exactly once after the train/test split cell.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
#Support Vector Machine

#a)
#svm = SVC(kernel='linear', C=1.0, random_state=42)
#svm.fit(X_train, y_train)

#b)
# Predict the polarity of the venue for the testing set
#y_pred = svm.predict(X_test)

#c)
# Evaluate the performance of the model using accuracy score
#accuracy = accuracy_score(y_test, y_pred)
#print('Accuracy:', accuracy)

#d)
# Calculate precision, recall, and F1-score
#report = classification_report(y_test, y_pred)
#print(report)

#e)
# Calculate the confusion matrix
#cm = confusion_matrix(y_test, y_pred)

# Create the confusion matrix display
#cmd = ConfusionMatrixDisplay(cm, display_labels=svm.classes_)

# Plot the confusion matrix
#cmd.plot()

# Show the plot
#plt.show()

In [None]:
#Multinomial Naive Bayes Classifier

#a) Fit the model on the training features (fit returns the fitted estimator)
nb = MultinomialNB().fit(X_train, y_train)

#b) Predict the polarity for the testing set
y_pred = nb.predict(X_test)

#c) Evaluate the performance of the model using accuracy score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

#d) Calculate precision, recall, and F1-score
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

#e) Calculate the confusion matrix, then build, plot and show its display
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=nb.classes_)
cmd.plot()
plt.show()

In [None]:
#Decision Tree Classifier

#a) Fit the tree on the training features; the seed is fixed for repeatable results
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

#b) Predict the polarity for the testing set
y_pred = dt.predict(X_test)

#c) Evaluate the performance of the model using accuracy score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

#d) Calculate precision, recall, and F1-score
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

#e) Calculate the confusion matrix, then build, plot and show its display
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dt.classes_)
cmd.plot()
plt.show()

In [None]:
#Random Forest Classifier

#a) Fit a forest of 100 trees on the training features; the seed is fixed for repeatable results
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

#b) Predict the polarity for the testing set
y_pred = rf.predict(X_test)

#c) Evaluate the performance of the model using accuracy score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

#d) Calculate precision, recall, and F1-score
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

#e) Calculate the confusion matrix, then build, plot and show its display
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_)
cmd.plot()
plt.show()

In [None]:
#Logistic Regression Model

#a) Fit the model, with its default settings, on the training features
lr = LogisticRegression().fit(X_train, y_train)

#b) Predict the polarity for the testing set
y_pred = lr.predict(X_test)

#c) Evaluate the performance of the model using accuracy score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

#d) Calculate precision, recall, and F1-score
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

#e) Calculate the confusion matrix, then build, plot and show its display
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr.classes_)
cmd.plot()
plt.show()

In [None]:
#getting the coordinates for Calgary
# (the value is only displayed as the cell output; it is not stored in a variable)
ox.geocode(query="Calgary, Canada")

In [None]:
#creating a folium map with the coordinates
calgary_coords = [
    51.0460954,   # latitude
    -114.065465,  # longitude
]
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the notebook; later cells refer to this object by that name.
map = folium.Map(zoom_start=13, location=calgary_coords)

In [None]:
#creating 2 new boolean columns called positive and negative from the polarity column
# 'positive' is True exactly where polarity == 'positive'; 'negative' is its
# complement, so every row is True in exactly one of the two columns.
# A vectorised comparison is used instead of building a tuple and a pd.Series
# per row, which is very slow on a large frame.
cal['positive'] = cal['polarity'] == 'positive'
cal['negative'] = ~cal['positive']

In [None]:
#converting the cal dataframe into a gdf
# point geometry is built with longitude as x and latitude as y;
# no crs argument is passed here
gdf = gpd.GeoDataFrame(
    cal,
    geometry=gpd.points_from_xy(x=cal['longitude'], y=cal['latitude']),
)

In [None]:
# Adding one MarkerCluster per tag ('useful', 'funny', 'cool') to the shared map

def _add_tag_cluster(target_map, frame, column, threshold, colour):
    """Add a MarkerCluster to target_map with one marker per qualifying row.

    A row qualifies when frame[column] is strictly greater than threshold.
    Each marker is placed at the row's (latitude, longitude) and drawn with a
    folium.Icon of the given colour. Returns the MarkerCluster that was added.
    """
    cluster = MarkerCluster().add_to(target_map)
    # Filter with one vectorised comparison, then walk only the coordinates of
    # the matching rows instead of materialising every row with iterrows().
    hits = frame.loc[frame[column] > threshold, ['latitude', 'longitude']]
    for lat, lon in zip(hits['latitude'], hits['longitude']):
        folium.Marker(location=[lat, lon],
                      icon=folium.Icon(color=colour)).add_to(cluster)
    return cluster


# The value has been chosen as 2 as the average was 1.2
useful_cluster = _add_tag_cluster(map, gdf, 'useful', 2, 'blue')

#the same steps have been repeated for the other two tags

# The value has been chosen as 1 as the average was 0.39
funny_cluster = _add_tag_cluster(map, gdf, 'funny', 1, 'orange')

# The value has been chosen as 1 as the average was 0.39
# NOTE(review): this is the same average quoted for 'funny' - confirm it is
# also the average for 'cool'
cool_cluster = _add_tag_cluster(map, gdf, 'cool', 1, 'purple')

map

The three maps following this one show each tag on its own, isolated from the others.

In [None]:
# Stand-alone map with a marker for every row whose 'useful' value is above 2
usefulmap = folium.Map(location=calgary_coords, zoom_start=13)

useful_cluster = MarkerCluster().add_to(usefulmap)

# Filter with one vectorised comparison, then loop over the matching
# coordinates only, rather than building every row with iterrows()
useful_hits = gdf.loc[gdf['useful'] > 2, ['latitude', 'longitude']]
for lat, lon in zip(useful_hits['latitude'], useful_hits['longitude']):
    folium.Marker(location=[lat, lon],
                  icon=folium.Icon(color='blue')).add_to(useful_cluster)

usefulmap

In [None]:
# Stand-alone map with a marker for every row whose 'funny' value is above 1
funnymap = folium.Map(location=calgary_coords, zoom_start=13)

funny_cluster = MarkerCluster().add_to(funnymap)

# Filter with one vectorised comparison, then loop over the matching
# coordinates only, rather than building every row with iterrows()
funny_hits = gdf.loc[gdf['funny'] > 1, ['latitude', 'longitude']]
for lat, lon in zip(funny_hits['latitude'], funny_hits['longitude']):
    folium.Marker(location=[lat, lon],
                  icon=folium.Icon(color='orange')).add_to(funny_cluster)

funnymap

In [None]:
# Stand-alone map with a marker for every row whose 'cool' value is above 1
coolmap = folium.Map(location=calgary_coords, zoom_start=13)

cool_cluster = MarkerCluster().add_to(coolmap)

# Filter with one vectorised comparison, then loop over the matching
# coordinates only, rather than building every row with iterrows()
cool_hits = gdf.loc[gdf['cool'] > 1, ['latitude', 'longitude']]
for lat, lon in zip(cool_hits['latitude'], cool_hits['longitude']):
    folium.Marker(location=[lat, lon],
                  icon=folium.Icon(color='purple')).add_to(cool_cluster)

coolmap

In [None]:
# Stand-alone map with a green marker for every row flagged in the 'positive' column
posimap = folium.Map(location=calgary_coords, zoom_start=13)

posi_cluster = MarkerCluster().add_to(posimap)

# Filter with one vectorised comparison, then loop over the matching
# coordinates only, rather than building every row with iterrows()
posi_hits = gdf.loc[gdf['positive'] > 0, ['latitude', 'longitude']]
for lat, lon in zip(posi_hits['latitude'], posi_hits['longitude']):
    folium.Marker(location=[lat, lon],
                  icon=folium.Icon(color='green')).add_to(posi_cluster)

posimap

In [None]:
# Stand-alone map with a red marker for every row flagged in the 'negative' column
negmap = folium.Map(location=calgary_coords, zoom_start=13)

neg_cluster = MarkerCluster().add_to(negmap)

# Filter with one vectorised comparison, then loop over the matching
# coordinates only, rather than building every row with iterrows()
neg_hits = gdf.loc[gdf['negative'] > 0, ['latitude', 'longitude']]
for lat, lon in zip(neg_hits['latitude'], neg_hits['longitude']):
    folium.Marker(location=[lat, lon],
                  icon=folium.Icon(color='red')).add_to(neg_cluster)

negmap

The marker counts across the two polarity maps sum to 82182, which equals the total number of rows in the dataset, confirming that every review has been assigned a polarity label.