<a href="https://colab.research.google.com/github/gurbachansingh2000/gurbachansingh2000/blob/main/NLP_Case_Study_1_Group_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # Import stopwords
from nltk.stem import PorterStemmer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download the NLTK data used by the preprocessing step: the English stopword
# list and the 'punkt_tab' tokenizer data (presumably what word_tokenize loads
# in the installed NLTK version — confirm). Already-present packages are
# reported as up-to-date rather than fetched again.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Load the reviews dataset into a DataFrame.
# NOTE(review): the path is absolute (filesystem root); presumably this is where
# the file was uploaded in the Colab session — adjust when running elsewhere.
df = pd.read_csv(r'/reviews.csv')

In [None]:
# Preview the first 5 rows of the dataset
df.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [None]:
# Dataset dimensions as (rows, columns)
df.shape

(100000, 9)

In [None]:
# Text preprocessing: lowercase, strip non-letters, tokenize, remove stopwords, stem.
# The stopword set and the stemmer are built once and shared by every call,
# instead of being rebuilt for each review.
_PREPROCESS_RESOURCES = {}  # filled on first use by _preprocess_resources()


def _preprocess_resources():
    """Return the (stopword set, stemmer) pair, creating it only on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean one review and return it as a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete every character that is not a-z or
    whitespace, tokenize, drop English stopwords, apply Porter stemming.

    Characters are deleted rather than replaced by a space, so words separated
    only by punctuation are joined ("patio/roof" -> "patioroof") and
    contractions lose their apostrophe before the stopword check
    ("i've" -> "ive").

    Parameters:
    text: The raw review text. Any non-string value (e.g. None, or the float
          NaN that pandas uses for a missing cell) is treated as an empty review.

    Returns:
    The processed text; an empty string when nothing survives the cleaning or
    when the input is not a string.
    """
    # Missing/non-text cells would otherwise fail on .lower() and abort the
    # whole DataFrame.apply run.
    if not isinstance(text, str):
        return ''

    stop_words, stemmer = _preprocess_resources()

    lowered = text.lower()  # Convert to lowercase
    letters_only = re.sub(r'[^a-z\s]', '', lowered)  # Remove non-alphabet characters
    tokens = word_tokenize(letters_only)  # Tokenize text into words

    # Remove stopwords first, then stem what is left, and join back into a string
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [None]:
# Add a 'processed_text' column that stores the preprocessed version of each review
df['processed_text'] = df['text'].apply(preprocess_text)  # Run preprocess_text on every review

In [None]:
# Preview the first 5 rows again, now including the 'processed_text' column
df.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,processed_text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,decid eat awar go take hour begin end tri mult...
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,ive taken lot spin class year noth compar clas...
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,famili diner buffet eclect assort larg chicken...
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,wow yummi differ delici favorit lamb curri kor...
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,cute interior owner gave us tour upcom patioro...


In [None]:
# Show the original review text next to its processed form
df[['text', 'processed_text']]

Unnamed: 0,text,processed_text
0,"If you decide to eat here, just be aware it is...",decid eat awar go take hour begin end tri mult...
1,I've taken a lot of spin classes over the year...,ive taken lot spin class year noth compar clas...
2,Family diner. Had the buffet. Eclectic assortm...,famili diner buffet eclect assort larg chicken...
3,"Wow! Yummy, different, delicious. Our favo...",wow yummi differ delici favorit lamb curri kor...
4,Cute interior and owner (?) gave us tour of up...,cute interior owner gave us tour upcom patioro...
...,...,...
99995,Came here for lunch with a group. They were bu...,came lunch group busi still room us servic goo...
99996,The equipment is so old and so felty! I just u...,equip old felti upgrad multi club membership c...
99997,This is one of my favorite Mexican restaurants...,one favorit mexican restaur authent menu typic...
99998,Came here for brunch - had an omlette ($19 + t...,came brunch omlett tax tip food wayyyyyyy over...


In [None]:
# Label encoding (Positive, Negative)
# We label 'Negative' for <= 3 stars, 'Positive' for 4 or 5 stars
def encode_labels(stars):
    """Map a star rating to a sentiment label.

    Parameters:
    stars: The review's star rating.

    Returns:
    'Negative' when the rating is 3 or lower, otherwise 'Positive'.
    """
    return 'Negative' if stars <= 3 else 'Positive'

In [None]:
# Add a 'label' column that stores the sentiment label derived from 'stars'
df['label'] = df['stars'].apply(encode_labels)  # Map each star rating to 'Negative' / 'Positive'

In [None]:
# Show the star rating, its label, and both versions of the review text
df[['stars', 'label', 'text', 'processed_text']]

Unnamed: 0,stars,label,text,processed_text
0,3,Negative,"If you decide to eat here, just be aware it is...",decid eat awar go take hour begin end tri mult...
1,5,Positive,I've taken a lot of spin classes over the year...,ive taken lot spin class year noth compar clas...
2,3,Negative,Family diner. Had the buffet. Eclectic assortm...,famili diner buffet eclect assort larg chicken...
3,5,Positive,"Wow! Yummy, different, delicious. Our favo...",wow yummi differ delici favorit lamb curri kor...
4,4,Positive,Cute interior and owner (?) gave us tour of up...,cute interior owner gave us tour upcom patioro...
...,...,...,...,...
99995,4,Positive,Came here for lunch with a group. They were bu...,came lunch group busi still room us servic goo...
99996,1,Negative,The equipment is so old and so felty! I just u...,equip old felti upgrad multi club membership c...
99997,4,Positive,This is one of my favorite Mexican restaurants...,one favorit mexican restaur authent menu typic...
99998,2,Negative,Came here for brunch - had an omlette ($19 + t...,came brunch omlett tax tip food wayyyyyyy over...


In [None]:
# Hold out 20% of the reviews for testing; the remaining 80% is used for training.
# Features are the processed review text, targets are the sentiment labels.
x, y = df['processed_text'], df['label']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the same split is produced on every run
)

In [None]:
# Report the sizes of the training features and training labels
print('x_train shape : ', x_train.shape)
print('y_train shape : ', y_train.shape)

x_train shape :  (80000,)
y_train shape :  (80000,)


In [None]:
# Report the sizes of the test features and test labels
print('x_test shape : ', x_test.shape)
print('y_test shape : ', y_test.shape)

x_test shape :  (20000,)
y_test shape :  (20000,)


In [None]:
def plot_confusion_matrix(cm, labels=('Negative', 'Positive')):
    """
    Plots a confusion matrix as an annotated heatmap.

    Parameters:
    cm: Confusion matrix of integer counts (square array with one row and one
        column per class). Rows are shown as 'Actual', columns as 'Predicted'.
        Cells are formatted with 'd', so the values must be integers.
    labels: Class names used for the tick labels on both axes, given in the
        same order as the rows/columns of cm. Defaults to
        ('Negative', 'Positive').
    """
    class_names = list(labels)

    # Draw on an explicit Axes so the title and axis labels are attached to this
    # figure rather than to whichever figure pyplot currently considers active.
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

# Models