In [1]:
import pandas as pd
import re
import nltk

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Make sure the NLTK 'stopwords' corpus is available locally, then import its
# accessor. The logged output below shows the package was already up to date.
# NOTE(review): `stopwords` is not referenced in any cell visible here --
# confirm whether stop-word removal was intended before vectorizing.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chels\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [11]:
# Load the satisfaction-survey export (one free-text comment and one rating
# per row) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="satisfaction_review.csv")
df.head(n=5)

Unnamed: 0,Ticket satisfaction comment,Ticket satisfaction rating
0,amazing I found just what I was looking for,Good
1,best customer service ever!,Good
2,Best retail experience I’ve had in ages! \r\n...,Good
3,Called buckle first thing in the morning.. on...,Bad
4,Customer service is great,Good


In [12]:
# Discard every row that has a missing value in any column; the text-cleaning
# cells below call string functions on each comment and raise a TypeError on NaN.
df = df.dropna(axis=0, how='any')

In [13]:
# Sanity check: per-column count of missing values (expected to be all zeros).
df.isnull().sum(axis=0)

Ticket satisfaction comment    0
Ticket satisfaction rating     0
dtype: int64

In [14]:
# Strip commas, periods, exclamation marks and question marks from each comment
# and store the result in a new 'comment_processed' column.
#
# The pattern is written as a raw string: the previous non-raw '[,\.!?]' relied
# on the invalid escape sequence '\.', which Python reports as a
# DeprecationWarning (SyntaxWarning from 3.12). Inside a character class '.' is
# already literal, so the set of removed characters is unchanged.
df['comment_processed'] = df['Ticket satisfaction comment'].str.replace(
    r'[,.!?]', '', regex=True
)

In [15]:
# Normalize case: lower-case every cleaned comment in place.
df['comment_processed'] = df['comment_processed'].map(str.lower)

In [16]:
# Preview the first five cleaned comments.
df.loc[:, 'comment_processed'].head(n=5)

0         amazing i found just what i was looking for 
1                          best customer service ever 
2     best retail experience i’ve had in ages \r\nt...
3     called buckle first thing in the morning on f...
4                            customer service is great
Name: comment_processed, dtype: object

In [19]:
# Show raw and cleaned comments side by side for the first five rows.
df.head(n=5)

Unnamed: 0,Ticket satisfaction comment,Ticket satisfaction rating,comment_processed
0,amazing I found just what I was looking for,Good,amazing i found just what i was looking for
1,best customer service ever!,Good,best customer service ever
2,Best retail experience I’ve had in ages! \r\n...,Good,best retail experience i’ve had in ages \r\nt...
3,Called buckle first thing in the morning.. on...,Bad,called buckle first thing in the morning on f...
4,Customer service is great,Good,customer service is great


In [22]:
from sklearn.preprocessing import LabelEncoder

# Turn the textual rating into an integer target column. The encoder is kept
# in `le` so the integer codes can be mapped back to the original labels; in
# the frame displayed further down, 'Bad' is encoded as 0 and 'Good' as 1.
le = LabelEncoder()
le.fit(df['Ticket satisfaction rating'])
df['sentiment_label'] = le.transform(df['Ticket satisfaction rating'])


In [23]:
# Build the model inputs: `y` is the encoded rating, `X` is the TF-IDF matrix
# of the cleaned comments.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in a later cell, so the vocabulary and IDF weights are learned from
# test documents too. Consider fitting on the training rows only (e.g. via a
# Pipeline) to avoid this leakage.
y = df['sentiment_label']

vectorizer = TfidfVectorizer()
vectorizer.fit(df['comment_processed'])
X = vectorizer.transform(df['comment_processed'])

In [24]:
# Confirm the new 'sentiment_label' column lines up with the textual rating.
df.head(n=5)

Unnamed: 0,Ticket satisfaction comment,Ticket satisfaction rating,comment_processed,sentiment_label
0,amazing I found just what I was looking for,Good,amazing i found just what i was looking for,1
1,best customer service ever!,Good,best customer service ever,1
2,Best retail experience I’ve had in ages! \r\n...,Good,best retail experience i’ve had in ages \r\nt...,1
3,Called buckle first thing in the morning.. on...,Bad,called buckle first thing in the morning on f...,0
4,Customer service is great,Good,customer service is great,1


In [25]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Fit a logistic-regression classifier (library defaults) on the training
# split. `fit` returns the estimator itself, so the final bare `clf` shows the
# same repr the cell displayed before.
clf = LogisticRegression().fit(X_train, y_train)
clf

LogisticRegression()

In [27]:
# Score the held-out split. Precision, recall and F1 are support-weighted
# averages over both classes.
# NOTE(review): support-weighted recall is mathematically equal to accuracy,
# so those two numbers will always match; per-class or binary scores would be
# more informative here.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

In [28]:
# Report each metric rounded to two decimal places.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 score: {f1:.2f}')

Accuracy: 0.94
Precision: 0.94
Recall: 0.94
F1 score: 0.94


In [30]:
from sklearn.metrics import confusion_matrix

# Re-predict on the held-out split so this cell does not depend on the metrics
# cell having been run, then tabulate true vs. predicted labels.
# In scikit-learn's layout, rows are the true classes and columns are the
# predicted classes, both in sorted label order.
y_pred = clf.predict(X_test)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_mat)


[[1032   63]
 [ 103 1776]]


In [None]:
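# Reading the matrix above: rows are true labels, columns are predicted labels,
# in sorted class order (0 = Bad, 1 = Good). With Good (1) as the positive class:
# TN: 1,032  (true Bad, predicted Bad)
# FP: 63     (true Bad, predicted Good)
# FN: 103    (true Good, predicted Bad)
# TP: 1,776  (true Good, predicted Good)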