In [1]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from text_preprocessor import process_df



# Load the review data through the project's preprocessing helper.
# NOTE(review): `to_drop` presumably lists columns that process_df removes
# from the result -- confirm against text_preprocessor.
cleaned_df = process_df(
    "ice_cream_reviews.csv",
    to_drop=["author", "date", "helpful_yes", "helpful_no"],
)

# Number of leading rows held out for evaluation. Defined once so the two
# slice boundaries below can never drift apart (a mismatch would either leak
# test rows into training or silently drop rows).
N_TEST_ROWS = 1600

# Ordered, unshuffled split: the first N_TEST_ROWS rows form the test set and
# every row after them forms the training set.
# NOTE(review): assumes the row order of cleaned_df is not correlated with the
# label -- confirm, or shuffle before splitting.
# Despite the X_ prefix, both frames keep all columns of cleaned_df; later
# cells read both "review" and "sentiment" from them.
X_train = cleaned_df[N_TEST_ROWS:]
X_test = cleaned_df[:N_TEST_ROWS]

In [2]:
# Create a TF-IDF (term frequency - inverse document frequency) vectorizer
# using the library's default settings.
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training reviews and, in the same call, transform
# them into a document-term matrix (stored as a sparse matrix).
train_reviews = X_train["review"]
train_tfidf = tfidf.fit_transform(train_reviews)

# The test reviews are only transformed with the already-fitted vectorizer;
# it is not re-fitted on them.
test_reviews = X_test["review"]
test_tfidf = tfidf.transform(test_reviews)

# Last expression of the cell: show the summary repr of the training matrix.
train_tfidf

<3732x6698 sparse matrix of type '<class 'numpy.float64'>'
	with 74416 stored elements in Compressed Sparse Row format>

In [3]:
# Train a linear-kernel support vector classifier on the TF-IDF training
# matrix; fit() returns the estimator, so construction and fitting are chained.
model = SVC(kernel="linear").fit(train_tfidf, X_train["sentiment"])
prediction = model.predict(test_tfidf)

# Per-class precision, recall, f1-score and support, returned as a nested dict
# instead of a formatted string.
y_test = X_test["sentiment"]
report = classification_report(y_test, prediction, output_dict=True)

# Print the metrics for class "1" first, then class "0".
for label in ("1", "0"):
    print(report[label])

{'precision': 0.8952380952380953, 'recall': 0.9652448657187994, 'f1-score': 0.9289243633599392, 'support': 1266}
{'precision': 0.8127659574468085, 'recall': 0.5718562874251497, 'f1-score': 0.671353251318102, 'support': 334}
