In [2]:
import pandas as pd

total_data = pd.read_csv("../DATASET2.csv")
total_data.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",POSITIVE
1,Please ignore previous negative rating. This a...,POSITIVE
2,"This pop-up ""Get the best Spotify experience o...",NEGATIVE
3,Really buggy and terrible to use as of recently,NEGATIVE
4,Dear Spotify why do I get songs that I didn't ...,NEGATIVE


In [3]:
total_data["label"] = total_data["label"].apply(lambda x: 1 if x == "POSITIVE" else 0).astype(int)
total_data.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",1
1,Please ignore previous negative rating. This a...,1
2,"This pop-up ""Get the best Spotify experience o...",0
3,Really buggy and terrible to use as of recently,0
4,Dear Spotify why do I get songs that I didn't ...,0


In [4]:
print(total_data.shape)
total_data = total_data.drop_duplicates()
total_data = total_data.dropna()
total_data = total_data.reset_index(inplace = False, drop = True)
total_data.shape

(753, 2)


(753, 2)

In [6]:
import regex as re

def preprocess_text(text):
    # Remove any character that is not a letter (a-z) or white space ( )
    text = re.sub(r'[^a-z ]', " ", text)
    
    # Remove white spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', " ", text)
    text = re.sub(r'\^[a-zA-Z]\s+', " ", text)

    # Multiple white spaces into one
    text = re.sub(r'\s+', " ", text.lower())

    # Remove tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

    return text.split()

total_data["Review"] = total_data["Review"].apply(preprocess_text)
total_data.head()

Unnamed: 0,Review,label
0,"[reat, music, service, the, audio, is, high, q...",1
1,"[lease, ignore, previous, negative, rating, hi...",1
2,"[his, pop, up, et, the, best, potify, experien...",0
3,"[eally, buggy, and, terrible, to, use, as, of,...",0
4,"[ear, potify, why, do, get, songs, that, didn,...",0


In [7]:
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
download("wordnet")
lemmatizer = WordNetLemmatizer()

download("stopwords")
stop_words = stopwords.words("english")

def lemmatize_text(words, lemmatizer = lemmatizer):
    tokens = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if len(word) > 3]
    return tokens

total_data["Review"] = total_data["Review"].apply(lemmatize_text)
total_data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Review,label
0,"[reat, music, service, audio, high, quality, e...",1
1,"[lease, ignore, previous, negative, rating, su...",1
2,"[best, potify, experience, ndroid, annoying, l...",0
3,"[eally, buggy, terrible, recently]",0
4,"[potify, song, playlist, shuffle, play]",0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tokens_list = total_data["Review"]
tokens_list = [" ".join(tokens) for tokens in tokens_list]
print(tokens_list)

vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
X = vectorizer.fit_transform(tokens_list).toarray()
y = total_data["label"]

X[:5]

['reat music service audio high quality easy quick friendly support', 'lease ignore previous negative rating super great give five star', 'best potify experience ndroid annoying lease', 'eally buggy terrible recently', 'potify song playlist shuffle play', 'player control sometimes disappear reason restart forgets playing issue', 'love selection lyric provided song listening', 'till extremely slow changing storage external card convinced done purpose spotify know issue done solve time changed card faster read write speed samsung brand please like song never appear search playlist', 'great best music ever used problem play song find song despite wonderful recommend best', 'deleting following reason failing business model hether streaming service like consumer want music fully successively upon logging single song much losed number patient profit already peaked left decline', 'play potify', 'amazon premium music family package good veryone could listen whatever liked respective alexas roo

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
from sklearn.svm import SVC

model = SVC(kernel = "linear", random_state = 42)
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0])

In [12]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8079470198675497

In [13]:
from sklearn.model_selection import GridSearchCV

hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "kernel": ["linear", "poly", "rbf", "sigmoid", "precomputed’"],
    "degree": [1, 2, 3, 4, 5],
    "gamma": ["scale", "auto"]
}

# We initialize the random search
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)
grid

In [14]:
grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'C': 1, 'degree': 1, 'gamma': 'scale', 'kernel': 'linear'}


350 fits failed out of a total of 1750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
350 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_paramete

In [15]:
opt_model = SVC(C = 1, degree = 1, gamma = "scale", kernel = "linear", random_state = 42)
opt_model.fit(X_train, y_train)
y_pred = opt_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8079470198675497

In [16]:
from pickle import dump
import pickle

dump(opt_model, open("svm_classifier_C-1_deg-1_gam-scale_ker-linear_42.sav", "wb"))
with open("tfidf_vectorizer.sav", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)