# Building a Custom Classifier

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Requires `data` (a DataFrame with a raw 'text' column) to be defined by an earlier cell.
# Each raw entry is expected to look like ("some sentence", "positive"); split it into
# the sentence itself and its sentiment label.
parsed = data['text'].str.extract(r'\("(.+)",\s*"(positive|negative)"\)')
# Rows that do not match the pattern come back as NaN. Stop before overwriting `data`:
# this also catches an accidental second run of this cell, after 'text' has already
# been replaced by the bare sentence and no longer matches.
unparsed = parsed[0].isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} of {len(data)} rows in data['text'] are not in the "
        'expected ("sentence", "positive|negative") format; reload `data` before '
        "re-running this cell"
    )
data[['text', 'sentiment']] = parsed
# Show the first few rows
data.head()

                                                text sentiment
0    i love spending time with my friends and family  positive
1    that was the best meal i've ever had in my life  positive
2  i feel so grateful for everything i have in my...  positive
3  i received a promotion at work and i couldn't ...  positive
4  watching a beautiful sunset always fills me wi...  positive


In [11]:
# Shuffle the rows and renumber the index 0..n-1. The fixed seed gives the same row
# order on every Restart & Run All; re-running only this cell shuffles the already
# shuffled frame again.
data = data.sample(frac=1, random_state=7).reset_index(drop=True)

In [12]:
# Inputs are the sentences, targets are their sentiment labels
X, y = data['text'], data['sentiment']

In [13]:
# Bag-of-words encoding: learn the vocabulary from X and count each word per document
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(X)
# Densify the counts into a DataFrame with one column per vocabulary word
# NOTE(review): the vocabulary is fit on all of X, i.e. before the train/test split
# further down - confirm this is intended
bag_of_words = pd.DataFrame(
    data=countvec_fit.toarray(),
    columns=countvec.get_feature_names_out(),
)

In [14]:
# Preview the first rows only instead of rendering the whole (very wide) matrix
bag_of_words.head(10)

Unnamed: 0,accomplishment,after,always,am,an,and,argument,at,away,be,...,vacation,ve,wanted,was,watching,we,week,who,with,work
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words,
    y,
    test_size=0.3,
    random_state=7,
)

## Logistic Regression

In [16]:
# Build the logistic-regression model, then train it on the training split
# (fit() returns the estimator itself, so `lr` ends up as the fitted model)
lr = LogisticRegression(random_state=1)
lr = lr.fit(X_train, y_train)

In [17]:
# Predicted sentiment label for every held-out row
y_pred_lr = lr.predict(X=X_test)

In [18]:
# Fraction of test rows classified correctly. scikit-learn metrics take
# (y_true, y_pred) in that order, matching the classification_report call.
accuracy_score(y_test, y_pred_lr)

0.16666666666666666

In [19]:
# Inspect the labels the logistic-regression model predicted for the test rows
y_pred_lr

array(['positive', 'positive', 'negative', 'positive', 'positive',
       'positive'], dtype=object)

In [20]:
# Inspect the true labels of the test rows, for comparison with the predictions
y_test

1     negative
17    negative
2     positive
5     positive
11    negative
0     negative
Name: sentiment, dtype: object

In [21]:
# Per-class precision / recall / F1 for logistic regression on the test split;
# zero_division=0 reports 0 for any metric whose denominator is zero
print(classification_report(y_true=y_test, y_pred=y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
    positive       0.20      0.50      0.29         2

    accuracy                           0.17         6
   macro avg       0.10      0.25      0.14         6
weighted avg       0.07      0.17      0.10         6



## Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
# Build the multinomial Naive Bayes model, then train it on the word counts
# (fit() returns the estimator itself, so `nb` ends up as the fitted model)
nb = MultinomialNB()
nb = nb.fit(X_train, y_train)

In [24]:
# Predicted sentiment label for every held-out row
y_pred_nb = nb.predict(X=X_test)

In [25]:
# Fraction of test rows classified correctly. scikit-learn metrics take
# (y_true, y_pred) in that order, matching the classification_report call.
accuracy_score(y_test, y_pred_nb)

0.5

In [26]:
# Per-class precision / recall / F1 for Naive Bayes on the test split;
# zero_division=0 reports 0 for any metric whose denominator is zero
print(classification_report(y_true=y_test, y_pred=y_pred_nb, zero_division=0))

              precision    recall  f1-score   support

    negative       0.67      0.50      0.57         4
    positive       0.33      0.50      0.40         2

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.49         6
weighted avg       0.56      0.50      0.51         6



## Linear Support Vector Machine

In [27]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [28]:
# SGDClassifier's default hinge loss trains a linear SVM. Its SGD passes shuffle the
# training rows, so pin the seed (as the LogisticRegression cell does) to get the
# same model on every run.
# Other hyperparameters worth tuning: loss function, regularization.
svm = SGDClassifier(random_state=1).fit(X_train, y_train)

In [29]:
# Predicted sentiment label for every held-out row
y_pred_svm = svm.predict(X=X_test)

In [30]:
# Fraction of test rows classified correctly. scikit-learn metrics take
# (y_true, y_pred) in that order, matching the classification_report call.
accuracy_score(y_test, y_pred_svm)

0.16666666666666666

In [31]:
# Per-class precision / recall / F1 for the linear SVM on the test split;
# zero_division=0 reports 0 for any metric whose denominator is zero
print(classification_report(y_true=y_test, y_pred=y_pred_svm, zero_division=0))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
    positive       0.20      0.50      0.29         2

    accuracy                           0.17         6
   macro avg       0.10      0.25      0.14         6
weighted avg       0.07      0.17      0.10         6

