In [1]:
# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble


In [8]:
def classification_report_tf_idf(x_train, x_test, y_train, y_test, max_features: int = None, analyzer: str = "word", n_gram=(1, 1)):
  models = []
  models.append(('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')))
  models.append(('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)))

  vectorizer = TfidfVectorizer( max_features=max_features,analyzer=analyzer, ngram_range=n_gram, use_idf=True)
  vectorizer.fit(pd.concat([x_train, x_test]))
  x_train = vectorizer.transform(x_train)
  x_test = vectorizer.transform(x_test)

  for name, model in models:
      clf=model
      clf.fit(x_train, y_train)
      y_pred=clf.predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [3]:
tweet_train = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_train.csv")
tweet_test = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_test.csv")

In [4]:
tweet_test.dropna(inplace=True)
tweet_test.reset_index(drop=True,inplace=True)

tweet_train.dropna(inplace=True)
tweet_train.reset_index(drop=True,inplace=True)

x_train=tweet_train.tweet
y_train=tweet_train.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values
x_test=tweet_test.tweet
y_test=tweet_test.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values

# analyzer "word" "char"
# n_gram    (1,1) (2,2) (3,3)
# max_features 25 50 75 100

# max features 50

In [9]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=50)

LogisticRegression:
accuracy: 0.4915893271461717
precision: 0.5044027817560593
recall: 0.47742582149399077
              precision    recall  f1-score   support

          -1       0.54      0.55      0.55      1373
           0       0.42      0.53      0.47      1160
           1       0.55      0.34      0.42       915

    accuracy                           0.49      3448
   macro avg       0.50      0.48      0.48      3448
weighted avg       0.50      0.49      0.49      3448

RandomForest:
accuracy: 0.4872389791183295
precision: 0.49994679351475063
recall: 0.4759074519112554
              precision    recall  f1-score   support

          -1       0.54      0.51      0.53      1373
           0       0.42      0.56      0.48      1160
           1       0.54      0.35      0.42       915

    accuracy                           0.49      3448
   macro avg       0.50      0.48      0.48      3448
weighted avg       0.50      0.49      0.48      3448



# max_features 25

In [10]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=25)

LogisticRegression:
accuracy: 0.4538863109048724
precision: 0.4749377269496584
recall: 0.438765280347023
              precision    recall  f1-score   support

          -1       0.51      0.47      0.49      1373
           0       0.40      0.59      0.48      1160
           1       0.52      0.25      0.34       915

    accuracy                           0.45      3448
   macro avg       0.47      0.44      0.43      3448
weighted avg       0.47      0.45      0.44      3448

RandomForest:
accuracy: 0.4544663573085847
precision: 0.4799066018306361
recall: 0.4435004329689935
              precision    recall  f1-score   support

          -1       0.52      0.44      0.48      1373
           0       0.39      0.61      0.48      1160
           1       0.53      0.28      0.37       915

    accuracy                           0.45      3448
   macro avg       0.48      0.44      0.44      3448
weighted avg       0.48      0.45      0.45      3448



# max features 75

In [11]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=75)

LogisticRegression:
accuracy: 0.5121809744779582
precision: 0.5233165493727364
recall: 0.49606447676626947
              precision    recall  f1-score   support

          -1       0.55      0.60      0.57      1373
           0       0.45      0.53      0.48      1160
           1       0.57      0.36      0.44       915

    accuracy                           0.51      3448
   macro avg       0.52      0.50      0.50      3448
weighted avg       0.52      0.51      0.51      3448

RandomForest:
accuracy: 0.5055104408352669
precision: 0.5153053150149907
recall: 0.49340202624639784
              precision    recall  f1-score   support

          -1       0.56      0.55      0.55      1373
           0       0.44      0.57      0.50      1160
           1       0.55      0.37      0.44       915

    accuracy                           0.51      3448
   macro avg       0.52      0.49      0.50      3448
weighted avg       0.52      0.51      0.50      3448



# max features 100

In [12]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=100)

LogisticRegression:
accuracy: 0.5258120649651972
precision: 0.5365645117163295
recall: 0.5104116737077594
              precision    recall  f1-score   support

          -1       0.55      0.62      0.58      1373
           0       0.46      0.53      0.49      1160
           1       0.59      0.39      0.47       915

    accuracy                           0.53      3448
   macro avg       0.54      0.51      0.52      3448
weighted avg       0.53      0.53      0.52      3448

RandomForest:
accuracy: 0.5246519721577726
precision: 0.5326381916743311
recall: 0.5122771184087053
              precision    recall  f1-score   support

          -1       0.57      0.57      0.57      1373
           0       0.46      0.57      0.51      1160
           1       0.57      0.39      0.46       915

    accuracy                           0.52      3448
   macro avg       0.53      0.51      0.52      3448
weighted avg       0.53      0.52      0.52      3448



# analyzer "word"

In [13]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=100)

LogisticRegression:
accuracy: 0.5258120649651972
precision: 0.5365645117163295
recall: 0.5104116737077594
              precision    recall  f1-score   support

          -1       0.55      0.62      0.58      1373
           0       0.46      0.53      0.49      1160
           1       0.59      0.39      0.47       915

    accuracy                           0.53      3448
   macro avg       0.54      0.51      0.52      3448
weighted avg       0.53      0.53      0.52      3448

RandomForest:
accuracy: 0.519431554524362
precision: 0.5261904998083718
recall: 0.5087455596604008
              precision    recall  f1-score   support

          -1       0.57      0.56      0.56      1373
           0       0.46      0.57      0.51      1160
           1       0.55      0.40      0.46       915

    accuracy                           0.52      3448
   macro avg       0.53      0.51      0.51      3448
weighted avg       0.53      0.52      0.52      3448



# analyzer "char"

In [14]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "char",
                      n_gram = (1,1),
                      max_features=100)

LogisticRegression:
accuracy: 0.46403712296983757
precision: 0.45942918551524453
recall: 0.4296728365605981
              precision    recall  f1-score   support

          -1       0.47      0.72      0.57      1373
           0       0.47      0.37      0.41      1160
           1       0.44      0.20      0.28       915

    accuracy                           0.46      3448
   macro avg       0.46      0.43      0.42      3448
weighted avg       0.46      0.46      0.44      3448

RandomForest:
accuracy: 0.5185614849187935
precision: 0.5282279299267988
recall: 0.48979378928177475
              precision    recall  f1-score   support

          -1       0.51      0.74      0.61      1373
           0       0.50      0.43      0.46      1160
           1       0.57      0.30      0.39       915

    accuracy                           0.52      3448
   macro avg       0.53      0.49      0.49      3448
weighted avg       0.52      0.52      0.50      3448



# n gram (1,1)

In [15]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (1,1),
                      max_features=100)

LogisticRegression:
accuracy: 0.5258120649651972
precision: 0.5365645117163295
recall: 0.5104116737077594
              precision    recall  f1-score   support

          -1       0.55      0.62      0.58      1373
           0       0.46      0.53      0.49      1160
           1       0.59      0.39      0.47       915

    accuracy                           0.53      3448
   macro avg       0.54      0.51      0.52      3448
weighted avg       0.53      0.53      0.52      3448

RandomForest:
accuracy: 0.5226218097447796
precision: 0.5293867198808954
recall: 0.5100427294123804
              precision    recall  f1-score   support

          -1       0.56      0.58      0.57      1373
           0       0.46      0.56      0.51      1160
           1       0.56      0.39      0.46       915

    accuracy                           0.52      3448
   macro avg       0.53      0.51      0.51      3448
weighted avg       0.53      0.52      0.52      3448



# n_gram (2,2)

In [16]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (2,2),
                      max_features=100)

LogisticRegression:
accuracy: 0.4431554524361949
precision: 0.5388464023535632
recall: 0.39091601019106853
              precision    recall  f1-score   support

          -1       0.42      0.94      0.58      1373
           0       0.55      0.10      0.17      1160
           1       0.65      0.13      0.22       915

    accuracy                           0.44      3448
   macro avg       0.54      0.39      0.32      3448
weighted avg       0.52      0.44      0.35      3448

RandomForest:
accuracy: 0.4448955916473318
precision: 0.5456370758335706
recall: 0.3926682285481579
              precision    recall  f1-score   support

          -1       0.42      0.95      0.59      1373
           0       0.58      0.09      0.16      1160
           1       0.64      0.14      0.23       915

    accuracy                           0.44      3448
   macro avg       0.55      0.39      0.32      3448
weighted avg       0.53      0.44      0.35      3448



# ngram (3,3)

In [17]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "word",
                      n_gram = (3,3),
                      max_features=100)

LogisticRegression:
accuracy: 0.4245939675174014
precision: 0.6279432692989355
recall: 0.3624230684617096
              precision    recall  f1-score   support

          -1       0.41      0.99      0.58      1373
           0       0.77      0.06      0.11      1160
           1       0.70      0.04      0.07       915

    accuracy                           0.42      3448
   macro avg       0.63      0.36      0.25      3448
weighted avg       0.61      0.42      0.29      3448

RandomForest:
accuracy: 0.4245939675174014
precision: 0.6329897645445753
recall: 0.3624230684617096
              precision    recall  f1-score   support

          -1       0.41      0.99      0.58      1373
           0       0.77      0.06      0.11      1160
           1       0.72      0.04      0.07       915

    accuracy                           0.42      3448
   macro avg       0.63      0.36      0.25      3448
weighted avg       0.61      0.42      0.29      3448



In [21]:
classification_report(x_train,
                      x_test,
                      y_train,
                      y_test,
                      analyzer = "char",
                      n_gram = (1,3),
                      max_features=200)

LogisticRegression:
accuracy: 0.5501740139211136
precision: 0.5444765070380967
recall: 0.5329263632942205
              precision    recall  f1-score   support

          -1       0.57      0.69      0.63      1373
           0       0.53      0.48      0.50      1160
           1       0.53      0.43      0.47       915

    accuracy                           0.55      3448
   macro avg       0.54      0.53      0.53      3448
weighted avg       0.55      0.55      0.54      3448

RandomForest:
accuracy: 0.5907772621809745
precision: 0.6071608030326847
recall: 0.5644255918001542
              precision    recall  f1-score   support

          -1       0.57      0.77      0.66      1373
           0       0.59      0.54      0.57      1160
           1       0.66      0.38      0.48       915

    accuracy                           0.59      3448
   macro avg       0.61      0.56      0.57      3448
weighted avg       0.60      0.59      0.58      3448

