In [1]:
# import libraries
from gensim.models.word2vec import Word2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import xgboost 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc="progress-bar")
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from collections import Counter
from nltk import ngrams

  from pandas import Panel


In [23]:
def classification_report(x_train, x_test, y_train, y_test):
  models = []
  models.append(('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')))
  models.append(('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)))

  for name, model in models:
      clf=model
      clf.fit(x_train, y_train)
      y_pred=clf.predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [3]:
def get_word_counts(data):
  words = data.tweet.to_string().split()
  return Counter(words)
  

In [4]:
# word2vec hyper parameters
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı


In [5]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [6]:
def train_word2vec(corpus, n_epoch, name_corpus, sg, vector_size, negative, window, min_count, alpha, min_alpha):
  cores = multiprocessing.cpu_count()
  model = Word2Vec(sg=sg, size=vector_size, negative=negative, window=window, min_count=min_count, workers=cores, alpha=alpha, min_alpha=min_alpha)
  model.build_vocab([x.words for x in tqdm(corpus)])

  for epoch in range(n_epoch):
    model.train(utils.shuffle([x.words for x in tqdm(corpus)]), total_examples=len(corpus), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

  model.save(f"/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/trained_embeddings/{name_corpus}_sg_{sg}_size_{vector_size}_window_{window}_min_count_{min_count}.model")
  return model

In [7]:
def get_mean_vector(model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in model.wv]
    if len(words) >= 1:
        return np.mean(model[words], axis=0)
    else:
        return np.zeros((1, model.vector_size))

In [8]:
def get_vectors(model, corpus):
  vectors = []
  for sentence in corpus:
      vec = get_mean_vector(model, sentence)
      vectors.append(vec)
  return vectors

In [9]:
def get_max_len_sentence(series):
  res = series.str.split().str.len().max()

  print(f"The maximum length in words are : {res}") 

**TWEET METINLERI İÇİN WORD2VEC BENCHMARK**

In [11]:
tweet_train = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_train.csv")
tweet_test = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_test.csv")

In [12]:
tweet_test.dropna(inplace=True)
tweet_test.reset_index(drop=True,inplace=True)

tweet_train.dropna(inplace=True)
tweet_train.reset_index(drop=True,inplace=True)

x_train=tweet_train.tweet
y_train=tweet_train.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values
x_test=tweet_test.tweet
y_test=tweet_test.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values

In [13]:
concat = pd.concat([x_train, x_test])
corpus = labelize_tweets_ug(concat, 'all')

In [14]:
corpus_train = pd.DataFrame(x_train)['tweet'].apply(lambda x: x.split())
corpus_test = pd.DataFrame(x_test)['tweet'].apply(lambda x: x.split())

In [15]:
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı

In [16]:
get_max_len_sentence(pd.DataFrame(concat).tweet)

The maximum length in words are : 27


# sg=0 epoch=25 negative=5 için (sg karar vermek için ilk deneme ikisi için yapılacak)
# alpha 0.5 0.01 0.05 0.1 her biri window = 3, vector_size = 150 kullanılacak 
# window 3 5 7 her biri alpha = 0.05, vector_size = 150 kullanılacak 
# vector_size 25 50 150 200 her biri  için window = 3, alpha = 0.05

In [None]:
model_1 = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.5,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

In [18]:
vectors_train = get_vectors(model=model_1,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model_1,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [19]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.5481438515081206
precision: 0.5475194240183531
recall: 0.5290639683382075
              precision    recall  f1-score   support

          -1       0.56      0.70      0.62      1373
           0       0.52      0.48      0.50      1160
           1       0.56      0.41      0.47       915

    accuracy                           0.55      3448
   macro avg       0.55      0.53      0.53      3448
weighted avg       0.55      0.55      0.54      3448

RandomForest:
accuracy: 0.5580046403712297
precision: 0.5753731214120604
recall: 0.5341337792989448
              precision    recall  f1-score   support

          -1       0.54      0.76      0.63      1373
           0       0.56      0.45      0.50      1160
           1       0.63      0.39      0.48       915

    accuracy                           0.56      3448
   macro avg       0.58      0.53      0.54      3448
weighted avg       0.57      0.56      0.55      3448

SVM:
accuracy: 0.555104408352668

In [20]:
# deneme 2
model_2 = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=1, 
                         negative=5,
                         alpha = 0.5,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 17222/17222 [00:00<00:00, 1182153.43it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1588265.25it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1536573.14it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1686338.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1906119.47it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1524863.39it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1882229.03it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2095570.16it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1987735.37it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1889911.40it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1761812.28it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1874998.14it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1647229.40it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2091020.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1745169.32it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1630755.24it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1974046.33it/

In [21]:
vectors_train = get_vectors(model=model_2,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model_2,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [22]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.5417633410672854
precision: 0.5452278884679288
recall: 0.5214176098821507
              precision    recall  f1-score   support

          -1       0.54      0.70      0.61      1373
           0       0.52      0.47      0.49      1160
           1       0.57      0.39      0.47       915

    accuracy                           0.54      3448
   macro avg       0.55      0.52      0.52      3448
weighted avg       0.54      0.54      0.53      3448

RandomForest:
accuracy: 0.537122969837587
precision: 0.5474788257797004
recall: 0.5113636633957368
              precision    recall  f1-score   support

          -1       0.52      0.73      0.61      1373
           0       0.54      0.46      0.49      1160
           1       0.58      0.34      0.43       915

    accuracy                           0.54      3448
   macro avg       0.55      0.51      0.51      3448
weighted avg       0.54      0.54      0.52      3448

SVM:
accuracy: 0.5246519721577726

# yukarıdaki sonuçlara göre sg=0 -> cbow yöntemi için daha başarılı oldu onunla diğer sonuçlar alınacak

# alpha(learning rate) = 0.01

In [None]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.01,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

In [27]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [28]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.44895591647331784
precision: 0.4755311665469673
recall: 0.4123676235443822
              precision    recall  f1-score   support

          -1       0.44      0.76      0.56      1373
           0       0.44      0.28      0.34      1160
           1       0.55      0.20      0.29       915

    accuracy                           0.45      3448
   macro avg       0.48      0.41      0.40      3448
weighted avg       0.47      0.45      0.41      3448

RandomForest:
accuracy: 0.49564965197215777
precision: 0.4971043298193725
recall: 0.4751265317516162
              precision    recall  f1-score   support

          -1       0.50      0.66      0.57      1373
           0       0.47      0.42      0.44      1160
           1       0.52      0.35      0.42       915

    accuracy                           0.50      3448
   macro avg       0.50      0.48      0.48      3448
weighted avg       0.50      0.50      0.49      3448



# learning rate 0.05

In [None]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

In [30]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [31]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.6189095127610209
precision: 0.6152101408702348
recall: 0.6031816644939304
              precision    recall  f1-score   support

          -1       0.63      0.75      0.69      1373
           0       0.60      0.56      0.58      1160
           1       0.61      0.51      0.55       915

    accuracy                           0.62      3448
   macro avg       0.62      0.60      0.61      3448
weighted avg       0.62      0.62      0.61      3448

RandomForest:
accuracy: 0.6302204176334106
precision: 0.6355996715037645
recall: 0.6113736910573524
              precision    recall  f1-score   support

          -1       0.62      0.79      0.69      1373
           0       0.63      0.54      0.58      1160
           1       0.65      0.50      0.57       915

    accuracy                           0.63      3448
   macro avg       0.64      0.61      0.62      3448
weighted avg       0.63      0.63      0.62      3448



# learning rate 0.1

In [None]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.1,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

In [33]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [34]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.6218097447795824
precision: 0.618427849527626
recall: 0.6064234412052627
              precision    recall  f1-score   support

          -1       0.63      0.75      0.69      1373
           0       0.62      0.55      0.58      1160
           1       0.61      0.52      0.56       915

    accuracy                           0.62      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.62      0.62      0.62      3448

RandomForest:
accuracy: 0.6218097447795824
precision: 0.6326078442422531
recall: 0.5969728561904776
              precision    recall  f1-score   support

          -1       0.61      0.82      0.70      1373
           0       0.62      0.54      0.58      1160
           1       0.67      0.44      0.53       915

    accuracy                           0.62      3448
   macro avg       0.63      0.60      0.60      3448
weighted avg       0.63      0.62      0.61      3448



# window = 3

In [35]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 17222/17222 [00:00<00:00, 1365616.85it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1422327.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1999288.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1556002.49it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1823868.29it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2105281.21it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1830848.67it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1613561.41it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1485477.28it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1727804.04it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1732528.33it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1864495.99it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2031164.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1755859.49it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1948118.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1207467.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1839942.52it/

In [36]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [37]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.6189095127610209
precision: 0.6168600466393709
recall: 0.6027603059955581
              precision    recall  f1-score   support

          -1       0.63      0.75      0.69      1373
           0       0.60      0.55      0.58      1160
           1       0.62      0.50      0.56       915

    accuracy                           0.62      3448
   macro avg       0.62      0.60      0.61      3448
weighted avg       0.62      0.62      0.61      3448

RandomForest:
accuracy: 0.6342807424593968
precision: 0.6372955795484575
recall: 0.6156354392383405
              precision    recall  f1-score   support

          -1       0.63      0.79      0.70      1373
           0       0.64      0.54      0.59      1160
           1       0.65      0.51      0.57       915

    accuracy                           0.63      3448
   macro avg       0.64      0.62      0.62      3448
weighted avg       0.64      0.63      0.63      3448



# window size = 5

In [None]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 5,
                         vector_size=150,
                         min_count=2)

In [39]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

  """
  
  


LogisticRegression:
accuracy: 0.6247099767981439
precision: 0.6236967945700024
recall: 0.6091632675775828
              precision    recall  f1-score   support

          -1       0.63      0.76      0.69      1373
           0       0.61      0.55      0.58      1160
           1       0.63      0.52      0.57       915

    accuracy                           0.62      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.62      0.62      0.62      3448

RandomForest:
accuracy: 0.6337006960556845
precision: 0.6380770695875175
recall: 0.6143761774325188
              precision    recall  f1-score   support

          -1       0.62      0.80      0.70      1373
           0       0.63      0.54      0.59      1160
           1       0.65      0.50      0.57       915

    accuracy                           0.63      3448
   macro avg       0.64      0.61      0.62      3448
weighted avg       0.64      0.63      0.63      3448



# window size = 7

In [40]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1530939.18it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2073850.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1746181.82it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1828716.54it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1892188.70it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1857448.21it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1556438.34it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1564393.46it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1705972.88it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1931243.58it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1783915.43it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1864351.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1863678.20it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1784091.67it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1920512.16it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1473628.13it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1843087.96it/

LogisticRegression:
accuracy: 0.6293503480278422
precision: 0.6274789248025936
recall: 0.6141981921571701
              precision    recall  f1-score   support

          -1       0.64      0.76      0.69      1373
           0       0.61      0.56      0.58      1160
           1       0.63      0.53      0.57       915

    accuracy                           0.63      3448
   macro avg       0.63      0.61      0.62      3448
weighted avg       0.63      0.63      0.63      3448

RandomForest:
accuracy: 0.6415313225058005
precision: 0.6461515174632755
recall: 0.6227136908643017
              precision    recall  f1-score   support

          -1       0.63      0.80      0.71      1373
           0       0.64      0.55      0.59      1160
           1       0.66      0.51      0.58       915

    accuracy                           0.64      3448
   macro avg       0.65      0.62      0.63      3448
weighted avg       0.64      0.64      0.64      3448



# vector size = 25


In [41]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=25,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 950764.11it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1579927.90it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1640383.86it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1583321.72it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1552991.71it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1930314.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2025753.08it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1960757.42it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1966094.27it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1621639.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1986915.24it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1433504.73it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2034597.17it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1671471.30it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2032021.59it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1917198.91it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1682293.15it/s

LogisticRegression:
accuracy: 0.5948375870069605
precision: 0.589717210193724
recall: 0.5777202159350797
              precision    recall  f1-score   support

          -1       0.61      0.73      0.67      1373
           0       0.58      0.54      0.56      1160
           1       0.58      0.47      0.52       915

    accuracy                           0.59      3448
   macro avg       0.59      0.58      0.58      3448
weighted avg       0.59      0.59      0.59      3448

RandomForest:
accuracy: 0.6209396751740139
precision: 0.6263826363497952
recall: 0.6024506790698594
              precision    recall  f1-score   support

          -1       0.61      0.77      0.68      1373
           0       0.62      0.56      0.58      1160
           1       0.65      0.49      0.56       915

    accuracy                           0.62      3448
   macro avg       0.63      0.60      0.61      3448
weighted avg       0.62      0.62      0.62      3448



# vector  size = 50

In [42]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=50,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1691075.82it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1567108.59it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1810701.21it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1748337.29it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1603959.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1980921.53it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1961183.30it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1819595.53it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1846905.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1986259.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1821247.13it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2092171.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1425133.24it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1892684.49it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2057312.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1886505.71it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2071709.74it/

LogisticRegression:
accuracy: 0.6087587006960556
precision: 0.6035299861321796
recall: 0.5912452121003683
              precision    recall  f1-score   support

          -1       0.63      0.75      0.68      1373
           0       0.60      0.55      0.57      1160
           1       0.59      0.48      0.53       915

    accuracy                           0.61      3448
   macro avg       0.60      0.59      0.59      3448
weighted avg       0.61      0.61      0.60      3448

RandomForest:
accuracy: 0.6313805104408353
precision: 0.6353577104008358
recall: 0.6108383159420652
              precision    recall  f1-score   support

          -1       0.62      0.79      0.70      1373
           0       0.64      0.56      0.59      1160
           1       0.65      0.48      0.55       915

    accuracy                           0.63      3448
   macro avg       0.64      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448



# vector size = 100

In [43]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=100,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1774537.01it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1269049.60it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1866857.14it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1873636.38it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1807032.16it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1773752.66it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1556103.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1915216.45it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2082940.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1839052.48it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2017999.82it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1617391.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2053160.81it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1774711.40it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2027458.84it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1452938.76it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2052752.38it/

LogisticRegression:
accuracy: 0.6125290023201856
precision: 0.6111973593631416
recall: 0.5975730885925148
              precision    recall  f1-score   support

          -1       0.62      0.74      0.67      1373
           0       0.60      0.55      0.57      1160
           1       0.61      0.51      0.56       915

    accuracy                           0.61      3448
   macro avg       0.61      0.60      0.60      3448
weighted avg       0.61      0.61      0.61      3448

RandomForest:
accuracy: 0.6273201856148491
precision: 0.6335717845541599
recall: 0.6086259322191375
              precision    recall  f1-score   support

          -1       0.61      0.78      0.69      1373
           0       0.64      0.54      0.59      1160
           1       0.65      0.50      0.56       915

    accuracy                           0.63      3448
   macro avg       0.63      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448



# vector size = 150

In [44]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1906421.31it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1980649.94it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1770405.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1672748.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1813155.54it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1780397.90it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1787800.80it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1773360.75it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1803783.24it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1711997.33it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1831498.57it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1579547.87it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1346951.28it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1863918.65it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1892734.08it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2032421.81it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1681470.79it/

LogisticRegression:
accuracy: 0.6203596287703016
precision: 0.6175282233111985
recall: 0.6038569545378668
              precision    recall  f1-score   support

          -1       0.63      0.75      0.69      1373
           0       0.60      0.56      0.58      1160
           1       0.62      0.50      0.55       915

    accuracy                           0.62      3448
   macro avg       0.62      0.60      0.61      3448
weighted avg       0.62      0.62      0.62      3448

RandomForest:
accuracy: 0.6305104408352669
precision: 0.6367282877365646
recall: 0.6101258487544085
              precision    recall  f1-score   support

          -1       0.62      0.80      0.70      1373
           0       0.63      0.54      0.58      1160
           1       0.66      0.49      0.56       915

    accuracy                           0.63      3448
   macro avg       0.64      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448



# vector size = 200

In [45]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1981301.87it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1807981.97it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1818084.20it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1760524.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1718800.35it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1996691.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1847471.89it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1971944.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2055205.38it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1824236.77it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1929798.92it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1885422.41it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1351866.89it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1741466.85it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2032307.44it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1452266.90it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1735817.36it/

LogisticRegression:
accuracy: 0.6258700696055685
precision: 0.6234586354462887
recall: 0.6105557355685504
              precision    recall  f1-score   support

          -1       0.64      0.75      0.69      1373
           0       0.62      0.56      0.59      1160
           1       0.62      0.52      0.56       915

    accuracy                           0.63      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.62      0.63      0.62      3448

RandomForest:
accuracy: 0.6302204176334106
precision: 0.6340341458802654
recall: 0.6108753901558647
              precision    recall  f1-score   support

          -1       0.62      0.79      0.70      1373
           0       0.64      0.54      0.58      1160
           1       0.64      0.50      0.56       915

    accuracy                           0.63      3448
   macro avg       0.63      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448



# en iyiler sq=0, alpha=0.1, window_size=7, vector_size=200

In [48]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="tweet", 
                         sg=0, 
                         negative=5,
                         alpha = 0.1,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1655459.13it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1604850.11it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1400188.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2069335.76it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1672438.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1719987.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2066139.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1672012.95it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1741214.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1941625.77it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1617608.41it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2029794.69it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1758295.69it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2036834.63it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1860126.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2005282.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1949906.96it/

LogisticRegression:
accuracy: 0.62122969837587
precision: 0.6158902966170939
recall: 0.6060673402766655
              precision    recall  f1-score   support

          -1       0.64      0.76      0.69      1373
           0       0.62      0.54      0.58      1160
           1       0.59      0.52      0.55       915

    accuracy                           0.62      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.62      0.62      0.62      3448

RandomForest:
accuracy: 0.6290603248259861
precision: 0.6409442046089743
recall: 0.6040871207083883
              precision    recall  f1-score   support

          -1       0.61      0.83      0.70      1373
           0       0.65      0.53      0.58      1160
           1       0.67      0.45      0.54       915

    accuracy                           0.63      3448
   macro avg       0.64      0.60      0.61      3448
weighted avg       0.64      0.63      0.62      3448

