In [1]:
# import libraries
from gensim.models.word2vec import Word2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import xgboost 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc="progress-bar")
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from collections import Counter
from nltk import ngrams

  from pandas import Panel


In [2]:
def classification_report(x_train, x_test, y_train, y_test):
  models = []
  models.append(('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')))
  models.append(('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)))

  for name, model in models:
      clf=model
      clf.fit(x_train, y_train)
      y_pred=clf.predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [3]:
def get_word_counts(data):
  words = data.tweet.to_string().split()
  return Counter(words)
  

In [4]:
# word2vec hyper parameters
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı


In [5]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [6]:
def train_word2vec(corpus, n_epoch, name_corpus, sg, vector_size, negative, window, min_count, alpha, min_alpha):
  cores = multiprocessing.cpu_count()
  model = Word2Vec(sg=sg, size=vector_size, negative=negative, window=window, min_count=min_count, workers=cores, alpha=alpha, min_alpha=min_alpha)
  model.build_vocab([x.words for x in tqdm(corpus)])

  for epoch in range(n_epoch):
    model.train(utils.shuffle([x.words for x in tqdm(corpus)]), total_examples=len(corpus), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

  model.save(f"/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/trained_embeddings/{name_corpus}_sg_{sg}_size_{vector_size}_window_{window}_min_count_{min_count}.model")
  return model

In [7]:
def get_mean_vector(model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in model.wv]
    if len(words) >= 1:
        return np.mean(model[words], axis=0)
    else:
        return np.zeros((1, model.vector_size))

In [8]:
def get_vectors(model, corpus):
  vectors = []
  for sentence in corpus:
      vec = get_mean_vector(model, sentence)
      vectors.append(vec)
  return vectors

In [9]:
def get_max_len_sentence(series):
  res = series.str.split().str.len().max()

  print(f"The maximum length in words are : {res}") 

In [10]:
data = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_movie_sentiment.csv")

In [12]:
data.dropna(inplace=True)
data.reset_index(drop=True,inplace=True)

x=data.comment
y=data.sentiment.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [13]:
corpus = labelize_tweets_ug(x, 'all')

In [14]:
corpus_train = pd.DataFrame(x_train)['comment'].apply(lambda x: x.split())
corpus_test = pd.DataFrame(x_test)['comment'].apply(lambda x: x.split())

In [22]:
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı

# sg=0 epoch=25 negative=5 için (sg karar vermek için ilk deneme ikisi için yapılacak)
# alpha 0.5 0.01 0.05 0.1 her biri window = 3, vector_size = 150 kullanılacak 
# window 3 5 7 her biri alpha = 0.05, vector_size = 150 kullanılacak 
# vector_size 25 50 150 200 her biri  için window = 3, alpha = 0.05

In [16]:
model_1 = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.5,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 2078368.23it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2108344.88it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2001716.75it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1991567.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1890734.22it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2058791.74it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2024349.63it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1822952.17it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2000237.42it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2112111.02it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2031424.24it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2134809.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1983937.52it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1960351.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2129193.79it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2033904.89it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2122241.82it/

In [17]:
vectors_train = get_vectors(model=model_1,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model_1,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [18]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.662953995157385
precision: 0.6617183760481924
recall: 0.6597227901092813
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      7761
           1       0.67      0.71      0.69      8759

    accuracy                           0.66     16520
   macro avg       0.66      0.66      0.66     16520
weighted avg       0.66      0.66      0.66     16520

RandomForest:
accuracy: 0.6245157384987894
precision: 0.6226658114985069
recall: 0.6215070937251884
              precision    recall  f1-score   support

           0       0.61      0.57      0.59      7761
           1       0.64      0.67      0.65      8759

    accuracy                           0.62     16520
   macro avg       0.62      0.62      0.62     16520
weighted avg       0.62      0.62      0.62     16520



In [19]:
# deneme 2
model_2 = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=1, 
                         negative=5,
                         alpha = 0.5,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 1857403.92it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1958511.63it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1950045.15it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2080877.43it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2095021.42it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1966739.27it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2090281.24it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1976816.94it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2097266.26it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1906707.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1909176.75it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1944103.13it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2037948.89it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2094286.87it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1949913.45it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2049765.53it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1814873.08it/

In [20]:
vectors_train = get_vectors(model=model_2,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model_2,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [21]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.6463680387409201
precision: 0.6448700988464475
recall: 0.643046718276733
              precision    recall  f1-score   support

           0       0.63      0.59      0.61      7761
           1       0.66      0.70      0.68      8759

    accuracy                           0.65     16520
   macro avg       0.64      0.64      0.64     16520
weighted avg       0.65      0.65      0.65     16520

RandomForest:
accuracy: 0.6059927360774818
precision: 0.6038140832946602
recall: 0.6015655824269046
              precision    recall  f1-score   support

           0       0.59      0.53      0.56      7761
           1       0.62      0.67      0.64      8759

    accuracy                           0.61     16520
   macro avg       0.60      0.60      0.60     16520
weighted avg       0.60      0.61      0.60     16520



# yukarıdaki sonuçlara göre sg=0 -> cbow yöntemi için daha başarılı oldu onunla diğer sonuçlar alınacak

# alpha(learning rate) = 0.01

In [23]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.01,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 2052449.27it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2145850.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1977279.52it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2028771.41it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1939575.30it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1903470.35it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2136138.77it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2055615.61it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1991029.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2126253.55it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1973944.75it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1926620.93it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1885034.81it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2075081.74it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1989725.88it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2011187.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2046073.24it/

In [24]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [25]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7291767554479419
precision: 0.7286878348709327
recall: 0.726584509339476
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.7110774818401937
precision: 0.7106883425801436
recall: 0.70793813358819
              precision    recall  f1-score   support

           0       0.71      0.66      0.68      7761
           1       0.71      0.76      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# learning rate 0.05

In [26]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 1883313.25it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2158767.22it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2066799.83it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1986542.67it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1953035.31it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2009332.79it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1849076.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2050493.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2015493.26it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1967722.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1969131.51it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2050760.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1960007.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1935792.95it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1905784.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1996088.51it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2060322.22it/

In [27]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [28]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7398305084745763
precision: 0.7391997939211934
recall: 0.7376810163445704
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.722457627118644
precision: 0.7218741567567772
recall: 0.7198517801168571
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# learning rate 0.1

In [29]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.1,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 1970587.59it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1819515.04it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2088227.52it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1903930.63it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1999129.36it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2034478.21it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1976027.66it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2000872.80it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1999971.84it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2024385.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2082728.88it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2113657.35it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1945828.38it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2080739.96it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2081002.43it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1984403.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1972506.17it/

In [30]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [31]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7328087167070217
precision: 0.7319889537068984
recall: 0.7308830827772723
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      7761
           1       0.74      0.76      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.7130750605326877
precision: 0.7128433291807558
recall: 0.7097998886384816
              precision    recall  f1-score   support

           0       0.71      0.66      0.68      7761
           1       0.71      0.76      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# window = 3

In [32]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 1923497.43it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2039208.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2013256.17it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1923422.67it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1945730.02it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1988024.62it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1988172.93it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1934096.25it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2075032.02it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2009484.30it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2087610.93it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1789810.67it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2044962.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1993217.43it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1910081.99it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2041455.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2069528.39it/

In [33]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  """
  
  


In [34]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7395883777239709
precision: 0.7389859341046865
recall: 0.7373866148668349
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7204600484261501
precision: 0.7202135129479756
recall: 0.7173734192433121
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      7761
           1       0.72      0.77      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# window size = 5

In [35]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 5,
                         vector_size=150,
                         min_count=2)

100%|██████████| 82598/82598 [00:00<00:00, 1902069.97it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1991178.25it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2012753.22it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1865414.89it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2138763.08it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2001265.79it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1943764.99it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2043357.94it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1926331.68it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1889105.25it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2194554.31it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1998126.24it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2128984.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1994043.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1888600.63it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2012566.13it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1957493.54it/

In [36]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

  """
  
  


LogisticRegression:
accuracy: 0.7414043583535109
precision: 0.7408347136112996
recall: 0.7391872256737742
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      7761
           1       0.75      0.78      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7269975786924939
precision: 0.7265949550601012
recall: 0.7242431989514817
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.72     16520
weighted avg       0.73      0.73      0.73     16520



# window size = 7

In [37]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 2132339.03it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2054092.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2027655.26it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1901746.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1794074.26it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2086743.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2057422.36it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2144137.26it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2089827.31it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2118025.05it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1928165.02it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2089877.73it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2114883.14it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1915997.69it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2045300.16it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2076736.13it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1946626.52it/

LogisticRegression:
accuracy: 0.7463075060532688
precision: 0.7456951923186192
recall: 0.7442588144542372
              precision    recall  f1-score   support

           0       0.74      0.71      0.72      7761
           1       0.75      0.78      0.76      8759

    accuracy                           0.75     16520
   macro avg       0.75      0.74      0.74     16520
weighted avg       0.75      0.75      0.75     16520

RandomForest:
accuracy: 0.7291767554479419
precision: 0.7287146313762283
recall: 0.7265478066119015
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520



# vector size = 25


In [38]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=25,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1933243.61it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2031555.28it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1917705.24it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1921896.83it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2035566.01it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1929733.09it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2071235.85it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2157705.04it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2139066.81it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2019652.68it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2048626.16it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2052424.95it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1950484.31it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1927457.00it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2032174.95it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1995686.06it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2064902.74it/

LogisticRegression:
accuracy: 0.7207021791767555
precision: 0.7198313647010177
recall: 0.7185927294559278
              precision    recall  f1-score   support

           0       0.71      0.68      0.70      7761
           1       0.73      0.75      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520

RandomForest:
accuracy: 0.7177966101694915
precision: 0.7172028932941477
recall: 0.7150966144506744
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector  size = 50

In [39]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=50,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 2098002.91it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2108781.21it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1995801.03it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1869471.56it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2012718.14it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1961827.74it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1932531.86it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2033510.92it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2002318.36it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1971686.67it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2027738.33it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2088290.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2036990.28it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1979788.00it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2165148.75it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1888858.05it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1986895.85it/

LogisticRegression:
accuracy: 0.7325665859564164
precision: 0.7317894363189625
recall: 0.7305372974809321
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      7761
           1       0.74      0.76      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.7275423728813559
precision: 0.7270774169291097
recall: 0.7248817455034635
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.73     16520
weighted avg       0.73      0.73      0.73     16520



# vector size = 100

In [40]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=100,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1840754.93it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1945664.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1987363.17it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2042550.77it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2017805.85it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2028712.01it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1895243.97it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2050857.61it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2079054.23it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1960972.68it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2200157.00it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2144190.34it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2092086.29it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1916400.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1901819.37it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2058180.19it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2031078.86it/

LogisticRegression:
accuracy: 0.7372881355932204
precision: 0.7367068604923366
recall: 0.735004541649939
              precision    recall  f1-score   support

           0       0.73      0.70      0.71      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7230629539951574
precision: 0.7225172085068203
recall: 0.7204152809915956
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.76      0.75      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector size = 150

In [41]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1894083.45it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2008820.14it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2073479.62it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2028937.76it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2009869.01it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2040349.37it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2165365.28it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1988138.70it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2089121.59it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2068008.89it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1829005.74it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2035398.58it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1956520.71it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2030281.37it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2053678.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1992002.59it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1992117.13it/

LogisticRegression:
accuracy: 0.7415859564164649
precision: 0.7410347734077674
recall: 0.7393437970088204
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      7761
           1       0.75      0.78      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7216707021791767
precision: 0.7212199924788967
recall: 0.7188601077230203
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      7761
           1       0.72      0.77      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector size = 200

In [42]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1925742.76it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1963128.40it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1833118.80it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1863418.20it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2051938.70it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1988892.01it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2122189.82it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2125379.58it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1962405.60it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2035326.83it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1890692.95it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2022033.83it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2015399.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2154109.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1934290.62it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2074448.05it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1902143.07it/

LogisticRegression:
accuracy: 0.7407990314769975
precision: 0.7403059721672947
recall: 0.7384548922521925
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      7761
           1       0.74      0.78      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7200968523002421
precision: 0.7196510446690256
recall: 0.7172437902110929
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# en iyiler sq=0, alpha=0.1, window_size=7, vector_size=200

In [43]:
model = train_word2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         sg=0, 
                         negative=5,
                         alpha = 0.1,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1911821.21it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1868654.78it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2019876.41it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1927939.69it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1993768.04it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1968348.36it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2051732.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2103557.05it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2057520.11it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2068379.30it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2086567.34it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1886687.59it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1998633.44it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2011689.65it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1826681.58it/s]
100%|██████████| 82598/82598 [00:00<00:00, 2008284.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1978917.22it/

LogisticRegression:
accuracy: 0.7354721549636803
precision: 0.7351110127728779
recall: 0.7328442441127685
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      7761
           1       0.74      0.78      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.73      0.73     16520
weighted avg       0.74      0.74      0.73     16520

RandomForest:
accuracy: 0.7096246973365617
precision: 0.70977045191627
recall: 0.7058854434466941
              precision    recall  f1-score   support

           0       0.71      0.64      0.68      7761
           1       0.71      0.77      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520

