In [None]:
# import libraries
from gensim.models import Doc2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import xgboost 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc="progress-bar")
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from collections import Counter
from nltk import ngrams

  from pandas import Panel


In [None]:
def classification_report(x_train, x_test, y_train, y_test):
  models = []
  models.append(('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')))
  models.append(('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)))

  for name, model in models:
      clf=model
      clf.fit(x_train, y_train)
      y_pred=clf.predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [None]:
def get_word_counts(data):
  words = data.tweet.to_string().split()
  return Counter(words)
  

In [None]:
# word2vec hyper parameters
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı


In [None]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [None]:
def train_doc2vec(corpus, n_epoch, name_corpus, vector_size, negative, window, min_count, alpha, min_alpha):
  cores = multiprocessing.cpu_count()
  model = Doc2Vec(size=vector_size, negative=negative, window=window, min_count=min_count, workers=cores, alpha=alpha, min_alpha=min_alpha)
  model.build_vocab(corpus)

  for epoch in range(n_epoch):
    model.train(utils.shuffle(corpus), total_examples=len(corpus), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

  model.save(f"/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/trained_embeddings/Doc2Vec_{name_corpus}_size_{vector_size}_window_{window}_min_count_{min_count}.model")
  return model

In [None]:
def get_mean_vector(model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in model.wv]
    if len(words) >= 1:
        return np.mean(model[words], axis=0)
    else:
        return np.zeros((1, model.vector_size))

In [None]:
def get_vectors(model, corpus):
  vectors = []
  for sentence in corpus:
      vec = get_mean_vector(model, sentence)
      vectors.append(vec)
  return vectors

In [None]:
def get_max_len_sentence(series):
  res = series.str.split().str.len().max()

  print(f"The maximum length in words are : {res}") 

In [None]:
data = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_movie_sentiment.csv")

In [None]:
data.dropna(inplace=True)
data.reset_index(drop=True,inplace=True)

x=data.comment
y=data.sentiment.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
corpus = labelize_tweets_ug(x, 'all')

In [None]:
corpus_train = pd.DataFrame(x_train)['comment'].apply(lambda x: x.split())
corpus_test = pd.DataFrame(x_test)['comment'].apply(lambda x: x.split())

In [None]:
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# sg = 0 cbow, 1 skip-gram
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_alpha = eğitim aşamasında learning rate linear olarak bunu düşer
# epoch = iterasyon sayısı

# sg=0 epoch=25 negative=5 için (sg karar vermek için ilk deneme ikisi için yapılacak)
# alpha 0.5 0.01 0.05 0.1 her biri window = 3, vector_size = 150 kullanılacak 
# window 3 5 7 her biri alpha = 0.05, vector_size = 150 kullanılacak 
# vector_size 25 50 150 200 her biri  için window = 3, alpha = 0.05

In [None]:
# deneme 2
model_2 = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.5,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model_2,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model_2,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  
  


In [None]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.6858958837772398
precision: 0.6848244020587344
recall: 0.6832001568611321
              precision    recall  f1-score   support

           0       0.68      0.64      0.66      7761
           1       0.69      0.73      0.71      8759

    accuracy                           0.69     16520
   macro avg       0.68      0.68      0.68     16520
weighted avg       0.69      0.69      0.69     16520

RandomForest:
accuracy: 0.6478813559322034
precision: 0.6463957991434175
recall: 0.6446206327376649
              precision    recall  f1-score   support

           0       0.63      0.59      0.61      7761
           1       0.66      0.70      0.68      8759

    accuracy                           0.65     16520
   macro avg       0.65      0.64      0.64     16520
weighted avg       0.65      0.65      0.65     16520



# alpha(learning rate) = 0.01

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.01,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  
  


In [None]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7319007263922518
precision: 0.7314123750124597
recall: 0.7293661715505493
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      7761
           1       0.74      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.710411622276029
precision: 0.7102101939960562
recall: 0.7070386078418591
              precision    recall  f1-score   support

           0       0.71      0.65      0.68      7761
           1       0.71      0.76      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# learning rate 0.05

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  
  


In [None]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.737590799031477
precision: 0.7369069060180697
recall: 0.7354881570889685
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7128934624697336
precision: 0.7121486782061617
recall: 0.7103186474908081
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.75      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# learning rate 0.1

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.1,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  
  


In [None]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7199757869249395
precision: 0.7190597934978965
recall: 0.717966444115743
              precision    recall  f1-score   support

           0       0.71      0.68      0.70      7761
           1       0.73      0.75      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520

RandomForest:
accuracy: 0.686319612590799
precision: 0.6855399086109003
recall: 0.6830418643961756
              precision    recall  f1-score   support

           0       0.68      0.63      0.65      7761
           1       0.69      0.74      0.71      8759

    accuracy                           0.69     16520
   macro avg       0.69      0.68      0.68     16520
weighted avg       0.69      0.69      0.69     16520



# window = 3

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

  
  


In [None]:
classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

LogisticRegression:
accuracy: 0.7348062953995157
precision: 0.7340274702293081
recall: 0.732832924373743
              precision    recall  f1-score   support

           0       0.73      0.70      0.71      7761
           1       0.74      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.7142857142857143
precision: 0.7135902722973366
recall: 0.7116609449394508
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# window size = 5

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 5,
                         vector_size=150,
                         min_count=2)



In [None]:
vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

  
  


LogisticRegression:
accuracy: 0.738680387409201
precision: 0.7379719828611503
recall: 0.7366478014646933
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7157990314769975
precision: 0.715250309842179
recall: 0.7129852808528755
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.71      0.71     16520
weighted avg       0.72      0.72      0.72     16520



# window size = 7

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7428571428571429
precision: 0.7421934793808618
recall: 0.7408288452664344
              precision    recall  f1-score   support

           0       0.74      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7187046004842615
precision: 0.7181256061203951
recall: 0.716004260399659
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector size = 25


In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=25,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7179782082324455
precision: 0.7170833428016317
recall: 0.715855110517944
              precision    recall  f1-score   support

           0       0.71      0.68      0.69      7761
           1       0.73      0.75      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520

RandomForest:
accuracy: 0.7110774818401937
precision: 0.7104708641702606
recall: 0.7082317554087869
              precision    recall  f1-score   support

           0       0.71      0.66      0.68      7761
           1       0.72      0.76      0.73      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# vector  size = 50

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=50,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7265133171912833
precision: 0.7257033627517911
recall: 0.7244104721840472
              precision    recall  f1-score   support

           0       0.72      0.69      0.70      7761
           1       0.73      0.76      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.72     16520
weighted avg       0.73      0.73      0.73     16520

RandomForest:
accuracy: 0.7139830508474576
precision: 0.7135425240389827
recall: 0.7109791347715183
              precision    recall  f1-score   support

           0       0.71      0.66      0.68      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# vector size = 100

In [None]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=100,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7356537530266344
precision: 0.7351289009497965
recall: 0.733250393995322
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.73      0.73     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.715496368038741
precision: 0.7149458772815178
recall: 0.712677838506204
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      7761
           1       0.72      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.72      0.72      0.71     16520



# vector size = 150

In [39]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=150,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7373486682808716
precision: 0.7367473716800657
recall: 0.7350983285195389
              precision    recall  f1-score   support

           0       0.73      0.70      0.71      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7074455205811138
precision: 0.7067100134257664
recall: 0.7047112797955721
              precision    recall  f1-score   support

           0       0.70      0.66      0.68      7761
           1       0.71      0.75      0.73      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.70      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# vector size = 200

In [40]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 3,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.738498789346247
precision: 0.7378894344382467
recall: 0.7362856948552294
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7113801452784504
precision: 0.7105833379595989
recall: 0.7088621817581148
              precision    recall  f1-score   support

           0       0.70      0.67      0.68      7761
           1       0.72      0.75      0.73      8759

    accuracy                           0.71     16520
   macro avg       0.71      0.71      0.71     16520
weighted avg       0.71      0.71      0.71     16520



# en iyiler sq=0, alpha=0.1, window_size=7, vector_size=200

In [41]:
model = train_doc2vec(corpus=corpus, 
                         n_epoch=25, 
                         name_corpus="movie", 
                         negative=5,
                         alpha = 0.05,
                         min_alpha = 0.065,
                         window = 7,
                         vector_size=200,
                         min_count=2)


vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)



LogisticRegression:
accuracy: 0.7424334140435835
precision: 0.7417718086941897
recall: 0.7403925535446825
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7236682808716707
precision: 0.7230665082770205
recall: 0.7211182522311176
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.76      0.75      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520

