In [3]:
# import libraries
from gensim.models.fasttext import FastText
import gensim
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import xgboost 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc="progress-bar")
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from collections import Counter
from nltk import ngrams

  from pandas import Panel


In [4]:
def classification_report(x_train, x_test, y_train, y_test):
  """Fit the benchmark classifiers and print their scores on the test split.

  Two models are evaluated in turn: a multinomial logistic regression
  (newton-cg solver) and a random forest with 100 trees. For each one the
  accuracy, macro-averaged precision, macro-averaged recall and
  scikit-learn's per-class report are printed.

  Args:
    x_train: feature matrix the models are fitted on.
    x_test: feature matrix the models are evaluated on.
    y_train: labels belonging to ``x_train``.
    y_test: labels belonging to ``x_test``.

  Returns:
    None. All results go to stdout.
  """
  models = [
      ('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')),
      ('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)),
  ]

  for name, estimator in models:
      # fit() hands back the fitted estimator, so prediction can be chained.
      y_pred = estimator.fit(x_train, y_train).predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      # The sklearn function of the same name is reached through `metrics`,
      # so it is not shadowed by this helper.
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [5]:
def get_word_counts(data):
  """Count whitespace-separated tokens across the ``tweet`` column.

  Args:
    data: object exposing a ``tweet`` pandas Series of texts
      (e.g. a DataFrame with a ``tweet`` column).

  Returns:
    collections.Counter mapping each token to its number of occurrences.
    Missing values are skipped.
  """
  counts = Counter()
  # Iterate over the values themselves rather than the Series' printed form,
  # so row labels and "NaN" placeholders never end up counted as words.
  for text in data.tweet.dropna():
    counts.update(str(text).split())
  return counts
  

In [4]:
# fastText hyper parameters
# sg = 0 cbow, 1 skip-gram
# min_count = minimum number of times a word must occur in the corpus; words occurring fewer times than this are assumed to carry no meaning (default=5)
# vector_size = number of dimensions of the vectors that represent the words
# window = maximum distance between the current and the predicted word
# loss = "ns" "hs" "softmax"
# negative = if greater than zero, negative sampling is used; should be between 5 and 20
# alpha = initial learning rate
# min_n: min length of char ngrams (Default 3)
# max_n: max length of char ngrams (Default 6)
# epoch = number of iterations


In [6]:
def labelize_tweets_ug(tweets,label):
    """Wrap every text in ``tweets`` as a gensim ``TaggedDocument``.

    Each text is split on whitespace to form the document's words and is
    given a single tag ``"<label>_<index>"``, where ``<index>`` is the
    text's entry in ``tweets.index``.

    Args:
        tweets: iterable of strings that also exposes ``.index``
            (e.g. a pandas Series).
        label: string prefix used for every tag.

    Returns:
        list of ``TaggedDocument`` objects in the order of ``tweets``.
    """
    return [
        TaggedDocument(text.split(), [label + '_%s' % idx])
        for idx, text in zip(tweets.index, tweets)
    ]

In [7]:
def train_fasText(corpus, n_epoch, name_corpus, sg, vector_size, negative, window, min_count, alpha, min_n, max_n):
  """Train a gensim FastText model on ``corpus``, save it, and return it.

  The vocabulary is built once from the ``.words`` of every document. The
  model is then trained for ``n_epoch`` single-epoch passes over a freshly
  shuffled copy of the corpus; after each pass the learning rate is lowered
  by 0.002 and ``min_alpha`` is pinned to the same value.

  Args:
    corpus: sized collection of documents exposing ``.words``.
    n_epoch: number of single-epoch training passes.
    name_corpus: label embedded in the saved model's file name.
    sg: forwarded to ``FastText(sg=...)``.
    vector_size: forwarded to ``FastText(size=...)``.
    negative: forwarded to ``FastText(negative=...)``.
    window: forwarded to ``FastText(window=...)``.
    min_count: forwarded to ``FastText(min_count=...)``.
    alpha: forwarded to ``FastText(alpha=...)``.
    min_n: forwarded to ``FastText(min_n=...)``.
    max_n: forwarded to ``FastText(max_n=...)``.

  Returns:
    The trained FastText model.
  """
  model = FastText(
      sg=sg,
      size=vector_size,
      negative=negative,
      window=window,
      min_count=min_count,
      workers=multiprocessing.cpu_count(),  # one worker per available core
      alpha=alpha,
      min_n=min_n,
      max_n=max_n,
  )
  model.build_vocab([doc.words for doc in tqdm(corpus)])

  for _ in range(n_epoch):
    # Re-extract and shuffle the sentences on every pass.
    shuffled_sentences = utils.shuffle([doc.words for doc in tqdm(corpus)])
    model.train(shuffled_sentences, total_examples=len(corpus), epochs=1)
    # Manual learning-rate schedule: step down, then hold it for the next pass.
    model.alpha -= 0.002
    model.min_alpha = model.alpha

  # NOTE(review): the file name omits min_n, max_n, negative and alpha, so
  # runs that differ only in those settings write to the same file.
  model.save(f"/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/trained_embeddings/fastText_{name_corpus}_sg_{sg}_size_{vector_size}_window_{window}_min_count_{min_count}.model")
  return model

In [8]:
def get_mean_vector(word2vec_model, words):
    """Average the embedding vectors of ``words``.

    Args:
        word2vec_model: trained model exposing keyed vectors as ``.wv``
            (supporting ``in`` and list indexing) and a ``vector_size``
            attribute.
        words: iterable of tokens.

    Returns:
        1-D array of length ``vector_size``: the mean vector of the tokens
        that ``.wv`` reports as known, or all zeros when none is known.
    """
    keyed_vectors = word2vec_model.wv
    # remove out-of-vocabulary words
    known_words = [word for word in words if word in keyed_vectors]
    if not known_words:
        # Same 1-D shape as the mean below, so every sentence vector stacks
        # into a regular 2-D matrix.
        return np.zeros(word2vec_model.vector_size)
    # Look the vectors up on the same object the membership test used,
    # rather than indexing the model itself.
    return np.mean(keyed_vectors[known_words], axis=0)

In [9]:
def get_vectors(model, corpus):
  """Compute one mean embedding per sentence.

  Args:
    model: embedding model passed through to ``get_mean_vector``.
    corpus: iterable of sentences, each an iterable of tokens.

  Returns:
    list with the ``get_mean_vector`` result for every sentence, in order.
  """
  return [get_mean_vector(model, sentence) for sentence in corpus]

In [10]:
def get_max_len_sentence(series):
  """Report the largest whitespace-token count among the texts in ``series``.

  Args:
    series: pandas Series of strings.

  Returns:
    The maximum number of whitespace-separated tokens in any entry. The
    value is also printed.
  """
  res = series.str.split().str.len().max()

  print(f"The maximum length in words are : {res}")
  # Hand the value back as well, so callers can use it instead of only
  # reading it from stdout.
  return res

In [11]:
# Load the preprocessed movie-sentiment CSV into a DataFrame.
# NOTE(review): absolute, environment-specific path (looks like a Colab
# Google Drive mount) — confirm it exists before running elsewhere.
data = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_movie_sentiment.csv")

In [12]:
# Drop rows with missing values and renumber the index from 0.
data = data.dropna().reset_index(drop=True)

# Texts (as a Series) and their sentiment labels (as an array).
x = data.comment
y = data.sentiment.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Wrap every comment in `x` (train and test rows alike) as a TaggedDocument
# tagged 'all_<index>'; this full corpus is what train_fasText receives below.
corpus = labelize_tweets_ug(x, 'all')

In [14]:
# Whitespace-tokenise each split; the result keeps the original row index and
# holds one list of tokens per comment.
corpus_train = x_train.apply(str.split)
corpus_test = x_test.apply(str.split)

# n_epoch = 25, sg = 1, negative = 5, alpha = 0.065
# varied parameters:
# min_n, max_n = (2,4)  (3,6) (4,8) (8,26) window= 3 vector_size= 150
# window 3 5 7 min,max = (4,8)
# vector size = 25 50 75 100 window = 3 min,max = (4,8)

# min_n, max_n = (2,4)

In [15]:
# Experiment: char n-gram range (min_n, max_n) = (2, 4), window 3, 150 dimensions.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=2,
    max_n=4,
    window=3,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1666864.52it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1297644.07it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1747390.43it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1630058.02it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1464303.89it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1685828.47it/s]
  """


LogisticRegression:
accuracy: 0.7378329297820824
precision: 0.7372188176313277
recall: 0.735613726019861
              precision    recall  f1-score   support

           0       0.73      0.70      0.71      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7204600484261501
precision: 0.7200538661792832
recall: 0.7175642734267
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      7761
           1       0.72      0.77      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# min_n, max_n = (3,6)

In [16]:
# Experiment: char n-gram range (min_n, max_n) = (3, 6), window 3, 150 dimensions.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=3,
    max_n=6,
    window=3,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1897340.65it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1689569.33it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1661484.52it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1722173.95it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1797126.82it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1654565.14it/s]
  """


LogisticRegression:
accuracy: 0.7403147699757869
precision: 0.739612000793217
recall: 0.7383065220276164
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7254842615012107
precision: 0.7249347932906489
recall: 0.7228968414015122
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.73      0.73      0.72     16520



# min_n, max_n = (4,8)

In [17]:
# Experiment: char n-gram range (min_n, max_n) = (4, 8), window 3, 150 dimensions.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=3,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1786110.42it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1871339.72it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1724351.19it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1828330.06it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1730561.58it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1824036.74it/s]
  """


LogisticRegression:
accuracy: 0.7423728813559322
precision: 0.7418087792501176
recall: 0.740166636855814
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      7761
           1       0.75      0.78      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7265133171912833
precision: 0.7261210980559092
recall: 0.7237351419966747
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.72     16520
weighted avg       0.73      0.73      0.73     16520



# window 3

In [15]:
# Experiment: window 3, 150 dimensions, char n-grams from 4 to 18 characters.
# NOTE(review): the plan comment earlier in the notebook lists
# (min_n, max_n) = (4, 8) for the window sweep, but this cell uses max_n = 18.
# The value is left as is so the code matches the recorded output — confirm
# which setting was intended.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=18,
    window=3,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1552322.48it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1759870.78it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1813049.49it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1728351.39it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1778728.25it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1692507.84it/s]
  """


LogisticRegression:
accuracy: 0.7413438256658595
precision: 0.7406840341832425
recall: 0.7392769524420473
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7270581113801453
precision: 0.7266140032720497
recall: 0.7243590074576265
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.72     16520
weighted avg       0.73      0.73      0.73     16520



# window 5

In [16]:
# Experiment: window 5, char n-gram range (4, 8), 150 dimensions.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=5,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1783214.46it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1758075.69it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1657319.62it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1880930.81it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1644543.23it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1818187.71it/s]
  """


LogisticRegression:
accuracy: 0.7454600484261501
precision: 0.7447492378612544
recall: 0.7435770851941211
              precision    recall  f1-score   support

           0       0.74      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.75     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.75      0.75      0.75     16520

RandomForest:
accuracy: 0.7285714285714285
precision: 0.7284539730274076
recall: 0.725521851369723
              precision    recall  f1-score   support

           0       0.73      0.68      0.70      7761
           1       0.73      0.78      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520



# window 7

In [17]:
# Experiment: window 7, char n-gram range (4, 8), 150 dimensions.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=7,
    vector_size=150,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1671561.36it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1758807.57it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1660481.13it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1668020.19it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1705463.93it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1902634.05it/s]
  """


LogisticRegression:
accuracy: 0.7430387409200968
precision: 0.7423179478227632
recall: 0.7411248869662642
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.724818401937046
precision: 0.7246048505613947
recall: 0.7217917803807636
              precision    recall  f1-score   support

           0       0.72      0.67      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector size 25

In [18]:
# Experiment: vector size 25, window 3, char n-gram range (4, 8).
# NOTE(review): this run uses window = 3 while the other vector-size runs in
# this notebook use window = 5, so the sweep does not hold the window fixed —
# confirm which was intended.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=3,
    vector_size=25,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1680113.68it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1763157.84it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1917737.08it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1858360.41it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1756382.21it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1715496.67it/s]
  """


LogisticRegression:
accuracy: 0.7226392251815981
precision: 0.72167977048448
recall: 0.7208525141861191
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      7761
           1       0.73      0.75      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520

RandomForest:
accuracy: 0.7220338983050848
precision: 0.7214620956985356
recall: 0.7194008073040752
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      7761
           1       0.73      0.76      0.74      8759

    accuracy                           0.72     16520
   macro avg       0.72      0.72      0.72     16520
weighted avg       0.72      0.72      0.72     16520



# vector size 50

In [19]:
# Experiment: vector size 50, window 5, char n-gram range (4, 8).
# NOTE(review): the plan comment earlier in the notebook lists window = 3 for
# the vector-size sweep, but this cell uses window = 5. The value is left as
# is so the code matches the recorded output — confirm which was intended.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=5,
    vector_size=50,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1624988.84it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1784307.38it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1796567.64it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1840148.73it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1752614.81it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1747073.20it/s]
  """


LogisticRegression:
accuracy: 0.7402542372881356
precision: 0.7395681840561432
recall: 0.7382127351580164
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7348062953995157
precision: 0.7344568036089109
recall: 0.7321502536408554
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      7761
           1       0.74      0.78      0.76      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520



# vector size 75

In [20]:
# Experiment: vector size 75, window 5, char n-gram range (4, 8).
# NOTE(review): the plan comment earlier in the notebook lists window = 3 for
# the vector-size sweep, but this cell uses window = 5. The value is left as
# is so the code matches the recorded output — confirm which was intended.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=5,
    vector_size=75,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1538979.09it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1766466.22it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1772957.03it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1625156.55it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1883671.65it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1712088.57it/s]
  """


LogisticRegression:
accuracy: 0.7409806295399516
precision: 0.7403053368390572
recall: 0.7389344475898951
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      7761
           1       0.75      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7317191283292979
precision: 0.7314514697611092
recall: 0.7289012973038764
              precision    recall  f1-score   support

           0       0.73      0.68      0.70      7761
           1       0.73      0.78      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.73      0.73     16520
weighted avg       0.73      0.73      0.73     16520



# vector size 100

In [21]:
# Experiment: vector size 100, window 5, char n-gram range (4, 8).
# NOTE(review): the plan comment earlier in the notebook lists window = 3 for
# the vector-size sweep, but this cell uses window = 5. The value is left as
# is so the code matches the recorded output — confirm which was intended.
model = train_fasText(
    corpus=corpus,
    n_epoch=5,
    name_corpus="tweet",
    sg=1,
    negative=5,
    alpha=0.065,
    min_n=4,
    max_n=8,
    window=5,
    vector_size=100,
    min_count=2,
)

# One mean word vector per tokenised comment.
vectors_train = get_vectors(model=model, corpus=corpus_train)
vectors_test = get_vectors(model=model, corpus=corpus_test)

# np.vstack takes the list of per-sentence vectors directly and accepts a mix
# of (d,) and (1, d) rows, which an intermediate np.array() call may reject.
X_train = np.vstack(vectors_train)
X_test = np.vstack(vectors_test)

classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 82598/82598 [00:00<00:00, 1828011.70it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1709638.38it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1766646.38it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1667795.35it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1865806.70it/s]
100%|██████████| 82598/82598 [00:00<00:00, 1845147.06it/s]
  """


LogisticRegression:
accuracy: 0.7378329297820824
precision: 0.737116661668664
recall: 0.735797239657734
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      7761
           1       0.74      0.77      0.76      8759

    accuracy                           0.74     16520
   macro avg       0.74      0.74      0.74     16520
weighted avg       0.74      0.74      0.74     16520

RandomForest:
accuracy: 0.7268765133171913
precision: 0.7266567807702472
recall: 0.7239014737564686
              precision    recall  f1-score   support

           0       0.72      0.67      0.70      7761
           1       0.73      0.77      0.75      8759

    accuracy                           0.73     16520
   macro avg       0.73      0.72      0.72     16520
weighted avg       0.73      0.73      0.73     16520

