In [2]:
# import libraries
from gensim.models.fasttext import FastText
import gensim
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import numpy as np
import xgboost 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas(desc="progress-bar")
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from collections import Counter
from nltk import ngrams

  from pandas import Panel


In [3]:
def classification_report(x_train, x_test, y_train, y_test):
  models = []
  models.append(('LogisticRegression', linear_model.LogisticRegression(solver='newton-cg',multi_class='multinomial')))
  models.append(('RandomForest', ensemble.RandomForestClassifier(n_estimators=100)))

  for name, model in models:
      clf=model
      clf.fit(x_train, y_train)
      y_pred=clf.predict(x_test)
      print(f"{name}:")
      print(f"accuracy: {metrics.accuracy_score(y_pred=y_pred, y_true=y_test)}")
      print(f"precision: {metrics.precision_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"recall: {metrics.recall_score(y_pred=y_pred, y_true=y_test, average='macro')}")
      print(f"{metrics.classification_report(y_pred=y_pred, y_true=y_test)}")

In [4]:
def get_word_counts(data):
  words = data.tweet.to_string().split()
  return Counter(words)
  

In [5]:
# fastText hyper parameters
# sg = 0 cbow, 1 skip-gram
# min_count = corpusta kelimenin en az bulunma sayısı eğer kelime bu kadardan az geçiyorsa anlam ifade etmediği varsayılır(default=5)
# vector_size = kelimelerin ifade edileceği vektörün boyut sayısı
# window = current ve predicted word arasındaki maksimum mesafe
# loss = "ns" "hs" "softmax"
# negative = eğer sıfırdan büyük olursa negative sampling kullanılır 5-20 arasında olmalı
# alpha = başlangıç learning rate
# min_n: char ngram minimum uzunluğu default:3
# max_n: max length of char ngrams (Default 6)
# epoch = iterasyon sayısı


In [6]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [7]:
def train_fasText(corpus, n_epoch, name_corpus, sg, vector_size, negative, window, min_count, alpha, min_n, max_n):
  cores = multiprocessing.cpu_count()
  model = FastText(sg=sg, size=vector_size, negative=negative, window=window, min_count=min_count, workers=cores, alpha=alpha, min_n=min_n, max_n=max_n)
  model.build_vocab([x.words for x in tqdm(corpus)])

  for epoch in range(n_epoch):
    model.train(utils.shuffle([x.words for x in tqdm(corpus)]), total_examples=len(corpus), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

  model.save(f"/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/trained_embeddings/fastText_{name_corpus}_sg_{sg}_size_{vector_size}_window_{window}_min_count_{min_count}.model")
  return model

In [8]:
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.wv]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return np.zeros((1, word2vec_model.vector_size))

In [9]:
def get_vectors(model, corpus):
  vectors = []
  for sentence in corpus:
      vec = get_mean_vector(model, sentence)
      vectors.append(vec)
  return vectors

In [10]:
def get_max_len_sentence(series):
  res = series.str.split().str.len().max()

  print(f"The maximum length in words are : {res}") 

**TWEET METINLERI İÇİN BENCHMARK**

In [11]:
tweet_train = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_train.csv")
tweet_test = pd.read_csv("/content/drive/MyDrive/hesaplamalı_anlambilim_ödev/preprocess_test.csv")

In [12]:
tweet_test.dropna(inplace=True)
tweet_test.reset_index(drop=True,inplace=True)

tweet_train.dropna(inplace=True)
tweet_train.reset_index(drop=True,inplace=True)

x_train=tweet_train.tweet
y_train=tweet_train.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values
x_test=tweet_test.tweet
y_test=tweet_test.sentiment.map({'olumlu':1,'olumsuz':-1,'notr':0}).values

In [13]:
concat = pd.concat([x_train, x_test])
corpus = labelize_tweets_ug(concat, 'all')

In [14]:
corpus_train = pd.DataFrame(x_train)['tweet'].apply(lambda x: x.split())
corpus_test = pd.DataFrame(x_test)['tweet'].apply(lambda x: x.split())

In [15]:
get_max_len_sentence(pd.DataFrame(concat).tweet)

The maximum length in words are : 27


# n_epoch = 25, sg = 1, negative = 5, alpha = 0.065
# değişenler:
# min_n, max_n = (2,4)  (3,6) (4,8) window= 3 vector_size= 150
# window 3 5 7 min,max = (4,8)
# vector size = 25 50 75 100 window = 3 min,max = (4,8)

# min_n, max_n = (2,4)

In [31]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n =2,
                        max_n = 4,
                        window = 3,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1330159.35it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1651334.01it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1819366.38it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1438643.77it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1762328.08it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1477033.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1388960.96it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1779739.90it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1633668.89it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1364997.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1789395.15it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1824144.64it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1564528.99it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1744789.94it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1942931.40it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1718105.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1517750.58it/

LogisticRegression:
accuracy: 0.6487819025522041
precision: 0.6430791713300478
recall: 0.6346088512046438
              precision    recall  f1-score   support

          -1       0.67      0.77      0.72      1373
           0       0.64      0.59      0.61      1160
           1       0.62      0.55      0.58       915

    accuracy                           0.65      3448
   macro avg       0.64      0.63      0.64      3448
weighted avg       0.65      0.65      0.65      3448

RandomForest:
accuracy: 0.6374709976798144
precision: 0.6503638499540287
recall: 0.6122830677581264
              precision    recall  f1-score   support

          -1       0.62      0.83      0.71      1373
           0       0.65      0.56      0.60      1160
           1       0.69      0.44      0.54       915

    accuracy                           0.64      3448
   macro avg       0.65      0.61      0.62      3448
weighted avg       0.65      0.64      0.63      3448

SVM:
accuracy: 0.685034802784222

# min_n, max_n = (3,6)

In [32]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n =3,
                        max_n = 6,
                        window = 3,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1427188.74it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1752451.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1649938.41it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1996194.76it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1608387.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1639192.67it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1412205.35it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1595174.87it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1690244.84it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2026548.75it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1701592.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1327177.75it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1613921.92it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1795354.76it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1687954.00it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1900702.65it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1788996.30it/

LogisticRegression:
accuracy: 0.652262180974478
precision: 0.6479713590702573
recall: 0.6376235530740754
              precision    recall  f1-score   support

          -1       0.67      0.77      0.72      1373
           0       0.65      0.59      0.62      1160
           1       0.63      0.55      0.59       915

    accuracy                           0.65      3448
   macro avg       0.65      0.64      0.64      3448
weighted avg       0.65      0.65      0.65      3448

RandomForest:
accuracy: 0.6531322505800464
precision: 0.6707176163641102
recall: 0.628402999277249
              precision    recall  f1-score   support

          -1       0.63      0.84      0.72      1373
           0       0.66      0.58      0.62      1160
           1       0.72      0.47      0.57       915

    accuracy                           0.65      3448
   macro avg       0.67      0.63      0.63      3448
weighted avg       0.66      0.65      0.64      3448

SVM:
accuracy: 0.6879350348027842


# min_n, max_n = (4,8)

In [33]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n =4,
                        max_n = 8,
                        window = 3,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1634334.21it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1752451.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1595315.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1536377.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1249793.30it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1909444.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1700951.41it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1853397.23it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1650315.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1311491.04it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1961662.64it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1842053.95it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1646290.85it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2042825.32it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1747322.29it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1411984.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1425864.66it/

LogisticRegression:
accuracy: 0.6368909512761021
precision: 0.6336631485466095
recall: 0.6213164743729886
              precision    recall  f1-score   support

          -1       0.65      0.77      0.70      1373
           0       0.63      0.56      0.59      1160
           1       0.62      0.53      0.58       915

    accuracy                           0.64      3448
   macro avg       0.63      0.62      0.62      3448
weighted avg       0.64      0.64      0.63      3448

RandomForest:
accuracy: 0.6490719257540604
precision: 0.663095303564246
recall: 0.6238536294439593
              precision    recall  f1-score   support

          -1       0.63      0.84      0.72      1373
           0       0.65      0.57      0.61      1160
           1       0.71      0.46      0.56       915

    accuracy                           0.65      3448
   macro avg       0.66      0.62      0.63      3448
weighted avg       0.66      0.65      0.64      3448

SVM:
accuracy: 0.6830046403712297

# window 3

In [16]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n =4,
                        max_n = 18,
                        window = 3,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1667959.07it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1938551.43it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1548862.57it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1422719.28it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1565173.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1576204.58it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1589383.55it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1158157.82it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1732819.26it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1374713.17it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1724298.28it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1668498.45it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1528347.83it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1580930.68it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1741928.80it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1188181.46it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2096056.63it/

LogisticRegression:
accuracy: 0.6496519721577726
precision: 0.645502872865643
recall: 0.6342557066521046
              precision    recall  f1-score   support

          -1       0.66      0.77      0.71      1373
           0       0.65      0.59      0.62      1160
           1       0.62      0.54      0.58       915

    accuracy                           0.65      3448
   macro avg       0.65      0.63      0.64      3448
weighted avg       0.65      0.65      0.65      3448

RandomForest:
accuracy: 0.6548723897911833
precision: 0.6724687404112669
recall: 0.630759174955842
              precision    recall  f1-score   support

          -1       0.63      0.84      0.72      1373
           0       0.67      0.59      0.62      1160
           1       0.73      0.47      0.57       915

    accuracy                           0.65      3448
   macro avg       0.67      0.63      0.64      3448
weighted avg       0.67      0.65      0.65      3448



# window 5

In [17]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 5,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1276472.52it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1813519.71it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1911668.44it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1653753.60it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1287805.59it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1417943.65it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1867581.14it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1396857.66it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1488385.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1587078.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1715982.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1605349.44it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1911213.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1445032.88it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1768324.89it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1208881.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1685512.03it/

LogisticRegression:
accuracy: 0.6415313225058005
precision: 0.6393875416255268
recall: 0.6284215449616422
              precision    recall  f1-score   support

          -1       0.65      0.76      0.70      1373
           0       0.63      0.57      0.60      1160
           1       0.63      0.56      0.59       915

    accuracy                           0.64      3448
   macro avg       0.64      0.63      0.63      3448
weighted avg       0.64      0.64      0.64      3448

RandomForest:
accuracy: 0.6551624129930395
precision: 0.6750695717340665
recall: 0.6290173147797805
              precision    recall  f1-score   support

          -1       0.62      0.84      0.72      1373
           0       0.67      0.59      0.63      1160
           1       0.73      0.45      0.56       915

    accuracy                           0.66      3448
   macro avg       0.68      0.63      0.63      3448
weighted avg       0.67      0.66      0.65      3448



# window 7

In [18]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 7,
                        vector_size=150,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1697035.20it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1484652.93it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1526023.10it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2071175.12it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2000617.72it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1931760.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1378438.32it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1531523.45it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1951540.05it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1585580.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1548430.94it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1218116.42it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1492444.29it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1603282.80it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1354706.47it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1619857.45it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1362191.74it/

LogisticRegression:
accuracy: 0.6316705336426914
precision: 0.6269684729508896
recall: 0.6161324786764658
              precision    recall  f1-score   support

          -1       0.65      0.76      0.70      1373
           0       0.62      0.56      0.59      1160
           1       0.61      0.52      0.56       915

    accuracy                           0.63      3448
   macro avg       0.63      0.62      0.62      3448
weighted avg       0.63      0.63      0.63      3448

RandomForest:
accuracy: 0.6447215777262181
precision: 0.6632324041858676
recall: 0.6202809787839475
              precision    recall  f1-score   support

          -1       0.62      0.83      0.71      1373
           0       0.64      0.58      0.61      1160
           1       0.73      0.46      0.56       915

    accuracy                           0.64      3448
   macro avg       0.66      0.62      0.63      3448
weighted avg       0.66      0.64      0.64      3448



# vector size 25

In [19]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 3,
                        vector_size=25,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1645878.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1941677.96it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1621712.17it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1652769.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1712849.84it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1558688.55it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1390833.02it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1556136.57it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1901753.51it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1429137.06it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1625105.26it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1382871.70it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1829874.69it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1760481.18it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1060849.50it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1743610.69it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1428119.88it/

LogisticRegression:
accuracy: 0.6197795823665894
precision: 0.6142664843949519
recall: 0.6020096003423312
              precision    recall  f1-score   support

          -1       0.63      0.77      0.70      1373
           0       0.62      0.53      0.57      1160
           1       0.59      0.50      0.54       915

    accuracy                           0.62      3448
   macro avg       0.61      0.60      0.60      3448
weighted avg       0.62      0.62      0.61      3448

RandomForest:
accuracy: 0.660092807424594
precision: 0.663147722575612
recall: 0.6408168738097012
              precision    recall  f1-score   support

          -1       0.65      0.80      0.72      1373
           0       0.67      0.61      0.64      1160
           1       0.67      0.51      0.58       915

    accuracy                           0.66      3448
   macro avg       0.66      0.64      0.65      3448
weighted avg       0.66      0.66      0.65      3448



# vector size 50

In [20]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 5,
                        vector_size=50,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1411349.98it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1766724.64it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1597750.57it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1427019.57it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1718432.34it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1735316.95it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1959693.53it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1605278.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1750625.36it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1552991.71it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1395966.83it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1619603.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1430608.88it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1548895.78it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1629247.19it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1060538.00it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1467908.38it/

LogisticRegression:
accuracy: 0.6281902552204176
precision: 0.623084895384813
recall: 0.6124655910993159
              precision    recall  f1-score   support

          -1       0.64      0.76      0.70      1373
           0       0.64      0.56      0.60      1160
           1       0.59      0.52      0.55       915

    accuracy                           0.63      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448

RandomForest:
accuracy: 0.6592227378190255
precision: 0.671673799363108
recall: 0.6372324731273994
              precision    recall  f1-score   support

          -1       0.64      0.83      0.72      1373
           0       0.67      0.59      0.63      1160
           1       0.71      0.49      0.58       915

    accuracy                           0.66      3448
   macro avg       0.67      0.64      0.64      3448
weighted avg       0.67      0.66      0.65      3448



# vector size 75

In [21]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 5,
                        vector_size=75,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1556840.89it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1923939.37it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1877532.39it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1696915.61it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1493987.66it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1313494.26it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1475554.67it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1044376.54it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1671432.62it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1856541.16it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1478726.35it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1630387.17it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1501378.11it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1463892.34it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1706779.06it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1570925.66it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1628769.61it/

LogisticRegression:
accuracy: 0.627030162412993
precision: 0.6228882545948219
recall: 0.6128147179539472
              precision    recall  f1-score   support

          -1       0.64      0.76      0.69      1373
           0       0.63      0.55      0.58      1160
           1       0.60      0.54      0.57       915

    accuracy                           0.63      3448
   macro avg       0.62      0.61      0.61      3448
weighted avg       0.63      0.63      0.62      3448

RandomForest:
accuracy: 0.6534222737819025
precision: 0.6697048766782502
recall: 0.6291239295917255
              precision    recall  f1-score   support

          -1       0.63      0.84      0.72      1373
           0       0.66      0.58      0.62      1160
           1       0.72      0.47      0.57       915

    accuracy                           0.65      3448
   macro avg       0.67      0.63      0.63      3448
weighted avg       0.66      0.65      0.64      3448



# vector size 100

In [22]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 4,
                        max_n = 8,
                        window = 5,
                        vector_size=100,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1666958.29it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1713499.94it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1880220.30it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1617499.74it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1406266.86it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1566360.99it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1723352.11it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1801938.37it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1573834.97it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1827976.10it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1584850.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1562228.11it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1765256.68it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1482824.31it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1995753.54it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1737696.44it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1405199.95it/

LogisticRegression:
accuracy: 0.6415313225058005
precision: 0.6377774169307019
recall: 0.6267648249850534
              precision    recall  f1-score   support

          -1       0.65      0.77      0.71      1373
           0       0.63      0.57      0.60      1160
           1       0.62      0.54      0.58       915

    accuracy                           0.64      3448
   macro avg       0.64      0.63      0.63      3448
weighted avg       0.64      0.64      0.64      3448

RandomForest:
accuracy: 0.6528422273781903
precision: 0.6713334355549536
recall: 0.6291323927185452
              precision    recall  f1-score   support

          -1       0.62      0.84      0.71      1373
           0       0.66      0.58      0.62      1160
           1       0.73      0.48      0.58       915

    accuracy                           0.65      3448
   macro avg       0.67      0.63      0.64      3448
weighted avg       0.66      0.65      0.64      3448



# en iyiler vector_size = 100, window=5, min,max = (3,6)

In [23]:
model = train_fasText(corpus=corpus, 
                        n_epoch=25, 
                        name_corpus="tweet", 
                        sg=1, 
                        negative=5,
                        alpha = 0.065,
                        min_n = 3,
                        max_n = 6,
                        window = 5,
                        vector_size=100,
                        min_count=2)

vectors_train = get_vectors(model=model,
                            corpus=corpus_train)
vectors_test = get_vectors(model=model,
                            corpus=corpus_test)

X_train = np.array(vectors_train)
X_train = np.vstack(X_train)
X_test = np.array(vectors_test)
X_test = np.vstack(X_test)


classification_report(x_train=X_train,
                      x_test=X_test,
                      y_train=y_train,
                      y_test=y_test)

100%|██████████| 17222/17222 [00:00<00:00, 1568061.12it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1864784.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1716268.38it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1482489.55it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1861996.79it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1755859.49it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1874560.22it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1623936.14it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1222590.31it/s]
100%|██████████| 17222/17222 [00:00<00:00, 2013836.56it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1575517.00it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1413282.92it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1410192.75it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1689493.71it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1926916.09it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1785414.59it/s]
100%|██████████| 17222/17222 [00:00<00:00, 1959480.89it/

LogisticRegression:
accuracy: 0.6453016241299304
precision: 0.6400547229655147
recall: 0.6289731910100733
              precision    recall  f1-score   support

          -1       0.66      0.78      0.72      1373
           0       0.64      0.57      0.61      1160
           1       0.62      0.53      0.57       915

    accuracy                           0.65      3448
   macro avg       0.64      0.63      0.63      3448
weighted avg       0.64      0.65      0.64      3448

RandomForest:
accuracy: 0.6499419953596288
precision: 0.6663915291036622
recall: 0.6245697460691407
              precision    recall  f1-score   support

          -1       0.62      0.84      0.72      1373
           0       0.66      0.57      0.61      1160
           1       0.72      0.46      0.56       915

    accuracy                           0.65      3448
   macro avg       0.67      0.62      0.63      3448
weighted avg       0.66      0.65      0.64      3448

