# Skipgram

In [None]:
import pandas as pd
import re

## Load data

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/dennywr/cobaprosaindata/main/ptautm.csv')

In [None]:
df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,Judul,Penulis,Dosen Pembimbing I,Dosen Pembimbing II,Abstrak
0,0,PERANCANGAN DAN IMPLEMENTASI SISTEM DATABASE \...,A.Ubaidillah S.Kom,Budi Setyono M.T,Hermawan S.T,Sistem informasi akademik (SIAKAD) merupaka...
1,1,APLIKASI KONTROL DAN MONITORING JARINGAN KOMPU...,"M. Basith Ardianto,","Drs. Budi Soesilo, MT","Koko Joni, ST",Berjalannya koneksi jaringan komputer dengan l...
2,2,RANCANG BANGUN APLIKASI PROXY SERVER UNTUK\nEN...,"Akhmad Suyandi, S.Kom","Drs. Budi Soesilo, M.T","Hermawan, ST, MT",Web server adalah sebuah perangkat lunak serve...
3,3,SISTEM PENDUKUNG KEPUTUSAN OPTIMASI PENJADWALA...,Heri Supriyanto,"Mulaab, S.Si., M.Kom","Firli Irhamni, ST., M.Kom",Penjadwalan kuliah di Perguruan Tinggi me...
4,4,SISTEM AUGMENTED REALITY ANIMASI BENDA BERGERA...,Septian Rahman Hakim,"Arik Kurniawati, S.Kom., M.T.","Haryanto, S.T., M.T.",Seiring perkembangan teknologi yang ada diduni...
...,...,...,...,...,...,...
853,853,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,Rachmad Agung Pambudi,"Eka Mala Sari Rochman, S.Kom., M.Kom","Sri Herawati, S.Kom., M.Kom",Investasi saham selama ini memiliki resiko ker...
854,854,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,Nadila Hidayanti,"Achmad Jauhari, S.T., M.Kom","Ika Oktavia Suzanti, S.Kom., M.Cs",Information Retrieval (IR) merupakan pengambil...
855,855,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,Afni Sakinah,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Moch. Kautsar Sophan, S.Kom., M.MT.",Klasifikasi citra merupakan proses pengelompok...
856,856,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,Friska Fatmawatiningrum,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Prof. Dr. Arief Muntasa, S.Si., M.MT.",Identifikasi atribut pejalan kaki merupakan sa...


In [None]:
df['Abstrak'] = df['Abstrak'].fillna('').astype(str)

## Preprocessing

- Hapus karakter spesial pada teks

In [None]:
def removeSpecialText (text):
  """Strip escape-sequence residue, non-ASCII characters and URL schemes.

  Literal two-character sequences backslash-t, backslash-n and backslash-u
  are deleted, followed by any remaining backslash. Characters outside ASCII
  are replaced by '?', and the prefixes "http://" / "https://" are replaced
  by a single space.
  """
  # Order matters: the two-character sequences must go before lone backslashes.
  for residue in ('\\t', '\\n', '\\u', '\\'):
    text = text.replace(residue, "")
  # The 'replace' error handler turns every non-ASCII character into '?'.
  ascii_text = text.encode('ascii', 'replace').decode('ascii')
  for scheme in ("http://", "https://"):
    ascii_text = ascii_text.replace(scheme, " ")
  return ascii_text
df['Abstrak'] = df['Abstrak'].apply(removeSpecialText)
print(df['Abstrak'])

0      Sistem  informasi  akademik  (SIAKAD) merupaka...
1      Berjalannya koneksi jaringan komputer dengan l...
2      Web server adalah sebuah perangkat lunak serve...
3      Penjadwalan  kuliah  di  Perguruan  Tinggi  me...
4      Seiring perkembangan teknologi yang ada diduni...
                             ...                        
853    Investasi saham selama ini memiliki resiko ker...
854    Information Retrieval (IR) merupakan pengambil...
855    Klasifikasi citra merupakan proses pengelompok...
856    Identifikasi atribut pejalan kaki merupakan sa...
857    Topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 858, dtype: object


- Hapus tanda baca pada teks

In [None]:
def removePunctuation(text):
  """Replace mentions, URLs and punctuation in ``text`` with a space.

  Each match of one of the following is replaced by a single space:
  an ``@`` followed by ASCII letters/digits, any character that is not an
  ASCII letter, digit, space or tab, and a ``scheme://...`` URL.
  Whitespace is not collapsed, so runs of spaces may remain.
  """
  # Raw string: the previous non-raw literal relied on invalid escape
  # sequences (\w, \/, \S), which newer Python versions warn about.
  return re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)

df['Abstrak'] = df['Abstrak'].apply(removePunctuation)
df['Abstrak'].head(20)

0     Sistem  informasi  akademik   SIAKAD  merupaka...
1     Berjalannya koneksi jaringan komputer dengan l...
2     Web server adalah sebuah perangkat lunak serve...
3     Penjadwalan  kuliah  di  Perguruan  Tinggi  me...
4     Seiring perkembangan teknologi yang ada diduni...
5     Gerak pekerja ada pada game yang memiliki genr...
6     Perkembangan game yang semakin pesat  memberik...
7     Sistem pengenalan wajah adalah suatu sistem un...
8     Teknologi mobile game beroperating system open...
9     Kantor Badan Kepegawaian kota Bangkalan adalah...
10    Penyusunan Sistem Informasi Dinas Perindustria...
11    Perusahaan pemerintah maupun swasta mempunyai ...
12    Pelayanan makanan bagi pasien rawat inap di Ru...
13    Penyusunan Sistem Pendukung Keputusan pemiliha...
14    Sidik jari adalah salah satu karakteristik fis...
15    Di Indonesia masalah perkembangan gizi adalah ...
16    Pengenalan tulisan tangan merupakan topik pene...
17    Citra senyum merupakan salah satu fitur bi

- Hapus angka pada teks

In [None]:
def removeNumbers (text):
  """Return ``text`` with every run of digits deleted (nothing is inserted)."""
  digit_run = re.compile(r"\d+")
  return digit_run.sub("", text)
df['Abstrak'] = df['Abstrak'].apply(removeNumbers)
df['Abstrak']

0      Sistem  informasi  akademik   SIAKAD  merupaka...
1      Berjalannya koneksi jaringan komputer dengan l...
2      Web server adalah sebuah perangkat lunak serve...
3      Penjadwalan  kuliah  di  Perguruan  Tinggi  me...
4      Seiring perkembangan teknologi yang ada diduni...
                             ...                        
853    Investasi saham selama ini memiliki resiko ker...
854    Information Retrieval  IR  merupakan pengambil...
855    Klasifikasi citra merupakan proses pengelompok...
856    Identifikasi atribut pejalan kaki merupakan sa...
857    Topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 858, dtype: object

- Ubah semua huruf pada teks menjadi huruf kecil

In [None]:
def casefolding(Abstrak):
  """Return the given text converted to lowercase."""
  return Abstrak.lower()
df['Abstrak'] = df['Abstrak'].apply(casefolding)
df['Abstrak']

0      sistem  informasi  akademik   siakad  merupaka...
1      berjalannya koneksi jaringan komputer dengan l...
2      web server adalah sebuah perangkat lunak serve...
3      penjadwalan  kuliah  di  perguruan  tinggi  me...
4      seiring perkembangan teknologi yang ada diduni...
                             ...                        
853    investasi saham selama ini memiliki resiko ker...
854    information retrieval  ir  merupakan pengambil...
855    klasifikasi citra merupakan proses pengelompok...
856    identifikasi atribut pejalan kaki merupakan sa...
857    topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 858, dtype: object

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


- Instalasi package/library Sastrawi

In [None]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the stemmer once; the same instance is reused for every abstract.
factory = StemmerFactory()
stemming = factory.create_stemmer()

# Stem every abstract. Iterating over the column values avoids the
# label-based lookup df["Abstrak"][i], which only works while the frame
# still carries its default 0..n-1 index.
hasil_stemming = [stemming.stem(abstract) for abstract in df["Abstrak"]]

# Show the stemmed abstracts as a one-column DataFrame.
data_stemming = pd.DataFrame(hasil_stemming, columns=["stemming"])
data_stemming

Unnamed: 0,stemming
0,sistem informasi akademik siakad rupa sistem i...
1,jalan koneksi jaring komputer dengan lancar da...
2,web server adalah buah perangkat lunak server ...
3,jadwal kuliah di guru tinggi rupa masalah yang...
4,iring kembang teknologi yang ada dunia muncul ...
...,...
853,investasi saham lama ini milik resiko rugi yan...
854,information retrieval ir rupa ambil informasi ...
855,klasifikasi citra rupa proses kelompok piksel ...
856,identifikasi atribut pejal kaki rupa salah sat...


- Tokenizing dan Stop Words

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Initialise the stop-word factory.
stop_factory = StopWordRemoverFactory()

# Loop-invariant setup, hoisted out of the per-document loop: the stop-word
# list and the remover object do not depend on the document being processed.
more_stopword = ['dengan', 'ia','bahwa','oleh','aalysis','aam','kunci']
# NOTE(review): `data` previously held the frame returned by pd.read_csv;
# the name is kept here so code relying on it keeps working.
data = stop_factory.get_stop_words()+more_stopword
# Kept for compatibility; this cell itself never calls the remover.
stopword = stop_factory.create_stop_word_remover()
# Set membership is O(1) per token instead of scanning the list.
stopword_lookup = set(data)

words = []

# Tokenise every stemmed document and drop the stop words.
for i in range (len(hasil_stemming)):
  tokens = word_tokenize(hasil_stemming[i])

  # Keep only the tokens that are not stop words.
  removed = [t for t in tokens if t not in stopword_lookup]

  # Collect the filtered tokens of this document.
  words.append(removed)
  print(removed)

['sistem', 'informasi', 'akademik', 'siakad', 'rupa', 'sistem', 'informasi', 'fungsi', 'tangan', 'kelola', 'saji', 'data', 'data', 'akademik', 'pihak', 'fakultas', 'siakad', 'anggap', 'sangat', 'penting', 'beri', 'layan', 'mahasiswa', 'butuh', 'informasi', 'akademik', 'universitas', 'trunojoyo', 'sedia', 'siakad', 'database', 'pusat', 'sistem', 'beri', 'lebih', 'awat', 'mudah', 'butuh', 'sedikit', 'biaya', 'sistem', 'sebut', 'potensi', 'mengahadapi', 'kendala', 'kendala', 'proses', 'transaksi', 'data', 'padat', 'jaring', 'tuju', 'database', 'siakad', 'lambat', 'pemrosesan', 'respon', 'query', 'data', 'simpan', 'makin', 'besar', 'pemrosesan', 'makin', 'kompleks', 'milik', 'lemah', 'sedia', 'data', 'sistem', 'perlu', 'kembang', 'sistem', 'database', 'lebih', 'baik', 'sistem', 'databases', 'distribusi', 'masing', 'masing', 'fakultas', 'jadi', 'solusi', 'masalah', 'atas', 'basisdata', 'distribusi', 'untung', 'milik', 'basisdata', 'pusat', 'awas', 'distribusi', 'reability', 'availability', 

- Melakukan penggabungan kata dari hasil tokenisasi

In [None]:
# Rebuild one space-separated string per document from its token list.
join = [' '.join(doc_tokens) for doc_tokens in words]

result = pd.DataFrame(join, columns=['Join Words'])
result

Unnamed: 0,Join Words
0,sistem informasi akademik siakad rupa sistem i...
1,jalan koneksi jaring komputer lancar ganggu ru...
2,web server buah perangkat lunak server fungsi ...
3,jadwal kuliah guru tinggi rupa masalah komplek...
4,iring kembang teknologi dunia muncul teknologi...
...,...
853,investasi saham lama milik resiko rugi sangat ...
854,information retrieval ir rupa ambil informasi ...
855,klasifikasi citra rupa proses kelompok piksel ...
856,identifikasi atribut pejal kaki rupa salah sat...


## TF-IDF (Ekstraksi Fitur)

TF-IDF (Term Frequency-Inverse Document Frequency) adalah metode yang digunakan untuk mengevaluasi seberapa penting sebuah kata bagi sebuah dokumen dalam kumpulan dokumen, dengan cara mengalikan berapa kali sebuah kata muncul dalam sebuah dokumen (Term Frequency) dengan frekuensi dokumen terbalik (Inverse Document Frequency) dari kata tersebut di seluruh kumpulan dokumen.

In [None]:
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords', quiet=True)

True

In [None]:
finalResult = pd.DataFrame(join, columns=['Join Words'])
finalResult

Unnamed: 0,Join Words
0,sistem informasi akademik siakad rupa sistem i...
1,jalan koneksi jaring komputer lancar ganggu ru...
2,web server buah perangkat lunak server fungsi ...
3,jadwal kuliah guru tinggi rupa masalah komplek...
4,iring kembang teknologi dunia muncul teknologi...
...,...
853,investasi saham lama milik resiko rugi sangat ...
854,information retrieval ir rupa ambil informasi ...
855,klasifikasi citra rupa proses kelompok piksel ...
856,identifikasi atribut pejal kaki rupa salah sat...


- Membentuk matriks dokumen x kata

In [None]:
from nltk.corpus import stopwords

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain list of
# Indonesian stop words; the import above restores the reader whenever the
# cell is executed again.
stopwords = stopwords.words('indonesian')

# Tokens are runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    stop_words=stopwords,
    lowercase=True,
)

# Learn the vocabulary and build the document x term TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(finalResult['Join Words'])



In [None]:
def nltk_frequency_vectorize(corpus):
    """Lazily map every document in ``corpus`` to its token-frequency table.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are separated by whitespace.

    Returns
    -------
    map
        Lazy iterator yielding one ``defaultdict(int)`` (token -> count)
        per document, in corpus order.
    """
    from collections import defaultdict

    def vectorize(doc):
        features = defaultdict(int)

        # Count the tokens of *this* document. The loop used to run over the
        # global `removed`, so every document received the same counts.
        for token in doc.split():
            features[token] += 1

        return features

    # Vectorize the argument rather than a hard-coded global frame.
    return map(vectorize, corpus)
vectnltk=nltk_frequency_vectorize(finalResult['Join Words'])
type(vectnltk)

map

In [None]:
def sklearn_frequency_vectorize(corpus):
    """Fit a default ``CountVectorizer`` on ``corpus`` and return its counts.

    The return value is whatever ``CountVectorizer.fit_transform`` produces
    for the corpus (the term-occurrence counts per document).
    """
    # Imported locally, as in the original scikit-learn frequency example.
    from sklearn.feature_extraction.text import CountVectorizer

    return CountVectorizer().fit_transform(corpus)
vectsklen=sklearn_frequency_vectorize(finalResult['Join Words'])
print(vectsklen)

  (0, 5132)	7
  (0, 2176)	3
  (0, 109)	3
  (0, 5060)	4
  (0, 4764)	1
  (0, 1738)	1
  (0, 5497)	1
  (0, 2531)	1
  (0, 4782)	1
  (0, 1086)	5
  (0, 4154)	1
  (0, 1606)	2
  (0, 241)	1
  (0, 4812)	1
  (0, 4059)	1
  (0, 577)	2
  (0, 2915)	1
  (0, 3108)	1
  (0, 771)	2
  (0, 5897)	1
  (0, 5782)	1
  (0, 4895)	2
  (0, 1087)	3
  (0, 4440)	2
  (0, 2936)	2
  :	:
  (857, 969)	1
  (857, 5996)	1
  (857, 5515)	1
  (857, 5513)	1
  (857, 4493)	1
  (857, 2694)	1
  (857, 4224)	1
  (857, 3747)	1
  (857, 304)	2
  (857, 3743)	1
  (857, 5976)	1
  (857, 5272)	1
  (857, 3337)	1
  (857, 2322)	1
  (857, 4856)	1
  (857, 4813)	1
  (857, 6139)	4
  (857, 6140)	1
  (857, 3048)	1
  (857, 3742)	1
  (857, 700)	1
  (857, 1897)	1
  (857, 1178)	1
  (857, 233)	1
  (857, 4315)	2


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the CountVectorizer document x term matrix.
coun_vect = CountVectorizer(stop_words=stopwords)
count_matrix = coun_vect.fit_transform(finalResult['Join Words'])
count_array = count_matrix.toarray()
# Label the columns with get_feature_names_out(), which returns the terms in
# column-index order. vocabulary_.keys() is in first-appearance order and
# does not line up with the matrix columns, so it mislabels every column.
df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())
df

Unnamed: 0,sistem,informasi,akademik,siakad,rupa,fungsi,tangan,kelola,saji,data,...,from,accelerated,segment,augmentasi,weak,stump,ransel,detector,anchor,pretrained
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
856,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Show the TF-IDF weight of every term per document.
# Columns come from get_feature_names_out() so each label matches its matrix
# column; vocabulary_.keys() is in first-appearance order, not column order.
vsc = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Vectorizer\n")
vsc


TF-IDF Vectorizer



Unnamed: 0,sistem,informasi,akademik,siakad,rupa,fungsi,tangan,kelola,saji,data,...,from,accelerated,segment,augmentasi,weak,stump,ransel,detector,anchor,pretrained
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
854,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
855,0.114924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
856,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(count_array)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
print(tfidf_matrix)

  (0, 2965)	0.09443907691059017
  (0, 3741)	0.1321364193827794
  (0, 828)	0.04957623319576153
  (0, 398)	0.10619987912967242
  (0, 4462)	0.12455046765952321
  (0, 405)	0.08564565727986274
  (0, 5770)	0.0900164180369778
  (0, 479)	0.24910093531904642
  (0, 5093)	0.0661608269986417
  (0, 1281)	0.27969482700935683
  (0, 1062)	0.1321364193827794
  (0, 2493)	0.04015562883728955
  (0, 2888)	0.08344202395990385
  (0, 3362)	0.07567624524333787
  (0, 2633)	0.08488190801146682
  (0, 4995)	0.07047375383335036
  (0, 4379)	0.16294161356807338
  (0, 4573)	0.08644191612898487
  (0, 3918)	0.27969482700935683
  (0, 2821)	0.08414811753089826
  (0, 5666)	0.04280133609859509
  (0, 2310)	0.06758970883936101
  (0, 3770)	0.10399624580971352
  (0, 5610)	0.08276161228587339
  (0, 4307)	0.032051506779121275
  :	:
  (857, 3990)	0.07157473862986906
  (857, 869)	0.05049704519670171
  (857, 2055)	0.06369148463092639
  (857, 3104)	0.05547877016878425
  (857, 1894)	0.04198676041402508
  (857, 1961)	0.0706472181682137

In [None]:
print(tfidf_matrix.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.11492366 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## Word2Vec (Skipgram)

Skip-gram adalah salah satu arsitektur pada algoritma Word2Vec yang digunakan untuk melatih representasi vektor kata yang menangkap makna semantik dan sintaksis, dengan cara memprediksi kata-kata konteks dari sebuah kata target. Kata-kata yang sering muncul dalam konteks yang sama cenderung memiliki vektor yang mirip.

In [None]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

In [None]:
# Tokenise every cleaned document into a list of words.
sentences = [word_tokenize(text) for text in finalResult['Join Words']]

# Train the Word2Vec model (sg=1 selects Skip-gram).
# A fixed seed and a single worker thread make the embeddings repeatable
# between runs; multi-threaded training is order-dependent.
# NOTE(review): gensim also mentions PYTHONHASHSEED for reproducibility across
# interpreter launches — confirm whether that is needed here.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1,
                 seed=42, workers=1)

# model.save('word2vec_model')

# Fungsi untuk menghitung vektor rata-rata dari kata-kata dalam teks
def calculate_average_vector(text, model):
    """Average the word vectors of the tokens in ``text`` known to ``model``.

    ``text`` is tokenised with ``word_tokenize``; tokens missing from
    ``model.wv`` are skipped. Returns an array of length
    ``model.vector_size``, all zeros when no token is found in the model.
    """
    total = np.zeros(model.vector_size)
    hits = 0
    for word in word_tokenize(text):
        if word not in model.wv:
            continue
        total += model.wv[word]
        hits += 1
    # Avoid dividing by zero when none of the tokens is in the vocabulary.
    return total / hits if hits else np.zeros(model.vector_size)

# Ekstraksi fitur dari keseluruhan teks
finalResult['average_vector'] = finalResult['Join Words'].apply(lambda x: calculate_average_vector(x, model))

# print("Vektor rata-rata")
dfSkipgram = pd.DataFrame(finalResult['average_vector'])
dfSkipgram

Unnamed: 0,average_vector
0,"[0.07063663863383307, 0.04908818524045003, 0.0..."
1,"[0.06361712262664429, 0.02655505702354918, 0.0..."
2,"[0.06368132762321638, 0.0985399116956008, 0.13..."
3,"[0.05226992804228383, 0.05942789698019624, 0.0..."
4,"[0.11934994460404236, 0.13313512612169362, -0...."
...,...
853,"[0.012919313900887833, 0.0879733121787597, 0.0..."
854,"[0.013559594003093098, 0.0984657802568969, 0.0..."
855,"[-0.07221999396875617, 0.07375618708902039, 0...."
856,"[-0.04814973098706478, 0.10142612304592219, 0...."


In [None]:
# Expand each document's average vector into one numeric column per embedding
# dimension. The 'average_vector' column stores whole arrays (object dtype),
# so corr(numeric_only=True) had no numeric column to work with and returned
# an empty frame.
embedding_matrix = np.vstack(dfSkipgram['average_vector'].to_list())
embedding_features = pd.DataFrame(
    embedding_matrix,
    index=dfSkipgram.index,
    columns=[f"skipgram_{k}" for k in range(embedding_matrix.shape[1])],
)

# Join the numeric embedding columns onto the documents. The new column names
# cannot clash with 'average_vector', which made finalResult.join(dfSkipgram)
# raise "columns overlap" on a fresh top-to-bottom run.
finalResult_with_features = finalResult.join(embedding_features)

# Pairwise correlation between the numeric (embedding) columns.
correlation_matrix = finalResult_with_features.corr(numeric_only=True)


# Print the correlation matrix.
print("Matrix Korelasi:")
print(correlation_matrix)


Matrix Korelasi:
Empty DataFrame
Columns: []
Index: []
