# Tugas 2 PPW Pre Processing Data PTA Trunojoyo

In [None]:
import numpy as np
import pandas as pd

## Import Data

In [None]:
# Alternative dataset files from the same repository, left disabled:
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatika.csv')
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatikaMini.csv')
# Load the labeled dataset; this file is read with ';' as the field separator.
df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataPTAInformatikaLabel.csv',delimiter=';')
# Keep only the columns whose name does not start with 'Unnamed'.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",Gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika,RPL
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.","Perkembangan game yang semakin pesat, memberik...",Jurusan Teknik Informatika,RPL
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",Sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika,Kecerdasan Komputasional
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",Teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",Kantor Badan Kepegawaian kota Bangkalan adalah...,Jurusan Teknik Informatika,RPL


## Pre Processing Data

### Cek Data yang Kosong

In [None]:
# Count the missing (NaN) values in each column.
df.isnull().sum()

Judul             6
Nama Penulis      0
Pembimbing I      0
Pembimbing II    12
Abstrak          29
Prodi             5
Label             7
dtype: int64

### Menghapus Data yang Kosong

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

### Cleaning Data

#### Membuat Fungsi Cleaning Data
- Tag HTML
- LowerCase Data
- Spasi pada teks
- Tanda baca dan karakter spesial
- Nomor
- Komponen Lainnya

In [None]:
import re, string

# Text Cleaning
def cleaning(text):
    """Normalize raw text into lowercase words separated by single spaces.

    Steps, in order: remove HTML tags and character entities, lowercase,
    replace ASCII punctuation with spaces, drop the stray 'â' character and
    any other non-word symbol, replace digits with spaces, remove the
    standalone token 'nan', then collapse whitespace and trim both ends.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so a
            missing value (float NaN) arrives here as the string 'nan'.

    Returns:
        str: The cleaned text; an empty string when nothing is left.
    """
    # Remove HTML tags and character entities such as <b>, &amp; or &#39;
    text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(text))

    # Lowercase the whole text
    text = text.lower()

    # Replace punctuation with a space so the neighbouring words stay separate
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Drop the stray 'â' character and any remaining non-word symbol
    text = text.replace('â', '')
    text = re.sub(r'[^\w\s]', '', text)

    # Replace digits with a space
    text = re.sub(r'\d', ' ', text)

    # Remove 'nan' (the string form of a missing value) only when it is a whole
    # word; an unanchored pattern would also cut it out of words such as
    # 'layanan' or 'penanganan'.
    text = re.sub(r'\bnan\b', ' ', text)

    # Collapse runs of whitespace and trim, so no leading/trailing space is left
    return re.sub(r'\s+', ' ', text).strip()

#### Implementasi Fungsi Pada Data Frame Abstrak

In [None]:
# Run cleaning() over every abstract, then preview the first rows.
df['Abstrak'] = df['Abstrak'].apply(cleaning)
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika,RPL
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",perkembangan game yang semakin pesat memberika...,Jurusan Teknik Informatika,RPL
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika,Kecerdasan Komputasional
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan kepegawaian kota bangkalan adalah...,Jurusan Teknik Informatika,RPL


### Tokenisasi Data
Memisahkan sebuah Dokumen menjadi susunan per kata / term

#### Import Library NLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch NLTK's 'popular' data collection (many corpora, as the log below shows).
# NOTE(review): word_tokenize presumably only needs the tokenizer models —
# confirm whether a smaller download would be enough.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

#### Implementasi Library pada Data

In [None]:
# Split each cleaned abstract into a list of word tokens, then preview.
df['abstrak_tokens'] = df['Abstrak'].apply(word_tokenize)
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja ada pada game yang memiliki genr...,"[gerak, pekerja, ada, pada, game, yang, memili..."
1,perkembangan game yang semakin pesat memberika...,"[perkembangan, game, yang, semakin, pesat, mem..."
2,sistem pengenalan wajah adalah suatu sistem un...,"[sistem, pengenalan, wajah, adalah, suatu, sis..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan adalah...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


### Stopword Data
Menghapus kata-kata umum (stopword) dari dokumen berdasarkan kamus stopword bahasa Indonesia

#### Import Library NLTK

In [None]:
# Download the NLTK stop-word lists read by stopwords.words() in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Implementasi Library pada Data

In [None]:
from nltk.corpus import stopwords
from itertools import chain

# Indonesian stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('indonesian'))

# Keep only the tokens that are not stop words.
df['abstrak_tokens'] = df['abstrak_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Rebuild the abstract text from the filtered tokens, separated by single spaces.
df['Abstrak'] = df['abstrak_tokens'].apply(' '.join)

In [None]:
# Preview the abstract text next to its token list after stop-word removal.
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja game memiliki genre rts real tim...,"[gerak, pekerja, game, memiliki, genre, rts, r..."
1,perkembangan game pesat alternative peminatnya...,"[perkembangan, game, pesat, alternative, pemin..."
2,sistem pengenalan wajah sistem mengenali ident...,"[sistem, pengenalan, wajah, sistem, mengenali,..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan instan...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


### Stemming Data
Mengubah kata menjadi bentuk dasar

#### Import Library Sastrawi

In [None]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m112.6/209.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


#### Implementasi Library pada Data

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm.auto import tqdm
# Register tqdm with pandas; this provides the progress_apply method used in the next cell.
tqdm.pandas()

# Build the Sastrawi stemmer that the next cell applies to every abstract.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Stem each abstract as one string, then split it back into tokens.
# str.split() without an argument returns [] for an empty result, whereas
# split(' ') returns [''] and would add a blank token to empty abstracts.
df['abstrak_tokens'] = df['abstrak_tokens'].progress_apply(lambda x: stemmer.stem(' '.join(x)).split())

  0%|          | 0/818 [00:00<?, ?it/s]

In [None]:
# Rebuild the abstract text from the stemmed tokens, separated by single spaces.
df['Abstrak'] = df['abstrak_tokens'].apply(' '.join)

In [None]:
# Display the processed frame (pandas abbreviates the middle rows in the output).
df

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label,abstrak_tokens
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak kerja game milik genre rts real time str...,Jurusan Teknik Informatika,RPL,"[gerak, kerja, game, milik, genre, rts, real, ..."
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",kembang game pesat alternative minat bentuk ga...,Jurusan Teknik Informatika,RPL,"[kembang, game, pesat, alternative, minat, ben..."
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem kenal wajah sistem nali identitas wajah...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[sistem, kenal, wajah, sistem, nali, identitas..."
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL,"[teknologi, mobile, game, beroperating, system..."
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan pegawai kota bangkal instansi per...,Jurusan Teknik Informatika,RPL,"[kantor, badan, pegawai, kota, bangkal, instan..."
...,...,...,...,...,...,...,...,...
848,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,Rachmad Agung Pambudi,"Eka Mala Sari Rochman, S.Kom., M.Kom","Sri Herawati, S.Kom., M.Kom",investasi saham milik resiko rugi dikarenakanp...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[investasi, saham, milik, resiko, rugi, dikare..."
849,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,Nadila Hidayanti,"Achmad Jauhari, S.T., M.Kom","Ika Oktavia Suzanti, S.Kom., M.Cs",information retrieval ir ambil informasi simpa...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[information, retrieval, ir, ambil, informasi,..."
850,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,Afni Sakinah,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Moch. Kautsar Sophan, S.Kom., M.MT.",klasifikasi citra proses kelompok piksel citra...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[klasifikasi, citra, proses, kelompok, piksel,..."
851,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,Friska Fatmawatiningrum,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Prof. Dr. Arief Muntasa, S.Si., M.MT.",identifikasi atribut pejal kaki salah teliti k...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[identifikasi, atribut, pejal, kaki, salah, te..."


In [None]:
# Save the preprocessed data without the index column.
# Note: the lists in 'abstrak_tokens' are written as their string form, so
# reading this CSV back yields strings rather than lists.
df.to_csv('DataSteaming.csv', index=False)

In [None]:
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataSteaming.csv')
# df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label,abstrak_tokens
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak kerja game milik genre rts real time str...,Jurusan Teknik Informatika,RPL,"['gerak', 'kerja', 'game', 'milik', 'genre', '..."
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",kembang game pesat alternative minat bentuk ga...,Jurusan Teknik Informatika,RPL,"['kembang', 'game', 'pesat', 'alternative', 'm..."
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem kenal wajah sistem nali identitas wajah...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"['sistem', 'kenal', 'wajah', 'sistem', 'nali',..."
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL,"['teknologi', 'mobile', 'game', 'beroperating'..."
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan pegawai kota bangkal instansi per...,Jurusan Teknik Informatika,RPL,"['kantor', 'badan', 'pegawai', 'kota', 'bangka..."


## Ekstraksi Fitur

### Term Frekuensi

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the raw term-count matrix from the preprocessed abstracts.
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df['Abstrak'].to_numpy())

# One row per document, one column per vocabulary term.
terms_count = count_vectorizer.get_feature_names_out()
df_countvect = pd.DataFrame(X_count.toarray(), columns=terms_count)
df_countvect

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Total number of occurrences of each term across all documents.
token_counts = df_countvect.sum(axis=0)

# Keep only the terms whose total is not zero.
non_zero_token_counts = token_counts.loc[token_counts.ne(0)]

print("Token Counts yang Tidak Sama dengan 0:", non_zero_token_counts, sep="\n")

Token Counts yang Tidak Sama dengan 0:
aalysis    1
abad       1
abadi      2
abai       1
abdi       3
          ..
zone       3
zoning     4
zoom       3
zucara     1
zungu      1
Length: 6366, dtype: int64


In [None]:
# Save the term-count matrix without the index column.
df_countvect.to_csv('Data_CountVectorize.csv', index=False)

### One Hot Encoding

In [None]:
# Binary term presence: 1 when the term occurs in the document, otherwise 0.
# A vectorized comparison replaces the element-wise DataFrame.applymap call,
# which is deprecated in recent pandas releases and slow on a matrix this size.
df_binary = (df_countvect > 0).astype('int64')
df_binary

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save the binary term-presence matrix without the index column.
df_binary.to_csv('Data_OneHotEncoder.csv', index=False)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Weight each term by TF-IDF, using the vectorizer's default settings.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(list(df['Abstrak']))

# One row per document, one column per vocabulary term.
terms = vectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(X_tfidf.toarray(), columns=terms)
df_tfidfvect

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the TF-IDF matrix without the index column.
df_tfidfvect.to_csv('Data_TF-IDF.csv', index=False)

### Log Frekuensi

In [None]:
# Log-scaled term frequency: log(1 + count) for every cell.
# log1p(0) is 0, so zero counts need no special case, and applying the ufunc to
# the whole frame avoids the deprecated, element-wise DataFrame.applymap call.
df_log = np.log1p(df_countvect)
df_log

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the log-frequency matrix without the index column.
df_log.to_csv('Data_LogFrekuensi.csv', index=False)

## Skip Gram Data

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import pandas as pd

# One token list per abstract.
sentences = df['abstrak_tokens'].tolist()

# Skip-gram model (sg=1) with a context window of one word on each side.
# seed and workers=1 make training repeatable from run to run; gensim notes
# that full determinism across interpreter launches also needs PYTHONHASHSEED.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1, seed=42, workers=1)

word = "hasil"

# Check the vocabulary first: most_similar raises KeyError for an unknown word.
if word in model.wv:
    similar_words = model.wv.most_similar(word)

    print(f"Kata yang mirip dengan '{word}':")
    for w, sim in similar_words:
        print(f"{w}: {sim:.4f}")
else:
    print(f"Kata '{word}' tidak ada dalam model.")

Kata yang mirip dengan 'hasil':
tingkat: 0.9966
sistem: 0.9964
nilai: 0.9963
kunci: 0.9963
proses: 0.9962
metode: 0.9962
main: 0.9962
sakit: 0.9961
aplikasi: 0.9960
milik: 0.9956


In [None]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

# One token list per abstract.
sentences = df['abstrak_tokens'].tolist()

# Skip-gram model (sg=1) with a context window of one word on each side.
# seed and workers=1 make training repeatable from run to run.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1, seed=42, workers=1)

word = "gerak"
context_words = []

if word in model.wv:
    # Query by key rather than by raw vector: with a vector, the closest match
    # is the query word itself, which then takes one of the three result slots.
    similar_words = model.wv.most_similar(word, topn=3)
    context_words = [w for w, _ in similar_words]

print(f"Kata-kata dalam konteks window=1 untuk '{word}':")
for w in context_words:
    print(w)


Kata-kata dalam konteks window=1 untuk 'gerak':
gerak
main
bas


In [None]:
import gensim
from gensim.models import Word2Vec

# Skip-gram model trained on the token lists in `sentences` (built in an earlier cell).
# seed and workers=1 make training repeatable from run to run.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1, seed=42, workers=1)

word1 = "gerak"
# NOTE(review): the recorded output reports a missing word for this pair —
# check whether "bs" is the intended token.
word2 = "bs"

# Compare the two words only when both are in the model's vocabulary.
if word1 in model.wv and word2 in model.wv:
    vector1 = model.wv[word1]
    vector2 = model.wv[word2]
    # Cosine similarity between the two word vectors.
    similarity = model.wv.cosine_similarities(vector1, [vector2])[0]

    print(f"Kesamaan kosakata antara '{word1}' dan '{word2}': {similarity:.4f}")
else:
    print("Salah satu atau kedua kata tidak ada dalam model.")


Salah satu atau kedua kata tidak ada dalam model.


# Tugas 3 LDA Modeling

## LDA Modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

## Modeling Data
- k=3
- alpha=0.1
- beta=0.2

In [None]:
# Fit a 3-topic LDA model on the term-count matrix.
# doc_topic_prior and topic_word_prior carry the alpha (0.1) and beta (0.2)
# values listed in the heading above; random_state fixes the random seed.
lda_model = LatentDirichletAllocation(n_components=3, doc_topic_prior=0.1, topic_word_prior=0.2, random_state=42)
lda_model.fit(df_countvect)

## proporsi topik pada dokumen

In [None]:
# Topic mixture of every document: one row per document, one column per topic.
doc_topic_proportions = lda_model.transform(df_countvect)

# Print the proportions with 1-based document and topic numbers.
for doc_number, topic_probs in enumerate(doc_topic_proportions, start=1):
    print(f"Dokumen {doc_number}:")
    for topic_number, topic_prob in enumerate(topic_probs, start=1):
        print(f"Topik {topic_number}: {topic_prob:.4f}")
    print()


Dokumen 1:
Topik 1: 0.9975
Topik 2: 0.0013
Topik 3: 0.0013

Dokumen 2:
Topik 1: 0.9978
Topik 2: 0.0011
Topik 3: 0.0011

Dokumen 3:
Topik 1: 0.0009
Topik 2: 0.0009
Topik 3: 0.9983

Dokumen 4:
Topik 1: 0.5392
Topik 2: 0.3484
Topik 3: 0.1124

Dokumen 5:
Topik 1: 0.6607
Topik 2: 0.0009
Topik 3: 0.3384

Dokumen 6:
Topik 1: 0.7362
Topik 2: 0.0009
Topik 3: 0.2629

Dokumen 7:
Topik 1: 0.9982
Topik 2: 0.0009
Topik 3: 0.0009

Dokumen 8:
Topik 1: 0.3559
Topik 2: 0.1195
Topik 3: 0.5246

Dokumen 9:
Topik 1: 0.1202
Topik 2: 0.7713
Topik 3: 0.1085

Dokumen 10:
Topik 1: 0.3282
Topik 2: 0.5134
Topik 3: 0.1584

Dokumen 11:
Topik 1: 0.0923
Topik 2: 0.7871
Topik 3: 0.1207

Dokumen 12:
Topik 1: 0.0008
Topik 2: 0.0008
Topik 3: 0.9984

Dokumen 13:
Topik 1: 0.0009
Topik 2: 0.0009
Topik 3: 0.9982

Dokumen 14:
Topik 1: 0.0008
Topik 2: 0.0526
Topik 3: 0.9467

Dokumen 15:
Topik 1: 0.9984
Topik 2: 0.0008
Topik 3: 0.0008

Dokumen 16:
Topik 1: 0.0009
Topik 2: 0.9982
Topik 3: 0.0009

Dokumen 17:
Topik 1: 0.0012
Topik

In [None]:
# Topic-term weight matrix of the fitted LDA model (one row per topic).
topic_word_distributions = lda_model.components_

# Take the vocabulary from the CountVectorizer whose matrix the LDA model was
# fitted on, so column indices and term names come from the same source
# (the TF-IDF vectorizer is a separate object).
feature_names = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(topic_word_distributions):
    top_words_idx = topic.argsort()[::-1][:10]  # indices of the 10 highest-weighted terms
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topik {topic_idx+1}:")
    print(", ".join(top_words))
    print()


Topik 1:
sistem, ajar, hasil, aplikasi, nilai, metode, informasi, data, game, proses

Topik 2:
hasil, data, metode, nilai, teliti, proses, sistem, tingkat, akurasi, uji

Topik 3:
citra, metode, hasil, sistem, sakit, teliti, data, akurasi, proses, uji

