In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

In [13]:
# Load the training data from a semicolon-delimited CSV file.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
data_path = 'C:/Users/ASUS/TA01/00_data/data_bab3.csv'
data = pd.read_csv(filepath_or_buffer=data_path, delimiter=';')

data

Unnamed: 0,keluhan,bagian
0,cetak transkip nilai akhir menu nilai igracias...,BAA
1,sejahtera air asrama gedung keluh air kotor ba...,Bagian Asrama
2,aju praktikum susul,Laboran
3,lahan parkir gku sempit jalan rusak,Logistik dan Aset
4,sulit cari buku,Open Library
5,bantu cek data beasiswa semester ipk nol,Pengelolaan Kegiatan dan Kesejahteraan Mahasiswa
6,daftar geladi gelombang daftar,PPDU
7,minggu nilai eprt janji skor eprt,Pusat Bahasa
8,masuk igracias lupa password akun,Riset dan Layanan Teknologi Informasi


In [14]:
# Label Encoding
category_codes = {
    'BAA': 0,
    'BAGIAN ASRAMA': 1,
    'LABORAN': 2,
    'LOGISTIK DAN ASET': 3,
    'OPEN LIBRARY': 4,
    'PENGELOLAAN KEGIATAN DAN KESEJAHTERAAN MAHASISWA': 5,
    'PPDU': 6,
    'PUSAT BAHASA': 7,
    'RISET DAN LAYANAN TEKNOLOGI INFORMASI': 8
}

data['bagian_label'] = LabelEncoder().fit_transform(data['bagian']) #memberi label pada setiap bagian
data

Unnamed: 0,keluhan,bagian,bagian_label
0,cetak transkip nilai akhir menu nilai igracias...,BAA,0
1,sejahtera air asrama gedung keluh air kotor ba...,Bagian Asrama,1
2,aju praktikum susul,Laboran,2
3,lahan parkir gku sempit jalan rusak,Logistik dan Aset,3
4,sulit cari buku,Open Library,4
5,bantu cek data beasiswa semester ipk nol,Pengelolaan Kegiatan dan Kesejahteraan Mahasiswa,6
6,daftar geladi gelombang daftar,PPDU,5
7,minggu nilai eprt janji skor eprt,Pusat Bahasa,7
8,masuk igracias lupa password akun,Riset dan Layanan Teknologi Informasi,8


In [15]:
# Training inputs are the complaint texts; targets are the encoded department labels.
x_train, y_train = data['keluhan'], data['bagian_label']

In [16]:
# Text representation: TF-IDF
# Vectorizer hyper-parameters
ngram_range = (1, 1)  # (min_n, max_n) = (1, 1): unigrams only
min_df = 1
max_df = 1.0  # min_df / max_df bound the document frequency of the terms kept in the vocabulary
max_features = 25  # keep only the 25 terms with the highest term frequency across the corpus

# Collect the keyword arguments first, then build the vectorizer and bind it to `tfidf`.
tfidf_params = dict(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,
    lowercase=False,  # presumably the text is already lower-cased upstream -- TODO confirm
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term count
)
tfidf = TfidfVectorizer(**tfidf_params)

In [17]:
# Targets are passed through unchanged under the name used alongside the features.
labels_train = y_train
# Fit the vectorizer on the training text and transform it in a single step.
features_train = tfidf.fit_transform(x_train)
# Printing the sparse matrix lists each stored (row, column) entry with its weight.
print(features_train)

  (0, 23)	0.4584241347075178
  (0, 3)	0.3871923225027238
  (0, 11)	0.4584241347075178
  (0, 13)	0.6555735891799439
  (1, 5)	0.3816141458138271
  (1, 7)	0.3816141458138271
  (1, 6)	0.3816141458138271
  (1, 0)	0.6461289150464732
  (1, 19)	0.3816141458138271
  (2, 17)	1.0
  (3, 18)	0.4472135954999579
  (3, 4)	0.4472135954999579
  (3, 21)	0.4472135954999579
  (3, 15)	0.4472135954999579
  (3, 8)	0.4472135954999579
  (4, 24)	1.0
  (5, 14)	0.7071067811865476
  (5, 20)	0.7071067811865476
  (6, 1)	1.0
  (7, 22)	0.42332907472849846
  (7, 2)	0.7167584293256076
  (7, 12)	0.42332907472849846
  (7, 13)	0.35755047611451257
  (8, 16)	0.5189380717981641
  (8, 9)	0.5189380717981641
  (8, 10)	0.5189380717981641
  (8, 3)	0.438303357179945


In [18]:
# Vocabulary terms learned by the vectorizer, in feature-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
[str(term) for term in _get_names()]  # plain Python strings, so the display is a simple list

['air',
 'daftar',
 'eprt',
 'igracias',
 'jalan',
 'kalo',
 'keluh',
 'kotor',
 'lahan',
 'lupa',
 'masuk',
 'menu',
 'minggu',
 'nilai',
 'nol',
 'parkir',
 'password',
 'praktikum',
 'rusak',
 'sejahtera',
 'semester',
 'sempit',
 'skor',
 'solusi',
 'sulit']

In [19]:
# Dense view of the TF-IDF matrix, for inspection only.
dense_features = features_train.todense()
dense_features

matrix([[0.        , 0.        , 0.        , 0.38719232, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.45842413, 0.        , 0.65557359, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.45842413, 0.        ],
        [0.64612892, 0.        , 0.        , 0.        , 0.        ,
         0.38161415, 0.38161415, 0.38161415, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.38161415,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        ,

In [20]:
import pandas as pd

# Term-by-document TF-IDF table: one row per vocabulary term, one column per
# training document (D1, D2, ...).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

df = pd.DataFrame(features_train.toarray().T,  # ndarray instead of the discouraged np.matrix
                  index=[str(term) for term in _get_names()],
                  columns=[f'D{i+1}' for i in range(len(x_train))])

# Use the full option keys: the bare patterns "max_columns" / "max_rows" match
# more than one option in recent pandas (display.* and styler.render.*) and
# raise OptionError there.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
df

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9
air,0.0,0.646129,0.0,0.0,0.0,0.0,0.0,0.0,0.0
daftar,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
eprt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.716758,0.0
igracias,0.387192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438303
jalan,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0
kalo,0.0,0.381614,0.0,0.0,0.0,0.0,0.0,0.0,0.0
keluh,0.0,0.381614,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kotor,0.0,0.381614,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lahan,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0
lupa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.518938
