In [1]:
import pandas as pd
import os
from pymongo import MongoClient
from dotenv import load_dotenv

In [2]:
# Populate the process environment from the local .env file before reading it
load_dotenv()

# Target MongoDB database and the collection holding the labeled comments
db_name, collection_label = 'youtube_rewind_indonesia', 'labeling_all'

# Connection strings taken from the environment; each is None when its
# variable is not set
mongodb_url = os.environ.get('URL_SANDY')
local_url = os.environ.get('URL_LOCAL')

In [3]:
# Open a client with the local connection string and resolve the database
# and source collection configured above
client = MongoClient(local_url)
db = client[db_name]
labeling_all_collection = db[collection_label]

# Fetch every document of the collection (find() with no filter) into memory
labeled_data = [document for document in labeling_all_collection.find()]

# Build the working DataFrame from the fetched documents
df = pd.DataFrame(labeled_data)

In [4]:
# Build class-balanced train/test splits from the labeled comments in `df`.
SAMPLES_PER_CLASS = 1000                   # rows drawn per label for each split
SEED = 42                                  # fixed seed so the split is reproducible
LABELS = ['netral', 'positif', 'negatif']  # classes sampled for the training split

# Training data: exactly SAMPLES_PER_CLASS rows per label, concatenated in
# LABELS order. sample() raises ValueError if a label has fewer rows than that.
train_df = pd.concat([
    df[df['label'] == label].sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    for label in LABELS
])

# Everything not picked for training is a candidate for the test split.
# NOTE(review): drop() works on index labels, so this assumes df's index is
# unique (true for the default RangeIndex) - confirm if df is ever re-indexed.
remaining_df = df.drop(train_df.index)

# Test data: up to SAMPLES_PER_CLASS rows per label; a label with fewer rows
# left contributes all of them. Iterating the groups (in sorted label order)
# selects the same rows as groupby(...).apply(...) did, without triggering the
# pandas deprecation warning about apply operating on the grouping column.
test_parts = [
    group.sample(n=SAMPLES_PER_CLASS, random_state=SEED)
    if len(group) >= SAMPLES_PER_CLASS
    else group
    for _, group in remaining_df.groupby('label')
]
# pd.concat rejects an empty list, so fall back to an empty frame that keeps
# the columns when nothing is left after the training sample.
test_df = (
    pd.concat(test_parts) if test_parts else remaining_df.iloc[0:0]
).reset_index(drop=True)

print(f"Data Latih: {len(train_df)}")
print(f"Data Uji: {len(test_df)}")
print("Distribusi label pada data latih:")
print(train_df['label'].value_counts())
print("Distribusi label pada data uji:")
print(test_df['label'].value_counts())

Data Latih: 3000
Data Uji: 2557
Distribusi label pada data latih:
label
netral     1000
positif    1000
negatif    1000
Name: count, dtype: int64
Distribusi label pada data uji:
label
netral     1000
positif    1000
negatif     557
Name: count, dtype: int64


  test_df = remaining_df.groupby('label').apply(lambda x: x.sample(n=1000, random_state=42) if len(x) >= 1000 else x).reset_index(drop=True)


In [7]:
# Show the training split as this cell's output
train_df

Unnamed: 0,_id,textOriginal,label,videoId
11476,6651a6a8c14ed2b8f4864227,emang beda,netral,Q5vQawTFJ0I
10608,6651a6a8c14ed2b8f4863ec3,alah kaget kak,netral,Q5vQawTFJ0I
7149,66519bf84bec1f5ab8644cc4,ksni gara tiktok,netral,LhYfsKc0_tA
2694,665189f294a2f3e28c752dcc,wih jodie,netral,1ZIGLm5cuDo
4999,6651962182f57e4e5420d513,bikin eric ko lim dia gk kesi,netral,mCyITaDib7M
...,...,...,...,...
4735,6651962182f57e4e5420d40b,kurang gigit,negatif,mCyITaDib7M
5517,6651962182f57e4e5420d719,eh parah erpan lari cepat,negatif,mCyITaDib7M
4341,6651962182f57e4e5420d281,gren scren,negatif,mCyITaDib7M
2301,665189f294a2f3e28c752c43,aku bingung dis like siapa yah,negatif,1ZIGLm5cuDo


In [9]:
# Persist the training split to the "data_training" collection.
# Reuses the `db` handle opened by the loading cell instead of creating
# another MongoClient for the same URL and database.
collection = db["data_training"]

# Convert the DataFrame into a list of per-row dictionaries
records = train_df.to_dict(orient='records')

# Refuse to continue with an empty split *before* touching the collection:
# insert_many() rejects an empty list, and the existing data must not be
# wiped for nothing.
if not records:
    raise ValueError("train_df is empty; nothing to write to 'data_training'")

# The rows keep the `_id` of their source documents, so a bare insert_many()
# fails with a duplicate-key BulkWriteError when this cell is run again.
# Clearing the collection first makes the cell re-runnable and ensures the
# collection contains exactly the current split (no rows from an older one).
collection.delete_many({})
collection.insert_many(records)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.


In [8]:
# Show the test split as this cell's output
test_df

Unnamed: 0,_id,textOriginal,label,videoId
0,6651837620ee1df904ba49a2,yang punya lagu gk,negatif,fNX6Gu2F7A8
1,6651837620ee1df904ba49b5,tetap bikin merinding keren salam dri,negatif,fNX6Gu2F7A8
2,6651837620ee1df904ba49be,jancuk keren parahthe best youtubers indonesia,negatif,fNX6Gu2F7A8
3,6651837620ee1df904ba49c9,enak nya pas akhir,negatif,fNX6Gu2F7A8
4,6651837620ee1df904ba49cd,apa cuma gue ngerasa cepet sangat video,negatif,fNX6Gu2F7A8
...,...,...,...,...
2552,6651962182f57e4e5420d148,best lah timone gk ad gk asikand gua suka dire...,positif,mCyITaDib7M
2553,6651a6a8c14ed2b8f4864391,keren banget woi nang banget banga banget,positif,Q5vQawTFJ0I
2554,6651a6a8c14ed2b8f4863d40,keren aku bingung pesa apa,positif,Q5vQawTFJ0I
2555,6651962182f57e4e5420d29e,lagu mantap,positif,mCyITaDib7M


In [11]:
# Persist the test split to the "data_testing" collection.
# Reuses the `db` handle opened by the loading cell instead of creating
# another MongoClient for the same URL and database.
collection = db["data_testing"]

# Convert the DataFrame into a list of per-row dictionaries
records = test_df.to_dict(orient='records')

# Refuse to continue with an empty split *before* touching the collection:
# insert_many() rejects an empty list, and the existing data must not be
# wiped for nothing.
if not records:
    raise ValueError("test_df is empty; nothing to write to 'data_testing'")

# The rows keep the `_id` of their source documents, so a bare insert_many()
# fails with a duplicate-key BulkWriteError when this cell is run again.
# Clearing the collection first makes the cell re-runnable and ensures the
# collection contains exactly the current split (no rows from an older one).
collection.delete_many({})
collection.insert_many(records)

print("Data berhasil diimpor ke MongoDB Local.")

Data berhasil diimpor ke MongoDB Local.
