In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the notebook's data paths live under this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re

from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer


In [None]:
file_path = "/content/drive/MyDrive/StackOverflow_Data/stackoverflow_filtered_2020-2024.csv"
chunksize = 100_000

# Stream the CSV chunk by chunk so the full file never has to sit in memory.
chunks = pd.read_csv(file_path, chunksize=chunksize)

row_count = 0
columns = None

for chunk_index, chunk in enumerate(chunks):
    # The column names are identical for every chunk, so capture them once.
    if chunk_index == 0:
        columns = chunk.columns.tolist()
    row_count += chunk.shape[0]

print("Jumlah total baris:", row_count)
print("Kolom:", columns)


Jumlah total baris: 5907474
Kolom: ['Title', 'Body', 'Tags', 'CreationDate']


In [None]:
#frekuensi tag
tag_counter = Counter()

for chunk in pd.read_csv(file_path, chunksize=chunksize):
    # cleaning tags -> string jadi list
    chunk["Tags"] = chunk["Tags"].apply(lambda x: x.replace("<", "").replace(">", " ").split())
    for tags in chunk["Tags"]:
        tag_counter.update(tags)

print("Jumlah tag unik:", len(tag_counter))
print("10 tag teratas:", tag_counter.most_common(10))

Jumlah tag unik: 2523480
10 tag teratas: [('|python|', 52681), ('|javascript|', 36598), ('|python|pandas|', 28031), ('|r|', 26140), ('|html|css|', 24397), ('|javascript|reactjs|', 20177), ('|excel|vba|', 19225), ('|python|pandas|dataframe|', 18674), ('|flutter|dart|', 18580), ('|c++|', 17761)]


In [None]:
#membatasi jumlah tag top n
TOP_N = 5000
top_tags = set([tag for tag, _ in tag_counter.most_common(TOP_N)])
print(f"Ambil {len(top_tags)} tag terpopuler")

Ambil 5000 tag terpopuler


In [None]:
filtered_output = "/content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000.csv"
first_write = True

for chunk in pd.read_csv(file_path, chunksize=chunksize):
    chunk["Tags"] = chunk["Tags"].fillna("").apply(lambda x: x.strip("<>").split("><") if x else [])
    # filter: hanya pertanyaan yang mengandung minimal 1 tag dari top_tags
    chunk = chunk[chunk["Tags"].apply(lambda tags: any(t in top_tags for t in tags))]

    if not chunk.empty:
        if first_write:
            chunk.to_csv(filtered_output, index=False)
            first_write = False
        else:
            chunk.to_csv(filtered_output, index=False, mode="a", header=False)

print(f"✅ Dataset terfilter dengan Top-{TOP_N} tag disimpan di: {filtered_output}")

✅ Dataset terfilter dengan Top-5000 tag disimpan di: /content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000.csv


In [None]:
import pandas as pd

file_path = "/content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000.csv"

# Load the filtered file in one go for a quick sanity check.
df = pd.read_csv(file_path)

print("Kolom yang ada:", list(df.columns))
print("Jumlah total baris:", df.shape[0])
print("Contoh 10 tag pertama:\n", df.head(10))

# Number of rows that are exact duplicates of an earlier row.
duplicates = df.duplicated().sum()
print("Jumlah duplikat:", duplicates)

# Number of missing cells, summed over every column.
nulls = df.isna().sum().sum()
print("Jumlah null:", nulls)


Kolom yang ada: ['Title', 'Body', 'Tags', 'CreationDate']
Jumlah total baris: 1918986
Contoh 10 tag pertama:
                                                Title  \
0  Why is my call to strcmp not working as expected?   
1          How to align an enumerated list in latex?   
2            tkinter Frame not filling parent Canvas   
3  Concatenating strings into a range with altern...   
4  Test.MainForm.Dispose(bool): no ​suitable meth...   
5  Is it possible to show the computed/Methods ca...   
6                 Merge Two Array of Struct in Swift   
7  To add string at the end of matched line with ...   
8  How to download Docker for Windows without cre...   
9  django in production enviorment, urls.py only ...   

                                                Body  \
0  <p>I'm new to C and programming in general. It...   
1  <p>Suppose I want to center align the enumerat...   
2  <p>I'm trying to create a scrollable widget us...   
3  <p>I'm trying to concatenate a string out of s

#mulai dari sini bes

In [None]:
import pandas as pd
import ast
import re

# Source CSV to clean and destination CSV that receives the added tag_list column.
file_path = "/content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000.csv"
output_path = "/content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000-bersih.csv"

# Rows per chunk when streaming the source file.
chunksize = 50_000

# `first` marks the first chunk (file is created with a header); `total_rows` is a running row count.
first, total_rows = True, 0

def clean_tags(tag_str):
    """Convert a stored ``Tags`` value into a flat list of tag names.

    Example: the string ``"['|python|django|']"`` becomes ``['python', 'django']``.

    Accepted inputs:
      * a stringified list, e.g. ``"['|python|django|']"`` or ``"['a', 'b']"``
        (every element is used, and nested lists are flattened);
      * a raw delimited string, e.g. ``'|python|django|'`` or ``'<python><django>'``;
      * an already-parsed list/tuple of such strings;
      * NaN / None / any other type, which yields ``[]``.

    Parameters
    ----------
    tag_str : str, list, tuple, float or None
        The cell value read from the ``Tags`` column.

    Returns
    -------
    list[str]
        Individual tag names in order of appearance. A string that looks like
        a list literal but cannot be parsed yields ``[]``.
    """
    if isinstance(tag_str, str):
        text = tag_str.strip()
        if text.startswith("["):
            try:
                parsed = ast.literal_eval(text)  # string -> Python list
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Malformed list literal: report "no tags" rather than crash the run.
                return []
        else:
            # Raw delimited string that was never wrapped in a list.
            parsed = [text]
    elif isinstance(tag_str, (list, tuple)):
        parsed = tag_str
    else:
        # NaN, None, numbers, ...
        return []
    return list(_iter_tags(parsed))


def _iter_tags(value):
    """Yield tag names from a delimited string or a (possibly nested) list of them."""
    if isinstance(value, str):
        # Treat '<' and '>' as delimiters too, then split on '|' and drop empties.
        for tag in value.replace("<", "|").replace(">", "|").split("|"):
            if tag:
                yield tag
    elif isinstance(value, (list, tuple)):
        for item in value:
            yield from _iter_tags(item)

# Stream the source CSV, add a parsed tag_list column to each chunk, and write it out.
reader = pd.read_csv(file_path, chunksize=chunksize)

for i, chunk in enumerate(reader):
    chunk["tag_list"] = chunk["Tags"].map(clean_tags)

    # The first chunk creates the file and writes the header; later chunks append without one.
    chunk.to_csv(
        output_path,
        index=False,
        mode="w" if first else "a",
        header=first,
    )
    first = False

    total_rows += len(chunk)
    print(f"Chunk {i+1} selesai diproses ({len(chunk)} baris)")

print(f"\nPreprocessing selesai! Total baris: {total_rows}")
print(f"Hasil tersimpan di: {output_path}")


Chunk 1 selesai diproses (50000 baris)
Chunk 2 selesai diproses (50000 baris)
Chunk 3 selesai diproses (50000 baris)
Chunk 4 selesai diproses (50000 baris)
Chunk 5 selesai diproses (50000 baris)
Chunk 6 selesai diproses (50000 baris)
Chunk 7 selesai diproses (50000 baris)
Chunk 8 selesai diproses (50000 baris)
Chunk 9 selesai diproses (50000 baris)
Chunk 10 selesai diproses (50000 baris)
Chunk 11 selesai diproses (50000 baris)
Chunk 12 selesai diproses (50000 baris)
Chunk 13 selesai diproses (50000 baris)
Chunk 14 selesai diproses (50000 baris)
Chunk 15 selesai diproses (50000 baris)
Chunk 16 selesai diproses (50000 baris)
Chunk 17 selesai diproses (50000 baris)
Chunk 18 selesai diproses (50000 baris)
Chunk 19 selesai diproses (50000 baris)
Chunk 20 selesai diproses (50000 baris)
Chunk 21 selesai diproses (50000 baris)
Chunk 22 selesai diproses (50000 baris)
Chunk 23 selesai diproses (50000 baris)
Chunk 24 selesai diproses (50000 baris)
Chunk 25 selesai diproses (50000 baris)
Chunk 26 

In [None]:
import pandas as pd
import ast

file_path = "/content/drive/MyDrive/StackOverflow_Data/all_baru/topN-5000-bersih.csv"

# Only the first five rows are needed to inspect the cleaned file.
df = pd.read_csv(file_path, nrows=5)

print("Kolom dataset:", df.columns.tolist())
print("\nContoh isi baris pertama:")
print(df.iloc[0])

# tag_list comes back from CSV as text, so parse it to confirm it is a real list.
sample = df["tag_list"].iloc[0]
try:
    if isinstance(sample, str):
        parsed = ast.literal_eval(sample)
    else:
        parsed = sample
    print("\nTipe data tag_list:", type(parsed))
    print("Isi tag_list:", parsed)
except Exception as e:
    print("Gagal parse:", e)


Kolom dataset: ['Title', 'Body', 'Tags', 'CreationDate', 'tag_list']

Contoh isi baris pertama:
Title           Why is my call to strcmp not working as expected?
Body            <p>I'm new to C and programming in general. It...
Tags                                                      ['|c|']
CreationDate                              2020-01-01T09:13:36.567
tag_list                                                    ['c']
Name: 0, dtype: object

Tipe data tag_list: <class 'list'>
Isi tag_list: ['c']
