# Web Usage Mining

## Dataset webuage

### Filter mengambil request url .html, status 200, dan request methode get

In [None]:
import pandas as pd

# ===========================================
# 1. Baca file CSV
# ===========================================
file_path = "webuage.csv"
df = pd.read_csv(file_path, delimiter=';')

# ===========================================
# 2. Filter data sesuai kriteria
# ===========================================
filtered_df = df[
    (df["Request URI"].str.contains(".html", case=False, na=False)) &
    (df["Status"] == 200) &
    (df["Request method"].str.upper() == "GET")
].reset_index().rename(columns={"index": "Original Row"})

# ===========================================
# 3. Tampilkan hasil
# ===========================================
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.set_option('display.colheader_justify', 'center')

print(f"✅ Jumlah baris hasil filter: {len(filtered_df)}\n")
print(filtered_df.head(20).to_string(index=False))  # tampilkan 20 baris pertama

# ===========================================
# 4. (Opsional) Simpan ke file CSV
# ===========================================
filtered_df.to_csv("filtered_web_log.csv", index=False)


✅ Jumlah baris hasil filter: 75656

 Original Row  Remote host   Remote logname Remote user     Request time     Request method Request URI Request Protocol  Status  Size of response (incl. headers)
       0       65.55.147.227       -             -      2009-10-15T02:00:24Z      GET       /index.html     HTTP/1.1       200                  21878             
       1         65.55.86.34       -             -      2009-10-15T02:00:58Z      GET       /index.html     HTTP/1.1       200                   1416             
       2       148.188.55.88       -             -      2009-10-15T02:01:41Z      GET         /faq.html     HTTP/1.1       200                  10946             
       4      66.249.139.233       -             -      2009-10-15T02:02:09Z      GET         /faq.html     HTTP/1.1       200                  17247             
       5        72.30.50.248       -             -      2009-10-15T02:02:13Z      GET       /index.html     HTTP/1.0       200                   7883

### Merubah request time menjadi WIB

In [None]:
# 2. Filter data sesuai kriteria
filtered_df = df[
    (df["Request URI"].str.contains(".html", case=False, na=False)) &
    (df["Status"] == 200) &
    (df["Request method"].str.upper() == "GET")
].reset_index().rename(columns={"index": "Original Row"})

# 3. Konversi waktu ke WIB (UTC+7)
filtered_df["Request time"] = pd.to_datetime(filtered_df["Request time"], utc=True, format='mixed', dayfirst=False) + pd.Timedelta(hours=7)
filtered_df["Request time"] = filtered_df["Request time"].dt.strftime("%Y-%m-%d %H:%M:%S") + " WIB"

# 4. Tampilkan hasil
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 150)

print(f"✅ Jumlah baris hasil filter: {len(filtered_df)}\n")
print(filtered_df.head(10).to_string(index=False))

# 5. (Opsional) Simpan ke file CSV
filtered_df.to_csv("filtered_web_log_wib.csv", index=False)

✅ Jumlah baris hasil filter: 75656

 Original Row  Remote host   Remote logname Remote user       Request time      Request method Request URI Request Protocol  Status  Size of response (incl. headers)
       0       65.55.147.227       -             -      2009-10-15 09:00:24 WIB      GET       /index.html     HTTP/1.1       200                  21878             
       1         65.55.86.34       -             -      2009-10-15 09:00:58 WIB      GET       /index.html     HTTP/1.1       200                   1416             
       2       148.188.55.88       -             -      2009-10-15 09:01:41 WIB      GET         /faq.html     HTTP/1.1       200                  10946             
       4      66.249.139.233       -             -      2009-10-15 09:02:09 WIB      GET         /faq.html     HTTP/1.1       200                  17247             
       5        72.30.50.248       -             -      2009-10-15 09:02:13 WIB      GET       /index.html     HTTP/1.0       200     

### Filter berdasarkan IP Address dan Request Protocol

In [15]:
import pandas as pd

# --- Pastikan df dimuat dari file CSV ---
file_path = "webuage.csv"
df = pd.read_csv(file_path, delimiter=';')
# ----------------------------------------

# 2. Filter data sesuai kriteria
filtered_df = df[
    (df["Request URI"].str.contains(".html", case=False, na=False)) &
    (df["Status"] == 200) &
    (df["Request method"].str.upper() == "GET")
].reset_index().rename(columns={"index": "Original Row"})

# ----- SORT SESUAI PROTOCOL: HTTP/1.1 dulu, baru HTTP/1.0 -----

# Bersihkan protocol biar seragam
filtered_df["Request Protocol"] = (
    filtered_df["Request Protocol"]
    .astype(str)
    .str.strip()
    .str.upper()
)

# Mapping urutan sort
protocol_order = {"HTTP/1.1": 0, "HTTP/1.0": 1}

# Buat kolom sort sementara
filtered_df["protocol_sort"] = filtered_df["Request Protocol"].map(protocol_order)

# Urutkan
filtered_df = filtered_df.sort_values(
    by=["protocol_sort", "Original Row"],
    ascending=[True, True]
).drop(columns=["protocol_sort"])

# ---------------------------------------------------------------

# 3. Konversi waktu ke WIB (UTC+7)
filtered_df["Request time"] = (
    pd.to_datetime(filtered_df["Request time"], utc=True, format='mixed', dayfirst=False)
    + pd.Timedelta(hours=7)
)
filtered_df["Request time"] = filtered_df["Request time"].dt.strftime("%Y-%m-%d %H:%M:%S") + " WIB"

# 4. Tampilkan hasil
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 150)

print(f"✅ Jumlah baris hasil filter: {len(filtered_df)}\n")
print(filtered_df.head(10).to_string(index=False))

# 5. Simpan
filtered_df.to_csv("filtered_web_protocol.csv", index=False)

✅ Jumlah baris hasil filter: 75656

 Original Row    Remote host Remote logname Remote user            Request time Request method Request URI Request Protocol  Status  Size of response (incl. headers)
            0  65.55.147.227              -           - 2009-10-15 09:00:24 WIB            GET /index.html         HTTP/1.1     200                             21878
            1    65.55.86.34              -           - 2009-10-15 09:00:58 WIB            GET /index.html         HTTP/1.1     200                              1416
            2  148.188.55.88              -           - 2009-10-15 09:01:41 WIB            GET   /faq.html         HTTP/1.1     200                             10946
            4 66.249.139.233              -           - 2009-10-15 09:02:09 WIB            GET   /faq.html         HTTP/1.1     200                             17247
            8    65.55.80.97              -           - 2009-10-15 09:02:51 WIB            GET /index.html         HTTP/1.1     200   

## Dataset NASA

In [None]:
import pandas as pd

# ===========================================
# 1. Baca dataset NASA (misalnya nasa.csv)
# ===========================================
file_path = "nasa.csv"   # ganti sesuai nama file kamu
df = pd.read_csv(file_path, delimiter=";")   # sesuaikan delimiter kalau bukan ';'

# ===========================================
# 2. Tampilkan informasi dataset
# ===========================================

print("\n=== Jumlah Baris & Kolom ===")
print(df.shape)

print("\n=== 10 Baris Pertama dalam Format Tabel ===")
print(df.head(10).to_string(index=False))


  df = pd.read_csv(file_path, delimiter=";")   # sesuaikan delimiter kalau bukan ';'



=== Jumlah Baris & Kolom ===
(1048575, 7)

=== 10 Baris Pertama dalam Format Tabel ===
Unnamed: 0     host       time   method               url                response  bytes
    0      ***.novo.dk 805465029  GET                         /ksc.html    200     7067 
    1      ***.novo.dk 805465031  GET        /images/ksclogo-medium.gif    200     5866 
    2      ***.novo.dk 805465051  GET      /images/MOSAIC-logosmall.gif    200      363 
    3      ***.novo.dk 805465053  GET         /images/USA-logosmall.gif    200      234 
    4      ***.novo.dk 805465054  GET        /images/NASA-logosmall.gif    200      786 
    5      ***.novo.dk 805465058  GET       /images/WORLD-logosmall.gif    200      669 
    6      ***.novo.dk 805465068  GET   /shuttle/missions/missions.html    200     8678 
    7      ***.novo.dk 805465071  GET          /images/launchmedium.gif    200    11853 
    8      ***.novo.dk 805465153  GET         /images/KSC-logosmall.gif    200     1204 
    9      ***.novo.dk

### Filter url .html, response 200, methode GET, time di rubah ke wib

In [None]:
import pandas as pd

# Re-load df from nasa.csv to ensure 'time' column is in epoch format
file_path = "nasa.csv"
df = pd.read_csv(file_path, delimiter=";", low_memory=False)

# Convert 'time' (epoch) to datetime objects
df['time'] = pd.to_datetime(df['time'], unit='s')

# ===========================================
# 2. Filter data (url .html, response 200, method GET)
# ===========================================
filtered_df = df[
    (df['url'].str.contains('.html', case=False, na=False)) &
    (df['response'] == 200) &
    (df['method'].str.upper() == 'GET')
].reset_index(drop=True)

# ===========================================
# 3. Tampilkan hasil
# ===========================================
print("\n=== 10 Baris Pertama Hasil ===")
print(filtered_df.head(10).to_string(index=False))

# ===========================================
# 4. Simpan hasil
# ===========================================
filtered_df.to_csv("filtered_nasa_wib.csv", index=False)
print("\n✅ File berhasil disimpan: filtered_nasa_wib.csv")


=== 10 Baris Pertama Hasil ===
Unnamed: 0           host                  time        method                     url                       response  bytes
     0                 ***.novo.dk 1995-07-11 12:17:09  GET                                      /ksc.html    200     7067 
     6                 ***.novo.dk 1995-07-11 12:17:48  GET                /shuttle/missions/missions.html    200     8678 
    12                 ***.novo.dk 1995-07-11 12:23:01  GET      /shuttle/resources/orbiters/columbia.html    200     6922 
    13                 ***.novo.dk 1995-08-09 07:02:48  GET   /shuttle/missions/sts-69/mission-sts-69.html    200    11264 
    23                 ***.novo.dk 1995-08-09 07:05:38  GET                /shuttle/countdown/liftoff.html    200     4665 
    26                 ***.novo.dk 1995-08-09 07:07:40  GET                 /shuttle/countdown/lps/fr.html    200     1879 
    29     001.msy4.communique.net 1995-08-30 06:55:47  GET                     /software/winvn/winv

### Menambahkan Session

In [19]:
import pandas as pd

# ============================================
# 1. Baca data filtered NASA
# ============================================
file_path = "filtered_nasa_wib.csv"
df = pd.read_csv(file_path, parse_dates=['time'])

# Tambahkan ID unik untuk mempercepat join nanti
df["row_id"] = df.index

# ============================================
# 2. Fungsi membuat sesi
# ============================================
def process_web_usage(df):
    df = df.sort_values(["host", "time"])
    result = []

    for host, group in df.groupby("host"):
        group = group.sort_values("time").reset_index(drop=True)

        sessions = []
        cur_session = []
        prev_time = None

        for _, row in group.iterrows():
            t = row["time"]
            url = row["url"]
            row_id = row["row_id"]

            if prev_time is None:
                cur_session.append((t, url, row_id))
            else:
                delta = (t - prev_time).total_seconds() / 60
                if delta > 20:
                    sessions.append(cur_session)
                    cur_session = [(t, url, row_id)]
                else:
                    cur_session.append((t, url, row_id))

            prev_time = t

        if cur_session:
            sessions.append(cur_session)

        result.append({"host": host, "sessions": sessions})

    return result

# ============================================
# 3. Ubah sessions
# ============================================
def sessions_to_table(processed, original_df):
    rows = []

    for item in processed:
        host = item["host"]

        for i, session in enumerate(item["sessions"], start=1):
            for t, url, row_id in session:
                rows.append({
                    "row_id": row_id,
                    "session": f"session {i}"
                })

    sessions_df = pd.DataFrame(rows)

    # Join cepat ke data asli
    final_df = sessions_df.merge(original_df, on="row_id", how="left")

    return final_df.drop(columns=["row_id"])

# ============================================
# 4. PROSES
# ============================================
processed = process_web_usage(df)
usage_table = sessions_to_table(processed, df)

# ============================================
# 5. Tampilkan & Simpan
# ============================================
print("\n=== TABEL SESI ===")
print(usage_table.head(10).to_string(index=False))

usage_table.to_csv("nasa_sessions.csv", index=False)
print("\n✔ File disimpan sebagai nasa_sessions.csv")


  df = pd.read_csv(file_path, parse_dates=['time'])



=== TABEL SESI ===
  session Unnamed: 0                    host                time method                                          url  response  bytes
session 1          0             ***.novo.dk 1995-07-11 12:17:09    GET                                    /ksc.html       200   7067
session 1          6             ***.novo.dk 1995-07-11 12:17:48    GET              /shuttle/missions/missions.html       200   8678
session 1         12             ***.novo.dk 1995-07-11 12:23:01    GET    /shuttle/resources/orbiters/columbia.html       200   6922
session 2         13             ***.novo.dk 1995-08-09 07:02:48    GET /shuttle/missions/sts-69/mission-sts-69.html       200  11264
session 2         23             ***.novo.dk 1995-08-09 07:05:38    GET              /shuttle/countdown/liftoff.html       200   4665
session 2         26             ***.novo.dk 1995-08-09 07:07:40    GET               /shuttle/countdown/lps/fr.html       200   1879
session 1         29 001.msy4.communique.n