# Analisis Log Hadoop

Proyek analisis log sistem Hadoop untuk mengekstrak insight operasional.

## 1. Konfigurasi dan Setup

In [1]:
import os
import glob
import pandas as pd
from matplotlib import pyplot as plt

## 2. Ekstraksi Data


In [2]:
# Location of the Hadoop log dataset (a relative path).
path = 'dataset/hadoop/'

In [3]:
# Collect every *.log file under `path`, including nested subdirectories.
# glob returns matches in a filesystem-dependent order, so sort them to keep
# the order of the loaded log lines reproducible across runs and machines.
files = sorted(glob.glob(os.path.join(path, '**/*.log'), recursive=True))
#print(f"File log yang ditemukan: {files}")

### Gunakan Semua File

In [5]:
def get_raw_logs(files):
    """
    Collect every non-blank line from the given log files.

    Files are decoded as UTF-8 regardless of the platform default encoding,
    and undecodable bytes are replaced with U+FFFD so that a single bad byte
    does not abort the whole load with UnicodeDecodeError.

    Args:
        files (list): Paths of the log files to read, processed in order.

    Returns:
        list: Lines with surrounding whitespace stripped, in file order.
            Lines that are empty after stripping are skipped.
    """
    raw_logs = []

    for file in files:
        with open(file, 'r', encoding='utf-8', errors='replace') as f:
            # Strip first so that whitespace-only lines are dropped as well.
            stripped_lines = (line.strip() for line in f)
            raw_logs.extend(line for line in stripped_lines if line)
    return raw_logs

# Load the stripped, non-blank lines of all files in `files`.
raw_logs =  get_raw_logs(files)

In [7]:
def parse_logs(raw_logs):
    """
    Parse raw log lines into separate columns: time, level, thread,
    log_class and message.

    Each line is expected to look like
    ``<date> <time> <LEVEL> [<thread>] <logger class> <message>``.
    The bracketed thread field is consumed as a whole, so thread names that
    contain spaces (e.g. ``[RMCommunicator Allocator]``) do not shift the
    logger class and message columns.

    Lines with too few fields are skipped. No timestamp validation is done
    here: a line that does not start with a timestamp but has enough fields
    still yields a row.

    Args:
        raw_logs (list): Raw log lines to parse.

    Returns:
        tuple: Five lists of equal length:
            - time (list): "<date> <time>" string of each line.
            - level (list): Log level (INFO, ERROR, ...).
            - thread (list): Thread name found inside [ ], or None.
            - log_class (list): Logger class token (trailing colon kept).
            - message (list): Log message, "" when the line has none.
    """
    time = []
    level = []
    thread = []
    log_class = []
    message = []

    for line in raw_logs:
        try:
            fields = _split_log_line(line)
        except (AttributeError, TypeError) as e:
            # Non-string entries cannot be split; report them and keep going.
            print(f"Baris gagal diparse:\n{line}\nError: {e}")
            continue

        if fields is None:
            continue

        time_str, level_str, thread_str, log_class_str, msg = fields
        time.append(time_str)
        level.append(level_str)
        thread.append(thread_str)
        log_class.append(log_class_str)
        message.append(msg)

    return time, level, thread, log_class, message


def _split_log_line(line):
    """Split one line into (time, level, thread, log_class, message); None if too short."""
    head = line.split(' ', 3)
    if len(head) < 4:
        return None
    date_str, clock_str, level_str, rest = head

    closing = rest.find(']')
    if rest.startswith('[') and closing != -1:
        # Standard layout: the thread field is everything between the
        # brackets, spaces included.
        thread_str = rest[1:closing]
        remainder = rest[closing + 1:].lstrip(' ')
        if not remainder:
            return None
    else:
        # No leading thread field: keep the positional layout (the 4th token
        # is skipped, the 5th is the class) and look for brackets anywhere.
        _, separator, remainder = rest.partition(' ')
        if not separator:
            return None
        if '[' in line and ']' in line:
            thread_str = line.split('[')[1].split(']')[0]
        else:
            thread_str = None

    # Whatever follows the class token is the message; empty when absent.
    class_str, _, msg = remainder.partition(' ')
    return (
        date_str.strip() + ' ' + clock_str.strip(),
        level_str.strip(),
        thread_str,
        class_str.strip(),
        msg.strip(),
    )

# Parse the raw lines; the five column lists stay available at module level.
time, level, thread, log_class, message = parse_logs(raw_logs)

# Build the DataFrame, convert the time column (values that cannot be parsed
# become NaT) and drop the rows whose timestamp is missing.
df = (
    pd.DataFrame({
        'time': time,
        'level': level,
        'thread': thread,
        'log_class': log_class,
        'message': message,
    })
    .assign(
        time=lambda frame: pd.to_datetime(
            frame['time'], format='mixed', errors='coerce'
        )
    )
    .dropna(subset=['time'])
)

# Show the first 5 rows
print(df.head())

                 time level thread  \
0 2015-10-17 15:37:56  INFO   main   
1 2015-10-17 15:37:56  INFO   main   
2 2015-10-17 15:37:56  INFO   main   
3 2015-10-17 15:37:57  INFO   main   
4 2015-10-17 15:37:57  INFO   main   

                                         log_class  \
0  org.apache.hadoop.mapreduce.v2.app.MRAppMaster:   
1  org.apache.hadoop.mapreduce.v2.app.MRAppMaster:   
2  org.apache.hadoop.mapreduce.v2.app.MRAppMaster:   
3  org.apache.hadoop.mapreduce.v2.app.MRAppMaster:   
4  org.apache.hadoop.mapreduce.v2.app.MRAppMaster:   

                                             message  
0  Created MRAppMaster for application appattempt...  
1                             Executing with tokens:  
2  Kind: YARN_AM_RM_TOKEN, Service: , Ident: (app...  
3                      Using mapred newApiCommitter.  
4                 OutputCommitter set in config null  


### Gunakan Sample

In [None]:
# Misalkan file log pertama yang ditemukan
#log_file = files[1]

In [None]:
"""
# Buka dan baca beberapa baris pertama untuk melihat format file log
with open(log_file, 'r') as f:
    # Baca 10 baris pertama dari file log
    sample_lines = [f.readline() for _ in range(10)]

# Tampilkan baris pertama
#print("Contoh beberapa baris pertama dari file log:")
#for line in sample_lines:
#    print(line.strip())

time = []
level = []
thread = []
log_class = []
message = []
for line in sample_lines:
    parts = line.split(" ")
    time.append(parts[0].strip() + ' ' + parts[1].strip())
    level.append(parts[2].strip())
    thread.append(line.split('[')[1].split(']')[0])
    log_class.append(parts[4].strip())
    message.append(line.split(' ',5)[-1])

df = pd.DataFrame({
    'time': time,
    'level': level,
    'thread': thread,
    'log_class': log_class,
    'message': message
})

df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S,%f')


df.head()
"""

# 2. Transform

In [None]:
# Add a 'date' column holding only the calendar date of each timestamp.
df['date'] = df['time'].dt.date

In [None]:
# Count the log rows of each date; the count column is named 'jumlah_log'.
aktivitas_per_hari = (
    df.groupby(['date'])
    .size()
    .rename('jumlah_log')
    .reset_index()
)
aktivitas_per_hari

In [None]:
# Count the log rows of each (date, level) pair; the count column is named
# 'jumlah_log'.
aktivitas_level_per_hari = (
    df.groupby(['date', 'level'])
    .size()
    .rename('jumlah_log')
    .reset_index()
)
aktivitas_level_per_hari

# 3. Load

In [None]:
# Write both daily summaries to CSV files, leaving out the index column.
for _nama_file, _tabel in (
    ("aktivitas_per_hari.csv", aktivitas_per_hari),
    ("aktivitas_per_hari_berdasarkan_level.csv", aktivitas_level_per_hari),
):
    _tabel.to_csv(_nama_file, index=False)

# 4. Visualize

#### 

In [None]:
# Bar chart of the number of log rows per date.
ax = aktivitas_per_hari.plot(x='date', y='jumlah_log', kind='bar', figsize=(10,5))
ax.set_title("Jumlah Log Aktivitas per Hari")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Jumlah Log")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Reshape to one row per date and one column per level; combinations that
# never occur get a count of 0.
pivot = aktivitas_level_per_hari.pivot(
    index='date', columns='level', values='jumlah_log'
).fillna(0)

# Stacked bar chart of the per-level counts for each date.
ax = pivot.plot(kind='bar', stacked=True, figsize=(10,6))
ax.set_title("Log Level per Hari")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Jumlah Log")
plt.xticks(rotation=45)
ax.legend(title="Level")
plt.tight_layout()
plt.show()


In [None]:
# Keep only the rows whose level is not 'INFO'
bukan_info = aktivitas_level_per_hari['level'].ne('INFO')
filtered_df = aktivitas_level_per_hari.loc[bukan_info]

# Reshape the filtered counts to one row per date and one column per level
pivot = filtered_df.pivot(
    index='date', columns='level', values='jumlah_log'
).fillna(0)

# Stacked bar chart of the remaining levels
ax = pivot.plot(kind='bar', stacked=True, figsize=(10,6))
ax.set_title("Log Level per Hari (Tanpa INFO)")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Jumlah Log")
plt.xticks(rotation=45)
ax.legend(title="Level")
plt.tight_layout()
plt.show()


### Melihat Jumlah Log per Level

In [None]:
# Count how many log rows each level has.
df['level'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of log rows for each level.
jumlah_per_level = df['level'].value_counts()
ax = jumlah_per_level.plot(kind='bar')
ax.set_title('Jumlah Log per Level')
ax.set_xlabel('Level')
ax.set_ylabel('Jumlah')
plt.show()

### Melihat Aktivitas Log

In [None]:
# Inspect the Python type of the first timestamp. Select it by position:
# df.time[0] is a label lookup, and after rows with unparseable times have
# been dropped the index label 0 may no longer exist (KeyError).
type(df['time'].iloc[0])

In [None]:
# Print the number of distinct hours (timestamps floored to the hour) that
# appear in the log.
print(df['time'].dt.floor('h').nunique())

In [None]:
# Line chart of the number of log rows in each hourly bin.
log_per_jam = df.set_index('time').resample('h').size()
ax = log_per_jam.plot(marker='o')
ax.set_title('Aktivitas Log per Jam')
ax.set_xlabel('Waktu')
ax.set_ylabel('Jumlah Log')
plt.show()