In [74]:
import re
import csv
from collections import defaultdict



In [75]:
from striprtf.striprtf import rtf_to_text

def read_rtf_log_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        rtf_content = file.read()
        plain_text = rtf_to_text(rtf_content)
    return plain_text.splitlines()

# Example usage
if __name__ == "__main__":
    log_file_path = "sample.log.rtf"  # Replace with your .rtf file path
    log_lines = read_rtf_log_file(log_file_path)

    # Print the first few lines to verify
    for line in log_lines[:5]:
        print(line)


192.168.1.1 - - [03/Dec/2024:10:12:34 +0000] "GET /home HTTP/1.1" 200 512
203.0.113.5 - - [03/Dec/2024:10:12:35 +0000] "POST /login HTTP/1.1" 401 128 "Invalid credentials"
10.0.0.2 - - [03/Dec/2024:10:12:36 +0000] "GET /about HTTP/1.1" 200 256
192.168.1.1 - - [03/Dec/2024:10:12:37 +0000] "GET /contact HTTP/1.1" 200 312
198.51.100.23 - - [03/Dec/2024:10:12:38 +0000] "POST /register HTTP/1.1" 200 128


In [61]:
import re

def parse_logs(log_lines):
    # Regex patterns to extract IP, endpoint, and status code
    ip_pattern = r'^(\S+)'  # Matches the IP address at the beginning of the line
    endpoint_pattern = r'"(?:GET|POST) (\S+)'  # Matches the requested endpoint after the HTTP method
    status_code_pattern = r'HTTP/\S+" (\d{3})'  # Matches the status code after HTTP method

    log_data = []
    for line in log_lines:
        # Using regex to find IP, endpoint, and status code
        ip_match = re.search(ip_pattern, line)
        endpoint_match = re.search(endpoint_pattern, line)
        status_code_match = re.search(status_code_pattern, line)

        # If all necessary information is found, append it to the log_data list
        if ip_match and endpoint_match and status_code_match:
            ip = ip_match.group(1)
            endpoint = endpoint_match.group(1)
            status_code = int(status_code_match.group(1))
            log_data.append((ip, endpoint, status_code))

    return log_data

# Example usage
if __name__ == "__main__":
    # Sample log lines from the provided log file (assuming it's already read into `log_lines`)
    log_lines = [
        "192.168.1.1 - - [03/Dec/2024:10:12:34 +0000] \"GET /home HTTP/1.1\" 200 512",
        "203.0.113.5 - - [03/Dec/2024:10:12:35 +0000] \"POST /login HTTP/1.1\" 401 128 \"Invalid credentials\"",
        "10.0.0.2 - - [03/Dec/2024:10:12:36 +0000] \"GET /about HTTP/1.1\" 200 256",
        # ... Add more log lines as needed
    ]

    parsed_data = parse_logs(log_lines)
    for entry in parsed_data:
        print(f"IP: {entry[0]}, Endpoint: {entry[1]}, Status Code: {entry[2]}")


IP: 192.168.1.1, Endpoint: /home, Status Code: 200
IP: 203.0.113.5, Endpoint: /login, Status Code: 401
IP: 10.0.0.2, Endpoint: /about, Status Code: 200


In [62]:
import csv
from collections import defaultdict

def count_requests_per_ip(log_data):
    ip_requests = defaultdict(int)

    # Count the requests for each IP
    for ip, _, _ in log_data:
        ip_requests[ip] += 1

    # Sort the results by the number of requests in descending order
    sorted_ip_requests = sorted(ip_requests.items(), key=lambda x: x[1], reverse=True)

    # Display the results in a formatted way
    print(f"{'IP Address':<20}{'Request Count'}")
    print("-" * 40)
    for ip, count in sorted_ip_requests:
        print(f"{ip:<20}{count}")

    # Save the results to a CSV file
    with open('log_analysis_results.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['IP Address', 'Request Count'])  # Header row
        for ip, count in sorted_ip_requests:
            writer.writerow([ip, count])

# Example usage
if __name__ == "__main__":
    # Sample log_data for testing
    log_data = [
        ('192.168.1.1', '/home', 200),
        ('203.0.113.5', '/login', 401),
        ('192.168.1.1', '/about', 200),
        ('192.168.1.1', '/contact', 200),
        ('203.0.113.5', '/login', 401),
        ('10.0.0.2', '/about', 200),
        ('192.168.1.1', '/profile', 200),
    ]

    count_requests_per_ip(log_data)


IP Address          Request Count
----------------------------------------
192.168.1.1         4
203.0.113.5         2
10.0.0.2            1


In [63]:
def detect_suspicious_activity(log_data, threshold=10):
    # Dictionary to track failed login attempts by IP
    failed_attempts = defaultdict(int)

    # Loop through each log entry and check for 401 status (failed login attempts)
    for ip, _, status_code in log_data:
        if status_code == 401:
            failed_attempts[ip] += 1

    # Filter IPs that have exceeded the threshold
    suspicious_ips = [(ip, count) for ip, count in failed_attempts.items() if count > threshold]

    # Return the list of suspicious IPs
    return suspicious_ips

def display_suspicious_activity(suspicious_ips):
    # Display suspicious IPs in a readable format
    if suspicious_ips:
        print(f"\n{'Suspicious Activity Detected:'}")
        print(f"{'IP Address':<20}{'Failed Login Attempts'}")
        print("-" * 40)
        for ip, count in suspicious_ips:
            print(f"{ip:<20}{count}")
    else:
        print("\nNo suspicious activity detected.")

def main():
    # Sample log_data for testing
    log_data = [
        ('192.168.1.1', '/home', 200),
        ('203.0.113.5', '/login', 401),
        ('192.168.1.1', '/about', 200),
        ('192.168.1.1', '/contact', 200),
        ('203.0.113.5', '/login', 401),
        ('10.0.0.2', '/about', 200),
        ('192.168.1.1', '/profile', 200),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
        ('192.168.1.100', '/login', 401),
    ]

    # Get suspicious activity based on the threshold
    suspicious_ips = detect_suspicious_activity(log_data, threshold=5)

    # Display suspicious activity results
    display_suspicious_activity(suspicious_ips)

if __name__ == "__main__":
    main()



Suspicious Activity Detected:
IP Address          Failed Login Attempts
----------------------------------------
192.168.1.100       9


In [64]:
log_data = parse_logs(log_lines)
print("Log Data:", log_data)  # Print to check its structure
ip_data = count_requests_per_ip(log_data)


Log Data: [('192.168.1.1', '/home', 200), ('203.0.113.5', '/login', 401), ('10.0.0.2', '/about', 200)]
IP Address          Request Count
----------------------------------------
192.168.1.1         1
203.0.113.5         1
10.0.0.2            1


In [65]:

def parse_logs(log_lines):
    ip_pattern = r'^(\S+)'  # Matches the IP address
    endpoint_pattern = r'"(?:GET|POST) (\S+)'  # Matches the requested endpoint
    status_code_pattern = r'HTTP/1.1" (\d{3})'  # Matches the status code

    log_data = []
    for line in log_lines:
        ip_match = re.search(ip_pattern, line)
        endpoint_match = re.search(endpoint_pattern, line)
        status_code_match = re.search(status_code_pattern, line)

        if ip_match and endpoint_match and status_code_match:
            ip = ip_match.group(1)
            endpoint = endpoint_match.group(1)
            status_code = int(status_code_match.group(1))
            if ip and endpoint and status_code:
                log_data.append((ip, endpoint, status_code))
        else:
            # Debugging: Log lines that don't match expected format
            print("Skipping invalid log line:", line)
    return log_data

In [66]:
def count_requests_per_ip(log_data):
    ip_requests = defaultdict(int)
    for ip, _, _ in log_data:
        ip_requests[ip] += 1
    # Debugging: Check the resulting IP requests
    print("IP Requests Count:", ip_requests)
    return sorted(ip_requests.items(), key=lambda x: x[1], reverse=True)

In [67]:
def most_frequent_endpoint(log_data):
    endpoint_requests = defaultdict(int)
    for _, endpoint, _ in log_data:
        endpoint_requests[endpoint] += 1
    most_accessed = max(endpoint_requests.items(), key=lambda x: x[1])
    return most_accessed

In [68]:
def detect_suspicious_activity(log_data, threshold=10):
    failed_attempts = defaultdict(int)
    for ip, _, status_code in log_data:
        if status_code == 401:
            failed_attempts[ip] += 1
    return [(ip, count) for ip, count in failed_attempts.items() if count > threshold]

In [69]:
def save_results_to_csv(file_path, ip_data, endpoint_data, suspicious_data):
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)

        # Requests per IP
        writer.writerow(["Requests per IP"])
        writer.writerow(["IP Address", "Request Count"])
        writer.writerows(ip_data)
        writer.writerow([])

        # Most Accessed Endpoint
        writer.writerow(["Most Accessed Endpoint"])
        writer.writerow(["Endpoint", "Access Count"])
        writer.writerow([endpoint_data[0], endpoint_data[1]])  # endpoint and access count as row
        writer.writerow([])

        # Suspicious Activity
        writer.writerow(["Suspicious Activity"])
        writer.writerow(["IP Address", "Failed Login Count"])
        writer.writerows(suspicious_data)


In [73]:
# Step 2: Parse log lines to extract relevant information
def parse_logs(log_lines):
    ip_pattern = r'^(\S+)'  # Matches the IP address
    endpoint_pattern = r'"(?:GET|POST) (\S+)'  # Matches the requested endpoint
    status_code_pattern = r'HTTP/1.1" (\d{3})'  # Matches the status code

    log_data = []
    for line in log_lines:
        ip_match = re.search(ip_pattern, line)
        endpoint_match = re.search(endpoint_pattern, line)
        status_code_match = re.search(status_code_pattern, line)

        if ip_match and endpoint_match and status_code_match:
            ip = ip_match.group(1)
            endpoint = endpoint_match.group(1)
            status_code = int(status_code_match.group(1))
            if ip and endpoint and status_code:
                log_data.append((ip, endpoint, status_code))
        else:
            # Debugging: Log lines that don't match expected format
            print("Skipping invalid log line:", line)
    return log_data

# Step 3: Count requests per IP address
def count_requests_per_ip(log_data):
    ip_requests = defaultdict(int)
    for ip, _, _ in log_data:
        ip_requests[ip] += 1
    # Debugging: Check the resulting IP requests
    print("IP Requests Count:", ip_requests)
    return sorted(ip_requests.items(), key=lambda x: x[1], reverse=True)

# Step 4: Identify the most accessed endpoint
def most_frequent_endpoint(log_data):
    endpoint_requests = defaultdict(int)
    for _, endpoint, _ in log_data:
        endpoint_requests[endpoint] += 1
    most_accessed = max(endpoint_requests.items(), key=lambda x: x[1])
    return most_accessed

# Step 5: Detect suspicious activity
def detect_suspicious_activity(log_data, threshold=10):
    failed_attempts = defaultdict(int)
    for ip, _, status_code in log_data:
        if status_code == 401:
            failed_attempts[ip] += 1
    return [(ip, count) for ip, count in failed_attempts.items() if count > threshold]

# Step 6: Save results to a CSV file
def save_results_to_csv(file_path, ip_data, endpoint_data, suspicious_data):
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)

        # Requests per IP
        writer.writerow(["Requests per IP"])
        writer.writerow(["IP Address", "Request Count"])
        writer.writerows(ip_data)
        writer.writerow([])

        # Most Accessed Endpoint
        writer.writerow(["Most Accessed Endpoint"])
        writer.writerow(["Endpoint", "Access Count"])
        writer.writerow([endpoint_data[0], endpoint_data[1]])  # endpoint and access count as row
        writer.writerow([])

        # Suspicious Activity
        writer.writerow(["Suspicious Activity"])
        writer.writerow(["IP Address", "Failed Login Count"])
        writer.writerows(suspicious_data)

# Main function to drive the process
def main():
    log_file_path = "sample.log.rtf"
    output_file_path = "log_analysis_results.csv"

    # Read and parse logs
    log_lines = read_log_file(log_file_path)
    log_data = parse_logs(log_lines)

    # Debugging: Check the parsed log data
    print("Parsed log data:", log_data)

    # Perform analysis
    if log_data:
        ip_data = count_requests_per_ip(log_data)
        endpoint_data = most_frequent_endpoint(log_data)
        suspicious_data = detect_suspicious_activity(log_data)

        # Display results
        print("Requests per IP Address:")
        if ip_data:
            for ip, count in ip_data:
                print(f"{ip:<20} {count}")
        else:
            print("No IP data available.")

        print("\nMost Frequently Accessed Endpoint:")
        if endpoint_data:
            print(f"{endpoint_data[0]} (Accessed {endpoint_data[1]} times)")
        else:
            print("No accessed endpoint data available.")

        print("\nSuspicious Activity Detected:")
        if suspicious_data:
            for ip, count in suspicious_data:
                print(f"{ip:<20} {count}")
        else:
            print("No suspicious activity detected.")

        # Save results to CSV
        save_results_to_csv(output_file_path, ip_data, endpoint_data, suspicious_data)
        print(f"\nResults saved to {output_file_path}")
    else:
        print("No valid log data available.")

if __name__ == "__main__":
    main()


Skipping invalid log line: {\rtf1\ansi\ansicpg1252\cocoartf2813

Skipping invalid log line: \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}

Skipping invalid log line: {\colortbl;\red255\green255\blue255;}

Skipping invalid log line: {\*\expandedcolortbl;;}

Skipping invalid log line: \paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0

Skipping invalid log line: \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

Skipping invalid log line: 

Parsed log data: [('\\f0\\fs24', '/home', 200), ('203.0.113.5', '/login', 401), ('10.0.0.2', '/about', 200), ('192.168.1.1', '/contact', 200), ('198.51.100.23', '/register', 200), ('203.0.113.5', '/login', 401), ('192.168.1.100', '/login', 401), ('10.0.0.2', '/dashboard', 200), ('198.51.100.23', '/about', 200), ('192.168.1.1', '/dashboard', 200), ('203.0.113.5', '/login', 401), ('203.0.113.5', '/login', 401), ('192.168.1.100', '