In [1]:
#NB: Kaggle requires phone verification to use the internet or a GPU. If you haven't done that yet, the cell below will fail
#    This code is only here to check that your internet is enabled. It doesn't do anything else.
#    Here's a help thread on getting your phone number verified: https://www.kaggle.com/product-feedback/135367

import socket,warnings
# Connectivity probe: try to open a TCP connection to a public DNS resolver
# (1.1.1.1, port 53) and stop the notebook with a clear message if that fails.
# create_connection applies the 1-second timeout to this one socket only, so the
# process-wide default timeout is left untouched for later network calls, and
# the `with` block guarantees the probe socket is closed.
try:
    with socket.create_connection(('1.1.1.1', 53), timeout=1):
        pass
except socket.error as ex:
    raise Exception("STOP: No internet. Click '>|' in top right and set 'Internet' switch to on") from ex

# May need to install the library. The notebook imports `fastai`, so install that
# package (the legacy `fastai2` package provides a differently named module).
# `%pip` installs into the environment of the running kernel.
%pip install --quiet fastai

In [2]:
import os
import pandas as pd
from fastai.text.all import *
# library installation, if needed, is handled by the pip command in the first cell

# function to truncate input files to save memory
def truncate_string(string, max_length):
    """Return `string` cut down to its first `max_length` items.

    A value whose length does not exceed `max_length` is returned unchanged.
    """
    too_long = len(string) > max_length
    return string[:max_length] if too_long else string

# Define the root directory with the data folders
#root_directory = '/kaggle/input/' #kaggle lowercases datastore name
root_directory = '/kaggle/input/logdataset2/logDeviceTypesVerySmall/'

# Number of leading characters kept from each file (the rest is dropped to save memory)
MAX_TEXT_CHARS = 2000

# Initialize empty lists to store data
texts = []
labels = []

# Walk through the directories and read files. Use folder name as classifier label.
# os.walk returns entries in an arbitrary, filesystem-dependent order, so both the
# sub-directories and the files are sorted: this keeps the row order of the
# DataFrame (and therefore any seeded random split made from it) reproducible.
for root, dirs, files in os.walk(root_directory):
    dirs.sort()  # sorting in place also fixes the order in which os.walk descends
    label = os.path.basename(root)

    for file in sorted(files):
        file_path = os.path.join(root, file)

        # open binary in case there is unexpected format in files so loading continues
        with open(file_path, 'rb') as f:
            raw_bytes = f.read()

        # decode in exception block; skip both text and label if data can't be decoded
        try:
            decoded = raw_bytes.decode('utf-8')
        except UnicodeDecodeError:
            print('warning - unable to decode ',file_path)
            continue

        texts.append(truncate_string(decoded, MAX_TEXT_CHARS))
        labels.append(label)

# Create a DataFrame using text and label lists
data = pd.DataFrame({'text': texts, 'label': labels})

# print descriptive info
data.describe()

Unnamed: 0,text,label
count,841,841
unique,841,10
top,"#!/bin/bash\n\n# Check if the correct number of arguments are provided\nif [ $# -ne 2 ]; then\n echo ""Usage: $0 <directory> <prefix>""\n exit 1\nfi\n\n# Get directory and prefix from command-line arguments\ndirectory=""$1""\nprefix=""$2""\n\n# Navigate to the directory\ncd ""$directory"" || exit\n\n# Use a loop to rename files with the prefix\nfor file in *; do\n if [ -f ""$file"" ]; then # Check if it's a regular file\n newname=""${prefix}${file}""\n mv ""$file"" ""$newname""\n echo ""Renamed: $file -> $newname""\n fi\ndone\n\n���������������������������������������������...",SSH
freq,1,215


In [3]:
from fastai.text.all import *

# Describe how to turn the DataFrame into model inputs and targets:
#   - input : the 'text' column, processed by a TextBlock with seq_len=72
#   - target: the 'label' column, treated as a category
#   - split : 10% of the rows held out at random for validation (fixed seed)
text_block = TextBlock.from_df('text', seq_len=72)
holdout_splitter = RandomSplitter(valid_pct=0.1, seed=42)

imdb_clas = DataBlock(
    blocks=(text_block, CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('label'),
    splitter=holdout_splitter,
)

# Build the dataloaders (batch size 64) and preview two samples
dls = imdb_clas.dataloaders(data, bs=64)
dls.show_batch(max_n=2)

Unnamed: 0,text,category
0,xxbos [ we d xxmaj jan 25 01:20:15 2006 ] [ error ] [ client 69.64.38.143 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlsrv \n [ we d xxmaj jan 25 01:20:15 2006 ] [ error ] [ client 69.64.38.143 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlsrv \n [ we d xxmaj jan 25 01:20:15 2006 ] [ error ] [ client 69.64.38.143 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlsrv \n [ we d xxmaj jan 25 01:20:15 2006 ] [ error ] [ client 69.64.38.143 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlsrv \n [ we d xxmaj jan 25 01:20:15 2006 ] [ error ] [ client 69.64.38.143 ] xxmaj,Apache
1,xxbos [ we d xxmaj nov 09 04:44:17 2005 ] [ error ] [ client 85.214.16.149 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlrpc \n [ we d xxmaj nov 09 04:44:17 2005 ] [ error ] [ client 85.214.16.149 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlrpc \n [ we d xxmaj nov 09 04:44:17 2005 ] [ error ] [ client 85.214.16.149 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlrpc \n script not found or unable to stat \n script not found or unable to stat \n [ we d xxmaj nov 09 04:44:17 2005 ] [ error ] [ client 85.214.16.149 ] xxmaj file does not exist : / var / xxrep 3 w / html / xmlrpc \n [,Apache


In [4]:
# function to print prediction in pretty format
def printPrediction(prediction):
    """Print a one-line, human-readable summary of a prediction.

    `prediction` is unpacked as exactly three items: the predicted class,
    an item that is ignored, and an object whose ``max()`` gives the
    probability that is printed with four decimal places.
    """
    predicted_class, _, class_probs = prediction
    top_probability = class_probs.max()
    print(f"This is a {predicted_class} file with probability {top_probability:.4f}")

In [None]:
# Build a text classifier (AWD_LSTM architecture) on the dataloaders, tracking accuracy
learn = text_classifier_learner(
    dls,
    AWD_LSTM,
    metrics=accuracy,
    drop_mult=0.5,
)

# Fine-tune for 8 epochs with a base learning rate of 1e-2, then checkpoint.
# The checkpoint name '1epoch' is only a label; a later cell loads it by this name.
learn.fine_tune(epochs=8, base_lr=1e-2)
learn.save('1epoch')
learn.show_results()

epoch,train_loss,valid_loss,accuracy,time


In [None]:
# Load the checkpoint saved under the name '1epoch' back into the existing learner.
# This cell needs `learn` to exist already (it is created in the training cell).
learn = learn.load('1epoch')

# Test string: a short excerpt of a Linux system log, used as a sample input
testStr = '''Sep 28 09:10:51 combo kernel: hdc: SAMSUNG CD-ROM SN-124, ATAPI CD/DVD-ROM drive
Sep 28 09:10:51 combo kernel: hdc: Disabling (U)DMA for SAMSUNG CD-ROM SN-124 (blacklisted)
Sep 28 09:10:51 combo kernel: ide1 at 0x170-0x177,0x376 on irq 15
Sep 28 09:10:51 combo kernel: hda: max request size: 128KiB
Sep 28 09:10:51 combo netfs: Mounting other filesystems:  succeeded
Sep 28 09:10:51 combo kernel: hda: 29336832 sectors (15020 MB) w/1916KiB Cache, CHS=29104/16/63, UDMA(66)
Sep 28 09:10:51 combo kernel:  hda: hda1 hda2 hda3
Sep 28 09:10:51 combo kernel: hdc: ATAPI 24X CD-ROM drive, 128kB Cache
Sep 28 09:10:51 combo kernel: Uniform CD-ROM driver Revision: 3.20
Sep 28 09:10:51 combo kernel: ide-floppy driver 0.99.newide
Sep 28 09:10:51 combo kernel: usbcore: registered new driver hiddev
Sep 28 09:10:51 combo apmd[1720]: Version 3.0.2 (APM BIOS 1.2, Linux driver 1.16ac)
'''

# Classify the sample, print the one-line summary via printPrediction,
# then leave the raw prediction as the last expression so the cell displays it.
prediction = learn.predict(testStr)
printPrediction(prediction)
print()
prediction