In [1]:
import tarfile
import os
import pandas as pd
import json
import datetime
import pytz



## Built-in functions

In [2]:
# Convertig timestamp to date
def timestamp_to_date(timestamp, timezone):
    tz = pytz.timezone(timezone)
    datetime_obj = datetime.datetime.fromtimestamp(timestamp/1000, tz)
    formatted_datetime = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")
    return formatted_datetime


In [3]:
# For extracting tar file
def extract_tar(tar_file_path,target_directory):
          # Open the .tar.gz file
          with tarfile.open(tar_file_path, 'r:gz') as tar_ref:
                    # Extract all files in the .tar.gz archive to the target directory
                    tar_ref.extractall(target_directory)

          # Optional: List the extracted files
          with tarfile.open(tar_file_path, 'r:gz') as tar_ref:
                    extracted_files = tar_ref.getnames()
                    print("Extracted files:")
                    for file in extracted_files:
                              print(file)

In [4]:
# For changing extention to JSON
def change_extension(this_file):
    base = os.path.splitext(this_file)[0]
    new_file = base+"_changer"+".json"  # Change the extension to '.json'
    

In [5]:
# For reading all files in tree structure directory and extract all information of sessions in to DataFrame 
def process_files(directory, desired_extension,df,timezone):
    entries = os.listdir(directory)
     
    for entry in entries:
        full_path = os.path.join(directory, entry)
    
        if os.path.isfile(full_path) and entry.endswith(desired_extension):
            
            # Process the file with the desired extension
            change_extension(full_path)
            
            # Read the JSON file
            with open(full_path, 'r') as json_file:
                data = json.load(json_file)
                
                try :
                    last_index = df.index[-1]+1
                except :
                    last_index = 0
                    
        
                # Extract the desired information
                sessionId=data['data']['info']['user']['sessionId']
                user_name=data['data']['info']['user']['name']
                user_ip=data['data']['info']['user']['ip']
                service_name=data['data']['info']['service']['name']
                date_start=timestamp_to_date(int(data['data']['info']['record']['data']['shells']['1']['baseTimeStamp']),timezone)
                duration=int(data['data']['info']['record']['data']['shells']['1']['duration'])/1000
                date_finish=timestamp_to_date(int(data['data']['info']['record']['data']['shells']['1']['baseTimeStamp'])+duration*1000,timezone)
                downloadDataSize=data['data']['info']['record']['data']['shells']['1']['downloadDataSize']
                uploadDataSize=data['data']['info']['record']['data']['shells']['1']['uploadDataSize']
                
                
                
                df.loc[last_index]=[sessionId,user_name,user_ip,service_name,date_start,duration,date_finish
                                    ,downloadDataSize,uploadDataSize]
                
                
                print(df)
                
        elif os.path.isdir(full_path):
            # Recursively process files in subdirectories
            process_files(full_path, desired_extension,df,timezone)    

## Parte 1 : Extracting files 

In [6]:
# Specify the path to the .tar.gz file
tar_file_path = '/home/elmmalick/archive.tar.gz'
# Specify the target directory to extract the files
target_directory = '/home/elmmalick/archive_file'

extract_tar(tar_file_path,target_directory)

Extracted files:
var/rubycat-labs/storage/local/records/ssh
var/rubycat-labs/storage/local/records/ssh/2023
var/rubycat-labs/storage/local/records/ssh/2023/06
var/rubycat-labs/storage/local/records/ssh/2023/06/16
var/rubycat-labs/storage/local/records/ssh/2023/06/16/49c8ec5f-9bf0-4160-920d-8de6f8691933_20230616140623138249/
var/rubycat-labs/storage/local/records/ssh/2023/06/16/49c8ec5f-9bf0-4160-920d-8de6f8691933_20230616140623138249/49c8ec5f-9bf0-4160-920d-8de6f8691933.1.shell.001
var/rubycat-labs/storage/local/records/ssh/2023/06/16/49c8ec5f-9bf0-4160-920d-8de6f8691933_20230616140623138249/49c8ec5f-9bf0-4160-920d-8de6f8691933.metadata.bak
var/rubycat-labs/storage/local/records/ssh/2023/06/16/49c8ec5f-9bf0-4160-920d-8de6f8691933_20230616140623138249/49c8ec5f-9bf0-4160-920d-8de6f8691933.metadata
var/rubycat-labs/storage/local/records/ssh/2023/06/16/3fa41eda-a488-4f9a-a919-929d52072527_20230616140542880387/
var/rubycat-labs/storage/local/records/ssh/2023/06/16/3fa41eda-a488-4f9a-a919-92

## Partie 2 : Data extracting to Dataframe

In [7]:
timezone = 'Europe/Paris'

# Set the root directory
root_directory = '/home/elmmalick/archive_file/var/rubycat-labs/storage/local/records/ssh'
# Set the desired file extension to read
desired_extension = '.metadata'

# Start processing files from the root directory
# DataFrame of sessions
df_sessions=pd.DataFrame([],columns=['sessionId','user_name','user_ip','service_name','date_start','duration','date_finish','downloadDataSize','uploadDataSize'])
process_files(root_directory,desired_extension,df_sessions,timezone)




df_sessions.head(17)


                              sessionId user_name          user_ip  \
0  4dcee4b9-ccf8-4555-bab2-fc6d4bf0f818     user2  192.168.122.167   

  service_name           date_start  duration          date_finish  \
0     service2  2023-06-16 16:10:13  1686.372  2023-06-16 16:38:19   

  downloadDataSize uploadDataSize  
0            10294              0  
                              sessionId user_name          user_ip  \
0  4dcee4b9-ccf8-4555-bab2-fc6d4bf0f818     user2  192.168.122.167   
1  49c8ec5f-9bf0-4160-920d-8de6f8691933     user3  192.168.122.167   

  service_name           date_start  duration          date_finish  \
0     service2  2023-06-16 16:10:13  1686.372  2023-06-16 16:38:19   
1     service2  2023-06-16 16:06:23  3605.363  2023-06-16 17:06:28   

  downloadDataSize uploadDataSize  
0            10294              0  
1            20534              0  
                              sessionId user_name          user_ip  \
0  4dcee4b9-ccf8-4555-bab2-fc6d4bf0f818     us

Unnamed: 0,sessionId,user_name,user_ip,service_name,date_start,duration,date_finish,downloadDataSize,uploadDataSize
0,4dcee4b9-ccf8-4555-bab2-fc6d4bf0f818,user2,192.168.122.167,service2,2023-06-16 16:10:13,1686.372,2023-06-16 16:38:19,10294,0
1,49c8ec5f-9bf0-4160-920d-8de6f8691933,user3,192.168.122.167,service2,2023-06-16 16:06:23,3605.363,2023-06-16 17:06:28,20534,0
2,f5b84fc4-cc33-4803-be0f-280fc075cb06,user2,192.168.122.167,service2,2023-06-16 14:33:16,564.134,2023-06-16 14:42:40,13568,524
3,3fa41eda-a488-4f9a-a919-929d52072527,user1,192.168.122.167,service2,2023-06-16 16:05:43,1935.494,2023-06-16 16:37:59,11662,1
4,0653152d-cf57-4eb3-8a08-bf126a529e38,user1,192.168.122.167,service2,2023-06-16 14:39:20,299.795,2023-06-16 14:44:20,10120,372
5,20a0b7a8-0b1a-4dc6-9e37-5de7b6982069,user4,192.168.122.167,service2,2023-06-16 14:45:16,4042.529,2023-06-16 15:52:38,34390,709
6,2f89715b-cc59-4809-90e9-e6049960f493,user4,192.168.122.167,service2,2023-06-16 16:06:43,3605.605,2023-06-16 17:06:48,20534,0
7,60b7eca5-9e1f-42a8-a01c-87acecdebdbe,user1,192.168.122.167,service2,2023-06-16 17:37:32,104.1,2023-06-16 17:39:16,5956,97
8,bfcd2a7c-a9b7-47b4-9988-493a3ee977b2,proveitadmin,192.168.122.167,service2,2023-06-16 16:07:37,3606.008,2023-06-16 17:07:43,20534,0
9,ec5bdea8-b610-4a08-9d41-b76d60e4e6f8,user2,192.168.122.167,service2,2023-06-16 16:06:04,3609.395,2023-06-16 17:06:13,20534,0


In [8]:
df_sessions.shape

(16, 9)