##### Pre-Processing of Log files

We process txt log files from VR sessions and convert them to CSV files

In [None]:
import pandas as pd
import glob

In [None]:
txt_files = glob.glob("Logs_ekeel/*.txt")  # just replace the directory since we are loading all the files
# txt_files

In [None]:
for file in txt_files:

    col_names = ['Timestamp', 'Action', 'ActionID', 'Direction']   # change headers if any additions in the logs
    df = pd.read_csv(file, delimiter=",", header=None, names=col_names)  
    
    
    csv_file = file.replace(".txt", ".csv")
    
    # Save as CSV in the same folder
    df.to_csv(csv_file, index=False)


In [None]:
csv_files = glob.glob("Logs_ekeel/*.csv")  
# csv_files

##### Feature Extraction from Log Files for Learning Analytics

In the next section, we process data from CSV log files and extract meaningful features.
Each log file represents a session, and the session ID is derived from the file name.


In [2]:
import pandas as pd
import os
import glob
from datetime import datetime

# Function to parse the custom timestamp format
def parse_timestamp(ts):
    """
    Converts a timestamp string into a datetime object.
    Expected format: "YYYY-MM-DD-HH:MM:SS:fff"
    """
    return datetime.strptime(ts, "%Y-%m-%d-%H:%M:%S:%f")

# Function to extract features from a single log file
def extract_features(file_path):
    """
    Reads a log file, converts timestamps, sorts events,
    and extracts various features including session duration, event counts,
    unique concept interactions, and average duration between a Touch and a PlayAnnotation.
    """
    # Read CSV file into DataFrame
    df = pd.read_csv(file_path)
    
    # Convert the 'Timestamp' column from string to datetime and sort the DataFrame
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%Y-%m-%d-%H:%M:%S:%f")
    df = df.sort_values("Timestamp")

    
    # -----------------------
    # Basic Feature Extraction:
    # -----------------------

    
    # Calculate session duration in seconds (time between first and last event)
    session_duration = (df["Timestamp"].iloc[-1] - df["Timestamp"].iloc[0]).total_seconds()
    
    # Get counts of each action as a dictionary
    action_counts = df["Action"].value_counts().to_dict()
    
    # Count total Touch events
    count_touch = action_counts.get("Touch", 0)
    
    # Count unique concepts touched (from ActionID in Touch events)
    unique_concepts = df[df["Action"] == "Touch"]["ActionID"].nunique()
    
    # Per-concept counts for Touch events (counts per concept identifier)
    concept_counts = df[df["Action"] == "Touch"]["ActionID"].value_counts().to_dict()
    
    # Count of combined Touch and PlayAnnotation events
    touch_play_count = len(df[(df["Action"] == "Touch") | (df["Action"] == "PlayAnnotation")])
    
    # Count of Grab-Release pairs (use the minimum count to represent complete pairs)
    grab_release_count = min(action_counts.get("Grab", 0), action_counts.get("Release", 0))
    

    # -----------------------
    # Duration Between Touch and PlayAnnotation:
    # -----------------------

    # Calculate duration between each Touch event and the next PlayAnnotation event
    touch_play_durations = []
    # Filter for Touch events to iterate over.
    touch_events = df[df["Action"] == "Touch"]
    for _, touch in touch_events.iterrows():
        # For each Touch event, find the first PlayAnnotation event that occurs after it.
        subsequent_play = df[(df["Action"] == "PlayAnnotation") & (df["Timestamp"] > touch["Timestamp"])]
        if not subsequent_play.empty:
            # Calculate the time difference (in seconds) and append it to the list.
            duration = (subsequent_play.iloc[0]["Timestamp"] - touch["Timestamp"]).total_seconds()
            touch_play_durations.append(duration)
    # Calculate the average duration if there are any durations recorded; otherwise, use None.
    avg_touch_play_duration = (sum(touch_play_durations) / len(touch_play_durations)) if touch_play_durations else None


    # Create a dictionary of extracted features
    features = {
        "session_duration": session_duration,
        "count_Teleport": action_counts.get("Teleport", 0),
        "count_Grab": action_counts.get("Grab", 0),
        "count_Release": action_counts.get("Release", 0),
        "count_PlayVideo": action_counts.get("PlayVideo", 0),
        "count_Pause": action_counts.get("Pause", 0),
        "count_Resume": action_counts.get("Resume", 0),
        "count_Touch": count_touch,
        "unique_concepts_touched": unique_concepts,
        "touch_play_count": touch_play_count,
        "grab_release_count": grab_release_count,
        "avg_touch_play_duration": avg_touch_play_duration,
        "concept_counts": concept_counts  # To be expanded into separate columns later
    }
    return features


# -----------------------
# Main Processing:
# -----------------------

# Process all CSV log files and collect features for each session
file_list = glob.glob("Logs_ekeel/*.csv")
all_concepts = set()  # Collect all unique concept identifiers across sessions
session_features = []

for file in file_list:
    feats = extract_features(file)
    # Extract sessionID from file name using os.path.basename for compatibility
    session_id = os.path.basename(file).split(".")[0]
    feats["sessionID"] = session_id
    session_features.append(feats)
    all_concepts.update(feats["concept_counts"].keys())


# -----------------------
# Create a final DataFrame with one row per session.
# -----------------------

# Build the final features DataFrame: one row per session with each concept as a separate column
features_list = []
for feats in session_features:
    record = {
        "sessionID": feats["sessionID"],
        "session_duration": feats["session_duration"],
        "count_Teleport": feats["count_Teleport"],
        "count_Grab": feats["count_Grab"],
        "count_Release": feats["count_Release"],
        "count_PlayVideo": feats["count_PlayVideo"],
        "count_Pause": feats["count_Pause"],
        "count_Resume": feats["count_Resume"],
        "count_Touch": feats["count_Touch"],
        "unique_concepts_touched": feats["unique_concepts_touched"],
        "touch_play_count": feats["touch_play_count"],
        "grab_release_count": feats["grab_release_count"],
        "avg_touch_play_duration": feats["avg_touch_play_duration"]
    }
    # Add a column for each unique concept encountered across all sessions
    for concept in all_concepts:
        # If a concept was not interacted with in the session, default the count to 0
        record[concept] = feats["concept_counts"].get(concept, 0)
    features_list.append(record)

# Create a DataFrame from the list of session records
features_df = pd.DataFrame(features_list)

# -----------------------
# Display the head of the final DataFrame in a copy-paste friendly format
# -----------------------
print(features_df.head().to_string(index=False))


 sessionID  session_duration  count_Teleport  count_Grab  count_Release  count_PlayVideo  count_Pause  count_Resume  count_Touch  unique_concepts_touched  touch_play_count  grab_release_count  avg_touch_play_duration  concept_servizio  concept_fornitore_del_servizio  concept_livello_fisico  concept_caratteristica  concept_IaaS  concept_risorse_hardware  concept_infrastruttura_cloud  concept_On-Premises  concept_SaaS  concept_modello_di_cloud  concept_PaaS  concept_cloud_computing  concept_software  concept_livello_astratto
Userlogs07          1295.355              14           7              7                2            2             1            3                        2                 7                   7                13.666667                 0                               0                       0                       0             1                         0                             0                    0             2                         0             0            

The following code converts the DataFrame into an HTML table and saves it as "Feature_head.html". This makes it easier to view in a browser and capture high-quality screenshots.

In [3]:
# to export the entire DataFrame instead of just the first few rows, 
# replace features_df.head() with features_df

html_table = features_df.head().T.to_html()

with open("Feature_head.html", "w") as f:
    f.write(html_table)

##### Explanation of the Notebook

- **parse_timestamp:** Converts the custom timestamp string into a datetime object.
- **extract_features:** Reads a single log file, sorts events chronologically, and extracts key features such as session duration, event counts, unique concept interactions, and durations between Touch and PlayAnnotation events.
- **Main Processing:**  
  - Uses glob to collect all CSV files from the 'Logs_ekeel' folder. 
  - Extracts session features for each file and collects all unique concept identifiers.
  - Builds a final DataFrame where each row represents a session, with individual columns for overall metrics and separate columns for each concept.
- **Output:**  
  The head of the final DataFrame is printed in a format that is suitable for copying into a report or taking a screen grab.


Saving the dataframe as csv for future analysis

In [None]:
# Save to CSV file
features_df.to_csv("Mode2Logs.csv", index=False)  # Set index=False to exclude row indices