# LSTM Data Preparation

In [2]:

# Predicts the next category based on a block of 5 categories as window
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [3]:
import pandas as pd
import ast
import pickle as pk

In [4]:
# Data Uploads
tap_data = pd.read_csv('TapDataParsed_processed.csv', usecols=lambda column: column not in ['Unnamed: 0'])
tap_data

Unnamed: 0,id,taps,start,stop,appIds0,tapsSession,lengthSession,partId,timeZone,tapDeviceId,category,application
0,00e98a60-cfbc-11ed-a03c-bf72362c36e7,"['1680260654282', '1680260656216']",[1.68026065e+12],[1.68026068e+12],"['4', '6']",[2],[1934],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['-1', 'COMMUNICATION']","['com.miui.home', 'com.whatsapp']"
1,00e9b170-cfbc-11ed-a03c-bf72362c36e7,"['1680260848742', '1680260849767', '1680260850...",[1.68026085e+12],[1.6802609e+12],"['6', '6', '6', '6', '6', '6', '6', '6', '6', ...",[91],[51590],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION', 'COMMUNICAT...","['com.whatsapp', 'com.whatsapp', 'com.whatsapp..."
2,00ea26a0-cfbc-11ed-a03c-bf72362c36e7,"['1680260926555', '1680260929904']",[1.68026092e+12],[1.68026094e+12],"['6', '6']",[2],[3349],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION']","['com.whatsapp', 'com.whatsapp']"
3,00ea26a1-cfbc-11ed-a03c-bf72362c36e7,['1680260966773'],[1.68026097e+12],[1.68026097e+12],['6'],[1],[0],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],['COMMUNICATION'],['com.whatsapp']
4,00ea4db0-cfbc-11ed-a03c-bf72362c36e7,"['1680261384529', '1680261386184', '1680261386...",[1.68026138e+12],[1.6802614e+12],"['6', '6', '6', '6', '6', '6', '6', '6', '6', ...",[34],[12116],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION', 'COMMUNICAT...","['com.whatsapp', 'com.whatsapp', 'com.whatsapp..."
...,...,...,...,...,...,...,...,...,...,...,...,...
144357,ffe7d2ba-50e8-4d98-a57e-e7fb66d1daf0,[''],[1.6873238e+12],[1.6873238e+12],[''],[0],[0],['138e0aa217a29b6e43b191d55547762fb7bc28eb'],['Asia/Kathmandu'],['aba9995c-d17c-4759-b6fc-bc5f75a1cfe4'],[None],[None]
144358,190f1650-3360-4014-b441-3963eb982f96,"['', '']",[1.68571532e+12],[1.68571533e+12],"['', '']",[0],[0],['138e4fc6a0c3f9c84a748783729e965d586628eb'],['Europe/Rome'],['01e797d4-2d69-4a04-abad-0f53de6eb216'],"[None, None]","[None, None]"
144359,33da6d62-a24b-4fac-911e-5210163999d8,"['1685715102620', '1685715103191', '1685715104...",[1.6857151e+12],[1.68571531e+12],"['1', '1', '4', '1', '1', '3', '3', '3', '3', ...",[18],[198617],['138e4fc6a0c3f9c84a748783729e965d586628eb'],['Europe/Rome'],['01e797d4-2d69-4a04-abad-0f53de6eb216'],"['PERSONALIZATION', 'PERSONALIZATION', '-1', '...","['com.sec.android.app.launcher', 'com.sec.andr..."
144360,7747522b-63d2-4dfd-9bf2-61010e8966f7,"['', '']",[1.68571539e+12],[1.68571541e+12],"['', '']",[0],[0],['138e4fc6a0c3f9c84a748783729e965d586628eb'],['Europe/Rome'],['01e797d4-2d69-4a04-abad-0f53de6eb216'],"[None, None]","[None, None]"


In [6]:
part_ids = len(list(tap_data['partId'].unique()))
part_ids

61

In [25]:
# remove empty taps [['', '']]
tap_data = tap_data[tap_data['taps'].apply(lambda x: x != "['', '']")]

In [26]:
# remove empty taps [['', '']]
tap_data = tap_data[tap_data['taps'].apply(lambda x: x != "['']")].reset_index(drop = True)
tap_data

Unnamed: 0,id,taps,start,stop,appIds0,tapsSession,lengthSession,partId,timeZone,tapDeviceId,category,application
0,00e98a60-cfbc-11ed-a03c-bf72362c36e7,"['1680260654282', '1680260656216']",[1.68026065e+12],[1.68026068e+12],"['4', '6']",[2],[1934],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['-1', 'COMMUNICATION']","['com.miui.home', 'com.whatsapp']"
1,00e9b170-cfbc-11ed-a03c-bf72362c36e7,"['1680260848742', '1680260849767', '1680260850...",[1.68026085e+12],[1.6802609e+12],"['6', '6', '6', '6', '6', '6', '6', '6', '6', ...",[91],[51590],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION', 'COMMUNICAT...","['com.whatsapp', 'com.whatsapp', 'com.whatsapp..."
2,00ea26a0-cfbc-11ed-a03c-bf72362c36e7,"['1680260926555', '1680260929904']",[1.68026092e+12],[1.68026094e+12],"['6', '6']",[2],[3349],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION']","['com.whatsapp', 'com.whatsapp']"
3,00ea26a1-cfbc-11ed-a03c-bf72362c36e7,['1680260966773'],[1.68026097e+12],[1.68026097e+12],['6'],[1],[0],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],['COMMUNICATION'],['com.whatsapp']
4,00ea4db0-cfbc-11ed-a03c-bf72362c36e7,"['1680261384529', '1680261386184', '1680261386...",[1.68026138e+12],[1.6802614e+12],"['6', '6', '6', '6', '6', '6', '6', '6', '6', ...",[34],[12116],['138e7be7ad1a1f6a496c9447ff1dfbde4e8228eb'],['Europe/Vienna'],['9a5c1517-2f31-4cd4-a5f7-951d0fa29775'],"['COMMUNICATION', 'COMMUNICATION', 'COMMUNICAT...","['com.whatsapp', 'com.whatsapp', 'com.whatsapp..."
...,...,...,...,...,...,...,...,...,...,...,...,...
66888,fea3361e-f3a7-42e2-aa9c-29b7c13d2fc8,"['1688567611127', '1688567611634']",[1.68856761e+12],[1.68856761e+12],"['4', '4']",[2],[507],['138e0aa217a29b6e43b191d55547762fb7bc28eb'],['Asia/Kathmandu'],['aba9995c-d17c-4759-b6fc-bc5f75a1cfe4'],"['-1', '-1']","['com.oppo.launcher', 'com.oppo.launcher']"
66889,fec83b19-38e1-4685-a8cd-fb8a63b4b9b7,"['1687686629107', '1687686630650', '1687686632...",[1.68768662e+12],[1.68768686e+12],"['25', '25', '25', '25', '25', '25', '25', '25...",[55],[232127],['138e0aa217a29b6e43b191d55547762fb7bc28eb'],['Asia/Kathmandu'],['aba9995c-d17c-4759-b6fc-bc5f75a1cfe4'],"['SOCIAL', 'SOCIAL', 'SOCIAL', 'SOCIAL', 'SOCI...","['com.zhiliaoapp.musically', 'com.zhiliaoapp.m..."
66890,fefc5f0a-ae06-495b-9ab8-bc5d55a6c37a,"['1688194591567', '1688194595574', '1688194602...",[1.68819459e+12],[1.6881949e+12],"['4', '36', '36', '36', '36', '36', '36', '36'...",[72],[302195],['138e0aa217a29b6e43b191d55547762fb7bc28eb'],['Asia/Kathmandu'],['aba9995c-d17c-4759-b6fc-bc5f75a1cfe4'],"['-1', 'SOCIAL', 'SOCIAL', 'SOCIAL', 'SOCIAL',...","['com.oppo.launcher', 'com.instagram.android',..."
66891,33da6d62-a24b-4fac-911e-5210163999d8,"['1685715102620', '1685715103191', '1685715104...",[1.6857151e+12],[1.68571531e+12],"['1', '1', '4', '1', '1', '3', '3', '3', '3', ...",[18],[198617],['138e4fc6a0c3f9c84a748783729e965d586628eb'],['Europe/Rome'],['01e797d4-2d69-4a04-abad-0f53de6eb216'],"['PERSONALIZATION', 'PERSONALIZATION', '-1', '...","['com.sec.android.app.launcher', 'com.sec.andr..."


In [6]:
# Function to strip external quotes and convert to list 
def process_entry(entry):
    try:
        # Remove external quotes and convert to list
        entry = entry.strip('"')
        return ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Handle malformed entries
        #print(f"Failed to process: {entry}")
        return None

#tap_data['application'] = tap_data['application'].apply(process_entry)
tap_data['category'] = tap_data['category'].apply(process_entry)
tap_data['category']

0                                      [-1, COMMUNICATION]
1        [COMMUNICATION, COMMUNICATION, COMMUNICATION, ...
2                           [COMMUNICATION, COMMUNICATION]
3                                          [COMMUNICATION]
4        [COMMUNICATION, COMMUNICATION, COMMUNICATION, ...
                               ...                        
66888                                             [-1, -1]
66889    [SOCIAL, SOCIAL, SOCIAL, SOCIAL, SOCIAL, SOCIA...
66890    [-1, SOCIAL, SOCIAL, SOCIAL, SOCIAL, SOCIAL, S...
66891    [PERSONALIZATION, PERSONALIZATION, -1, PERSONA...
66892    [COMMUNICATION, COMMUNICATION, COMMUNICATION, ...
Name: category, Length: 66893, dtype: object

In [7]:
tap_data['taps'] = tap_data['taps'].apply(process_entry)
tap_data['taps']

0                           [1680260654282, 1680260656216]
1        [1680260848742, 1680260849767, 1680260850357, ...
2                           [1680260926555, 1680260929904]
3                                          [1680260966773]
4        [1680261384529, 1680261386184, 1680261386906, ...
                               ...                        
66888                       [1688567611127, 1688567611634]
66889    [1687686629107, 1687686630650, 1687686632048, ...
66890    [1688194591567, 1688194595574, 1688194602249, ...
66891    [1685715102620, 1685715103191, 1685715104549, ...
66892    [1685715937398, 1685715947273, 1685715958539, ...
Name: taps, Length: 66893, dtype: object

In [8]:
#Convert Applications into list
#applications = tap_data['application'].tolist()
categories = tap_data['category'].tolist()
#categories[0:10]
#applications[0:10]

In [9]:
timestamps = tap_data['taps'].tolist()

In [10]:
combined_lists = list(zip(categories,timestamps))
#combined_lists[0:10]

In [11]:
# Build a vocabulary of categories and map them to integers

# Flatten the list of lists, skipping None sublists
flattened_list = [item for sublist in categories if sublist is not None for item in sublist]

# Find unique values
unique_values = list(set(flattened_list))

# Check the length to ensure there are 30 unique elements
# print(len(unique_values))  # Should be 30

# Assuming unique_values contains your unique categories
stoi = {s: i for i, s in enumerate(unique_values)}
itos = {i: s for s, i in stoi.items()}

print("String to Integer Mapping:", stoi)

String to Integer Mapping: {'EDUCATION': 0, 'COMMUNICATION': 1, 'HEALTH_AND_FITNESS': 2, 'PHOTOGRAPHY': 3, 'PERSONALIZATION': 4, 'WEATHER': 5, 'LIFESTYLE': 6, 'MEDICAL': 7, 'GAME': 8, 'LIBRARIES_AND_DEMO': 9, 'SPORTS': 10, 'FOOD_AND_DRINK': 11, 'BEAUTY': 12, 'TRAVEL_AND_LOCAL': 13, 'HOUSE_AND_HOME': 14, 'FINANCE': 15, 'EVENTS': 16, None: 17, 'BOOKS_AND_REFERENCE': 18, 'NEWS_AND_MAGAZINES': 19, 'TOOLS': 20, 'VIDEO_PLAYERS': 21, 'MUSIC_AND_AUDIO': 22, 'AUTO_AND_VEHICLES': 23, 'PRODUCTIVITY': 24, '-1': 25, 'PARENTING': 26, 'MAPS_AND_NAVIGATION': 27, 'ART_AND_DESIGN': 28, 'SHOPPING': 29, 'BUSINESS': 30, 'SOCIAL': 31, 'ENTERTAINMENT': 32, 'DATING': 33}


In [12]:
len(stoi)

34

In [13]:
with open('cat_itos_stratified_look20.pkl', 'wb') as file:
    pk.dump(itos, file)

In [20]:
from collections import defaultdict
import torch
import numpy as np

# Assuming 'stoi' (string to index mapping) and 'combined_lists' (your data) are defined elsewhere
time_bin_size = 60000  # Size of your time bins to represent 1 minute
num_categories = len(stoi)  # Number of unique categories

# Initialize storage for categorized sessions
processed_sessions = defaultdict(lambda: defaultdict(list))

# Calculate diversity scores and session durations for each session
diversity_scores = []
session_durations = []  # To hold session durations
for categories, timestamps in combined_lists:
    if categories:  # Ensure the categories list is not empty
        diversity_score = len(set(categories)) / len(categories)
        # Safely convert timestamps to integers, ignoring any invalid entries
        timestamps_int = [int(ts) for ts in timestamps if ts.isdigit()]
        session_duration = max(timestamps_int) - min(timestamps_int) if timestamps_int else 0
    else:
        diversity_score = 0  # Handle empty categories list
        session_duration = 0
    diversity_scores.append(diversity_score)
    session_durations.append(session_duration)

# Define diversity and duration score thresholds
low_diversity_threshold = np.percentile(diversity_scores, 33)
high_diversity_threshold = np.percentile(diversity_scores, 66)
low_duration_threshold = np.percentile(session_durations, 33)
high_duration_threshold = np.percentile(session_durations, 66)

# Categorize and process sessions
for i, (categories, timestamps) in enumerate(combined_lists):
    diversity_score = diversity_scores[i]
    session_duration = session_durations[i]

    # Determine diversity category
    diversity_category = 'low' if diversity_score <= low_diversity_threshold else 'medium' if diversity_score <= high_diversity_threshold else 'high'

    # Determine duration category
    duration_category = 'low' if session_duration <= low_duration_threshold else 'medium' if session_duration <= high_duration_threshold else 'high'

    # Process session
    interval_counts = defaultdict(lambda: [0] * num_categories)
    timestamps_int = [int(ts) for ts in timestamps if ts.isdigit()]

    # Calculate differences between consecutive timestamps for additional analysis
    if len(timestamps_int) > 1:
        timestamp_diffs = [timestamps_int[i+1] - timestamps_int[i] for i in range(len(timestamps_int)-1)]

    for category_item, timestamp in zip(categories, timestamps_int):
        if category_item not in stoi:
            continue

        interval = timestamp // time_bin_size
        category_index = stoi[category_item]
        interval_counts[interval][category_index] += 1

    intervals_sorted = sorted(interval_counts.items())
    session_tensor = torch.tensor([counts for _, counts in intervals_sorted], dtype=torch.float)
    processed_sessions[diversity_category][duration_category].append(session_tensor)

# Print the contents of processed_sessions to check the lists
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        category_combination = (diversity_category, duration_category)
        sessions_list = processed_sessions[diversity_category][duration_category]
        print(f"Category combination {category_combination}: {len(sessions_list)} sessions")
        for session_tensor in sessions_list[:2]:  # Limit output for demonstration
            print(f"    Session tensor shape: {session_tensor.shape}")


Category combination ('low', 'low'): 636 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape: torch.Size([1, 34])
Category combination ('low', 'medium'): 5362 sessions
    Session tensor shape: torch.Size([2, 34])
    Session tensor shape: torch.Size([2, 34])
Category combination ('low', 'high'): 16137 sessions
    Session tensor shape: torch.Size([28, 34])
    Session tensor shape: torch.Size([3, 34])
Category combination ('medium', 'low'): 4689 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape: torch.Size([2, 34])
Category combination ('medium', 'medium'): 11982 sessions
    Session tensor shape: torch.Size([2, 34])
    Session tensor shape: torch.Size([1, 34])
Category combination ('medium', 'high'): 6079 sessions
    Session tensor shape: torch.Size([3, 34])
    Session tensor shape: torch.Size([3, 34])
Category combination ('high', 'low'): 16750 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape

In [None]:
from collections import Counter, defaultdict
import torch
import numpy as np

time_bin_size = 1000  # Define the size of your time bins
num_categories = len(stoi)  # Number of unique categories

# Calculate diversity scores for each session
diversity_scores = []
for categories, _ in combined_lists:
    if categories:  # Ensure the categories list is not empty
        diversity_score = len(set(categories)) / len(categories)
    else:
        diversity_score = 0  # Handle empty categories list
    diversity_scores.append(diversity_score)

# Define diversity score thresholds for categorizing sessions
low_diversity_threshold = np.percentile(diversity_scores, 33)
high_diversity_threshold = np.percentile(diversity_scores, 66)

# Categorize sessions based on their diversity scores
sessions_categories = {'low': [], 'medium': [], 'high': []}
for i, (categories, timestamps) in enumerate(combined_lists):
    score = diversity_scores[i]
    len = 
    if score <= low_diversity_threshold:
        category = 'low'
    elif score <= high_diversity_threshold:
        category = 'medium'
    else:
        category = 'high'
    sessions_categories[category].append((categories, timestamps))

# Now, let's integrate your existing code here to process the categorized sessions
# For simplicity, I'll process all sessions but in a real scenario, you'd sample as described previously
processed_sessions = {'low': [], 'medium': [], 'high': []}

for diversity_group in sessions_categories:  # Use 'diversity_group' instead of 'category'
    for categories, timestamps in sessions_categories[diversity_group]:
        interval_counts = defaultdict(lambda: [0] * num_categories)
        
        for category_item, timestamp in zip(categories, timestamps):  # Use 'category_item' to avoid conflict
            if category_item not in stoi or timestamp == '':
                continue
            
            try:
                interval = int(timestamp) // time_bin_size
            except ValueError:
                continue
            
            category_index = stoi[category_item]  # Use 'category_item' here
            interval_counts[interval][category_index] += 1
        
        intervals_sorted = sorted(interval_counts.items())
        session_tensor = torch.tensor([counts for _, counts in intervals_sorted], dtype=torch.float)
        processed_sessions[diversity_group].append(session_tensor)  # Use 'diversity_group' here


# At this point, 'processed_sessions' contains tensors categorized into 'low', 'medium', and 'high'
# diversity groups. You can now sample from these groups to balance your datasets for training, development, and testing.

# The next steps would involve sampling from 'processed_sessions' as described earlier.
# This is left as an exercise based on your specific requirements for dataset sizes and proportions.


In [27]:
from collections import defaultdict
import torch
import numpy as np

# Assuming 'stoi' (string to index mapping) and 'combined_lists' (your data) are defined elsewhere
time_bin_size = 60000  #size of your time bins to represent 1 minute
num_categories = len(stoi)  # Number of unique categories

# Calculate diversity scores and session durations for each session
diversity_scores = []
session_durations = []  # To hold session durations
for categories, timestamps in combined_lists:
    if categories:  # Ensure the categories list is not empty
        diversity_score = len(set(categories)) / len(categories)
        # Safely convert timestamps to integers, ignoring any invalid entries
        timestamps_int = [int(ts) for ts in timestamps if ts.isdigit()]
        session_duration = max(timestamps_int) - min(timestamps_int) if timestamps_int else 0
    else:
        diversity_score = 0  # Handle empty categories list
        session_duration = 0
    diversity_scores.append(diversity_score)
    session_durations.append(session_duration)

# Define diversity and duration score thresholds
low_diversity_threshold = np.percentile(diversity_scores, 33)
high_diversity_threshold = np.percentile(diversity_scores, 66)
low_duration_threshold = np.percentile(session_durations, 33)
high_duration_threshold = np.percentile(session_durations, 66)

# Initialize storage for categorized sessions
processed_sessions = defaultdict(lambda: defaultdict(list))

# Categorize and process sessions
for i, (categories, timestamps) in enumerate(combined_lists):
    diversity_score = diversity_scores[i]
    session_duration = session_durations[i]
    
    # Determine diversity category
    if diversity_score <= low_diversity_threshold:
        diversity_category = 'low'
    elif diversity_score <= high_diversity_threshold:
        diversity_category = 'medium'
    else:
        diversity_category = 'high'

    # Determine duration category
    if session_duration <= low_duration_threshold:
        duration_category = 'low'
    elif session_duration <= high_duration_threshold:
        duration_category = 'medium'
    else:
        duration_category = 'high'

    # Process session
    interval_counts = defaultdict(lambda: [0] * num_categories)
    timestamps_int = [int(ts) for ts in timestamps if ts.isdigit()]  # Safe conversion again for processing
    for category_item, timestamp in zip(categories, timestamps_int):
        if category_item not in stoi:
            continue
        
        interval = timestamp // time_bin_size
        category_index = stoi[category_item]
        interval_counts[interval][category_index] += 1

    intervals_sorted = sorted(interval_counts.items())
    session_tensor = torch.tensor([counts for _, counts in intervals_sorted], dtype=torch.float)
    processed_sessions[diversity_category][duration_category].append(session_tensor)

# Print the contents of processed_sessions to check the 9 lists
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        category_combination = (diversity_category, duration_category)
        sessions_list = processed_sessions[diversity_category][duration_category]
        print(f"Category combination {category_combination}: {len(sessions_list)} sessions")
        # Optional: Print details of each session tensor for the first few sessions in each category
        for session_tensor in sessions_list[:2]:  # Adjust as needed to limit output
            print(f"    Session tensor shape: {session_tensor.shape}")


Category combination ('low', 'low'): 636 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape: torch.Size([1, 34])
Category combination ('low', 'medium'): 5362 sessions
    Session tensor shape: torch.Size([2, 34])
    Session tensor shape: torch.Size([2, 34])
Category combination ('low', 'high'): 16137 sessions
    Session tensor shape: torch.Size([28, 34])
    Session tensor shape: torch.Size([3, 34])
Category combination ('medium', 'low'): 4689 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape: torch.Size([2, 34])
Category combination ('medium', 'medium'): 11982 sessions
    Session tensor shape: torch.Size([2, 34])
    Session tensor shape: torch.Size([1, 34])
Category combination ('medium', 'high'): 6079 sessions
    Session tensor shape: torch.Size([3, 34])
    Session tensor shape: torch.Size([3, 34])
Category combination ('high', 'low'): 16750 sessions
    Session tensor shape: torch.Size([1, 34])
    Session tensor shape

In [None]:
#50th Percentile

import numpy as np

# Ensure reproducibility
np.random.seed(42)

# Instead of using the minimum number of sessions, use a percentile to determine the sample size
# This allows for more sessions to be included from each category
sample_size_per_category = int(np.percentile(
    [len(processed_sessions[div][dur]) for div in ['low', 'medium', 'high'] for dur in ['low', 'medium', 'high']],
    50  # Adjust the percentile as needed to balance between data utilization and balance
))

# Define proportions for splitting: 80% training, 10% development, 10% testing
train_split = int(sample_size_per_category * 0.8)
dev_split = int(sample_size_per_category * 0.1)

training_sessions, development_sessions, testing_sessions = [], [], []

for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        sessions = processed_sessions[diversity_category][duration_category]
        np.random.shuffle(sessions)
        sessions_sampled = sessions[:sample_size_per_category]  # Sample based on the determined size
        training_sessions.extend(sessions_sampled[:train_split])
        development_sessions.extend(sessions_sampled[train_split:train_split + dev_split])
        testing_sessions.extend(sessions_sampled[train_split + dev_split:sample_size_per_category])

print(f"Training sessions count: {len(training_sessions)}")
print(f"Development sessions count: {len(development_sessions)}")
print(f"Testing sessions count: {len(testing_sessions)}")


In [None]:
# UNDERSAMPLING

import numpy as np

# Ensure reproducibility
np.random.seed(42)

# Find the minimum number of sessions across all categories
min_sessions_size = min(
    [len(processed_sessions[div][dur]) for div in ['low', 'medium', 'high'] for dur in ['low', 'medium', 'high']]
)

# Use the minimum size for a balanced sample size across categories
sample_size_per_category = min_sessions_size

# Define proportions for splitting: 80% training, 10% development, 10% testing
train_split = int(sample_size_per_category * 0.8)
dev_split = int(sample_size_per_category * 0.1)
# The rest is for testing, ensuring we use all data in the split

training_sessions, development_sessions, testing_sessions = [], [], []

for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        sessions = processed_sessions[diversity_category][duration_category]
        np.random.shuffle(sessions)
        # Sample based on the determined minimum size
        sessions_sampled = sessions[:sample_size_per_category]
        training_sessions.extend(sessions_sampled[:train_split])
        development_sessions.extend(sessions_sampled[train_split:train_split + dev_split])
        testing_sessions.extend(sessions_sampled[train_split + dev_split:])

print(f"Training sessions count: {len(training_sessions)}")
print(f"Development sessions count: {len(development_sessions)}")
print(f"Testing sessions count: {len(testing_sessions)}")


In [None]:
#OVERSAMPLING

import numpy as np

# Ensure reproducibility
np.random.seed(42)

# Assuming processed_sessions is defined and is a dict of dicts of lists
# Calculate sizes for all categories
category_sizes = {
    (div, dur): len(processed_sessions[div][dur])
    for div in ['low', 'medium', 'high'] for dur in ['low', 'medium', 'high']
}

# Find the maximum size to aim for oversampling
max_size = max(category_sizes.values())

training_sessions, development_sessions, testing_sessions = [], [], []

for div, dur in category_sizes:
    sessions = processed_sessions[div][dur]
    np.random.shuffle(sessions)
    
    # Manually repeat the sessions to approximate oversampling
    oversampled_sessions = []
    while len(oversampled_sessions) < max_size:
        oversampled_sessions.extend(sessions)
    oversampled_sessions = oversampled_sessions[:max_size]
    
    # Splitting sessions
    train_split = int(len(oversampled_sessions) * 0.8)
    dev_split = int(len(oversampled_sessions) * 0.1)
    
    training_sessions.extend(oversampled_sessions[:train_split])
    development_sessions.extend(oversampled_sessions[train_split:train_split + dev_split])
    testing_sessions.extend(oversampled_sessions[train_split + dev_split:])

print(f"Training sessions count: {len(training_sessions)}")
print(f"Development sessions count: {len(development_sessions)}")
print(f"Testing sessions count: {len(testing_sessions)}")


In [15]:
# STRATIFICATION

from random import sample

from sklearn.model_selection import train_test_split

# Define split ratios
train_ratio = 0.7
test_ratio = 0.2
dev_ratio = 0.1

# Storage for stratified splits
stratified_train = []
stratified_dev = []
stratified_test = []
session_all = []
# Stratify and split data
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        sessions_all = processed_sessions[diversity_category][duration_category]
        sessions = sample(sessions_all, min(len(sessions_all), 500))  # Corrected typo and ensured not to exceed list size

        
        # Check if there are sessions to split
        if len(sessions) > 0:
            # Split sessions into training and temp (temporary holding the rest)
            sessions_train, sessions_temp = train_test_split(sessions, test_size=(1 - train_ratio), random_state=42)
            
            # Split the temp into development and testing
            sessions_dev, sessions_test = train_test_split(sessions_temp, test_size=(test_ratio / (test_ratio + dev_ratio)), random_state=42)
            
            # Append to respective stratified sets
            stratified_train.extend(sessions_train)
            stratified_dev.extend(sessions_dev)
            stratified_test.extend(sessions_test)

# Now, stratified_train, stratified_dev, and stratified_test contain your stratified splits
print(f"Training set size: {len(stratified_train)}")
print(f"Development set size: {len(stratified_dev)}")
print(f"Testing set size: {len(stratified_test)}")


Training set size: 3141
Development set size: 450
Testing set size: 909


In [17]:
from sklearn.model_selection import train_test_split

# Define split ratios
train_ratio = 0.8
test_ratio = 0.1
dev_ratio = 0.1

# Storage for stratified splits
stratified_train = []
stratified_dev = []
stratified_test = []

# Stratify and split data
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        # Directly use all sessions without sampling up to 500
        sessions_all = processed_sessions[diversity_category][duration_category]
        
        # Check if there are sessions to split
        if len(sessions_all) > 0:
            # Split sessions into training and temp (temporary holding the rest)
            sessions_train, sessions_temp = train_test_split(sessions_all, test_size=(1 - train_ratio), random_state=42)
            
            # Split the temp into development and testing
            sessions_dev, sessions_test = train_test_split(sessions_temp, test_size=(test_ratio / (test_ratio + dev_ratio)), random_state=42)
            
            # Append to respective stratified sets
            stratified_train.extend(sessions_train)
            stratified_dev.extend(sessions_dev)
            stratified_test.extend(sessions_test)

print(f"Training set size: {len(stratified_train)}")
print(f"Development set size: {len(stratified_dev)}")
print(f"Testing set size: {len(stratified_test)}")


Training set size: 53511
Development set size: 6690
Testing set size: 6692


In [18]:
tensor_dict_strat = {
    'train_sessions': stratified_train,
    'test_sessions': stratified_test,
    'dev_sessions': stratified_dev}

In [19]:
with open('LSTM_time1min_stratified_20lookback.pkl', 'wb') as file:
    pk.dump(tensor_dict_strat, file)

In [None]:
tensor_dict = {
    'train_sessions': training_sessions,
    'test_sessions': testing_sessions,
    'dev_sessions': development_sessions}

In [None]:
with open('LSTM_time1min_stratified_500samples_10sec.pkl', 'wb') as file:
    pk.dump(tensor_dict_strat, file)

In [None]:
from collections import defaultdict
import numpy as np
import torch
from sklearn.model_selection import train_test_split

# Assumed existing variables and data structures: processed_sessions, stoi

# Define split ratios
train_ratio = 0.8
test_ratio = 0.1
dev_ratio = 0.1

# Storage for stratified and oversampled splits
stratified_train = defaultdict(lambda: defaultdict(list))
stratified_dev = defaultdict(lambda: defaultdict(list))
stratified_test = defaultdict(lambda: defaultdict(list))

# Function to oversample within each stratified category
def oversample_sessions(sessions_dict):
    max_size = max(len(sessions) for category in sessions_dict.values() for sessions in category.values())
    for diversity_category in sessions_dict:
        for duration_category in sessions_dict[diversity_category]:
            sessions = sessions_dict[diversity_category][duration_category]
            while len(sessions) < max_size:
                oversampled_sessions = [session.clone() for session in sessions]  # Clone to avoid altering original tensors
                sessions.extend(oversampled_sessions[:max_size - len(sessions)])
            sessions_dict[diversity_category][duration_category] = sessions

# Stratify and split data
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        sessions_all = processed_sessions[diversity_category][duration_category]
        if not sessions_all:
            continue  # Skip if no sessions in this category

        # Split sessions into training, dev, and test
        sessions_train, sessions_temp = train_test_split(sessions_all, test_size=(1 - train_ratio), random_state=42)
        sessions_dev, sessions_test = train_test_split(sessions_temp, test_size=(test_ratio / (test_ratio + dev_ratio)), random_state=42)
        
        # Store sessions
        stratified_train[diversity_category][duration_category] = sessions_train
        stratified_dev[diversity_category][duration_category] = sessions_dev
        stratified_test[diversity_category][duration_category] = sessions_test

# Apply oversampling to the training set after the function definition
oversample_sessions(stratified_train)

# Aggregate sessions from each stratified split into separate lists
def aggregate_sessions(stratified_sessions):
    aggregated_sessions = []
    for diversity_category in stratified_sessions:
        for duration_category in stratified_sessions[diversity_category]:
            aggregated_sessions.extend(stratified_sessions[diversity_category][duration_category])
    return aggregated_sessions

# Aggregate training, testing, and development sessions
training_sessions = aggregate_sessions(stratified_train)
testing_sessions = aggregate_sessions(stratified_test)
development_sessions = aggregate_sessions(stratified_dev)

# Combine aggregated sessions into a single dictionary
tensor_dict = {
    'train_sessions': training_sessions,
    'test_sessions': testing_sessions,
    'dev_sessions': development_sessions
}


In [None]:
len(training_sessions)

In [None]:

with open('data_time1min_stratified_oversampled.pkl', 'wb') as file:
    pk.dump(tensor_dict, file)

In [None]:
from random import sample
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter, defaultdict
from random import choices



# Define split ratios
train_ratio = 0.7
test_ratio = 0.2
dev_ratio = 0.1

# Storage for stratified splits
stratified_train = []
stratified_dev = []
stratified_test = []

# Placeholder for categories associated with each session in stratified_train
session_categories = []

# Stratify and split data
for diversity_category in ['low', 'medium', 'high']:
    for duration_category in ['low', 'medium', 'high']:
        sessions_all = processed_sessions[diversity_category][duration_category]
        sessions = sample(sessions_all, min(len(sessions_all), 500))  # Sample up to 500 sessions

        # Split sessions into training and temp (holding the rest)
        sessions_train, sessions_temp = train_test_split(sessions, test_size=(1 - train_ratio), random_state=42)
        sessions_dev, sessions_test = train_test_split(sessions_temp, test_size=(test_ratio / (test_ratio + dev_ratio)), random_state=42)
        
        # Append sessions and their categories to respective lists
        stratified_train.extend(sessions_train)
        stratified_dev.extend(sessions_dev)
        stratified_test.extend(sessions_test)
        session_categories.extend([f"{diversity_category}-{duration_category}"] * len(sessions_train))


def dynamic_oversample_sessions(sessions, categories):
    category_counts = Counter(categories)
    max_count = max(category_counts.values())
    
    # Organize sessions by category
    sessions_by_category = defaultdict(list)
    for session, category in zip(sessions, categories):
        sessions_by_category[category].append(session)
    
    # Oversample sessions in each category
    oversampled_sessions = []
    for category, sessions in sessions_by_category.items():
        required_additional = max_count - len(sessions)
        # Using random.choices for oversampling
        oversampled = sessions + choices(sessions, k=required_additional)
        oversampled_sessions.extend(oversampled)
    
    return oversampled_sessions

# Now apply dynamic oversampling with the corrected function
oversampled_train = dynamic_oversample_sessions(stratified_train, session_categories)

# Update stratified_train with oversampled data
stratified_train = oversampled_train

print(f"Training set size after oversampling: {len(stratified_train)}")
print(f"Development set size: {len(stratified_dev)}")
print(f"Testing set size: {len(stratified_test)}")

In [None]:
from random import choices
from collections import Counter, defaultdict

def dynamic_oversample_sessions(sessions, categories):
    category_counts = Counter(categories)
    max_count = max(category_counts.values())
    
    print(f"Original category counts: {category_counts}")
    print(f"Target max count for oversampling: {max_count}")
    
    # Organize sessions by category
    sessions_by_category = defaultdict(list)
    for session, category in zip(sessions, categories):
        sessions_by_category[category].append(session)
    
    oversampled_sessions = []
    for category, sessions in sessions_by_category.items():
        required_additional = max_count - len(sessions)
        oversampled = sessions + choices(sessions, k=required_additional)
        oversampled_sessions.extend(oversampled)
        print(f"Category: {category}, Original: {len(sessions)}, Oversampled: {len(oversampled)}")
    
    return oversampled_sessions

# Apply dynamic oversampling
oversampled_train = dynamic_oversample_sessions(stratified_train, session_categories)

# Update stratified_train with oversampled data
stratified_train = oversampled_train

print(f"\nTraining set size after oversampling: {len(stratified_train)}")
