In [None]:
# Mount Google Drive (Colab only) so the CSV paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd

# Folder that holds all four CSV files. The space after "SuperUROP" is part of
# the path exactly as it was written originally.
DATA_DIR = '/content/drive/MyDrive/SuperUROP /Data Analysis'

# Full paths of the Caltech / JPL training and testing files
file_path1 = DATA_DIR + '/caltech_training_data.csv'
file_path2 = DATA_DIR + '/caltech_testing_data.csv'
file_path3 = DATA_DIR + '/JPL_training_data.csv'
file_path4 = DATA_DIR + '/JPL_testing_data.csv'

# Read every CSV with pandas and keep only the underlying NumPy array
caltech_train, caltech_test, JPL_train, JPL_test = (
    pd.read_csv(csv_path).values
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

# Data Processing

In [None]:
# Drop the leading row-number column (index 0) from every split.
# NOTE: each name is rebound to a slice of itself, so re-running this cell
# strips one more column every time.
caltech_train, caltech_test, JPL_train, JPL_test = (
    split[:, 1:]
    for split in (caltech_train, caltech_test, JPL_train, JPL_test)
)

In [None]:
#Remove departure time (2nd column)
# Convert arrival date to hour and find day of the week
from datetime import datetime

def convert_time_and_day(data_array):
    """
    Replace the timestamp in column 0 with a "<decimal hour> <weekday>" string
    and drop column 1.

    Column 0 of every row is parsed with the format '%Y-%m-%d %H:%M:%S'.
    The decimal hour is hour + minute / 60 (seconds are ignored) and the
    weekday is the name produced by strftime('%A'), giving e.g. "9.5 Monday".
    Columns 2 and onwards are carried over unchanged.

    Returns a new NumPy array built from the transformed rows.
    """
    def _transform(row):
        # Parse once; both the decimal hour and the weekday come from this object
        stamp = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        decimal_hour = stamp.hour + (stamp.minute / 60.0)
        label = str(decimal_hour) + " " + stamp.strftime('%A')
        # Index 1 (the second column) is deliberately left out of the new row
        return [label] + list(row[2:])

    return np.array([_transform(row) for row in data_array])

# Apply the timestamp conversion to all four splits
caltech_train, caltech_test, JPL_train, JPL_test = map(
    convert_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

In [None]:
def day_to_number(day):
    """Return the 1-based position of a weekday name (Monday = 1 ... Sunday = 7).

    Raises KeyError when `day` is not one of the seven names listed below.
    """
    ordered_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    lookup = {name: position for position, name in enumerate(ordered_names, start=1)}
    return lookup[day]

def separate_time_and_day(data_array):
    """
    Split the combined "<time> <weekday>" string in column 0 into two columns.

    For every row the first field is split on whitespace; the time part is
    converted to float and the weekday name is mapped to 1-7 via
    day_to_number. Each rebuilt row is [time, day_number, *remaining columns].

    Returns a new NumPy array of the rebuilt rows.

    NOTE(review): np.array chooses one common dtype for the whole array, so if
    the remaining columns are strings the time and day number presumably come
    back as strings too - confirm against the code that consumes this array.
    """
    rebuilt_rows = []
    for row in data_array:
        time_text, day_name = row[0].split()
        rebuilt_rows.append([float(time_text), day_to_number(day_name)] + list(row[1:]))
    return np.array(rebuilt_rows)

# Split the combined time/day column for all four splits
caltech_train, caltech_test, JPL_train, JPL_test = map(
    separate_time_and_day,
    (caltech_train, caltech_test, JPL_train, JPL_test),
)

Split the dataset into seven per-day subsets. Correlations between users are computed separately for **each** day of the week, which is more accurate than a single correlation computed over the entire week.

In [None]:
# One bucket of rows per weekday, keyed by the day number as a string ('1'..'7')
datasets_by_day = {str(day_number): [] for day_number in range(1, 8)}

# File every JPL training row under the value found in its day column (index 1).
# NOTE(review): the keys are strings, so this relies on row[1] being the string
# form of the day number - confirm the dtype of JPL_train.
for row in JPL_train:
    datasets_by_day[row[1]].append(row)



# Approach 1: Computing correlation for arrival time and stay duration **separately**, then averaging

## Computing correlation for arrival time
To compute the correlation we need to discretize time so that every user is represented by a vector of the same length, which both Pearson correlation and cosine similarity require.

Two discretizations are used: 1-hour intervals and 30-minute intervals. For each user and each day, count the number of charging sessions that fall into each interval.


In [None]:
from collections import defaultdict
from scipy.spatial.distance import cosine

# Group arrival times (column 0) per user ID (column 3), separately for each day
user_times_by_day = {str(day_number): defaultdict(list) for day_number in range(1, 8)}
for day, day_rows in datasets_by_day.items():
    arrivals_for_day = user_times_by_day[day]
    for row in day_rows:
        arrivals_for_day[row[3]].append(float(row[0]))

# Function to create a representation based on 1-hour intervals
def create_hourly_representation(times):
    """Count how many values fall into each of 24 one-hour bins.

    Every entry of `times` is converted with int(float(value)) to select a bin;
    the returned list holds the number of entries that landed in each bin.
    """
    counts_per_hour = [0 for _ in range(24)]
    for value in times:
        counts_per_hour[int(float(value))] += 1
    return counts_per_hour

# Function to create a representation based on 30-minute intervals
def create_half_hourly_representation(times):
    """Count how many values fall into each of 48 thirty-minute bins.

    Every entry of `times` is converted to float, doubled and truncated to an
    int to select a bin; the returned list holds the per-bin counts.
    """
    counts_per_half_hour = [0 for _ in range(48)]
    for value in times:
        slot = int(float(value) * 2)  # two slots per hour
        counts_per_half_hour[slot] += 1
    return counts_per_half_hour

# Build, for every day, a {user_id: count vector} mapping of arrival times
# at both bin widths.
hourly_representations_by_day = {
    day: {
        user_id: create_hourly_representation(times)
        for user_id, times in user_times.items()
    }
    for day, user_times in user_times_by_day.items()
}
half_hourly_representations_by_day = {
    day: {
        user_id: create_half_hourly_representation(times)
        for user_id, times in user_times.items()
    }
    for day, user_times in user_times_by_day.items()
}

### Compute cosine similarities for arrival time for both discretizations

In [None]:
# Function to compute cosine similarity
def compute_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance)."""
    distance = cosine(v1, v2)
    return 1 - distance

# Compute cosine similarities for both representations
def compute_similarities(representations):
    """Cosine similarity for every unordered pair of users, per day.

    `representations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): similarity}, where user_i comes before user_j in
    that day's dict order, so each pair is stored exactly once.
    """
    similarities = {}
    for day, reps in representations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            # Only pair with users that come later, so (a, b) is never repeated as (b, a)
            for user_j in ordered_users[position + 1:]:
                per_pair[(user_i, user_j)] = compute_cosine_similarity(reps[user_i], reps[user_j])
        similarities[day] = per_pair
    return similarities

# Pairwise arrival-time cosine similarities, once per bin width
cosine_similarities_hourly, cosine_similarities_half_hourly = (
    compute_similarities(hourly_representations_by_day),
    compute_similarities(half_hourly_representations_by_day),
)

### Compute Pearson correlations for arrival time for both discretizations

In [None]:
#Function to compute the pearson correlation
def compute_pearson_correlation(v1, v2):
    """Return the Pearson correlation coefficient between two vectors."""
    correlation_matrix = np.corrcoef(v1, v2)
    # The off-diagonal entry of the 2x2 matrix is the v1-v2 correlation
    return correlation_matrix[0, 1]

# Compute pearson correlations for both representations
def compute_correlations(representations):
    """Pearson correlation for every unordered pair of users, per day.

    `representations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): correlation}, where user_i comes before user_j in
    that day's dict order, so each pair is stored exactly once.
    """
    correlations = {}
    for day, reps in representations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            # Only pair with users that come later, so (a, b) is never repeated as (b, a)
            for user_j in ordered_users[position + 1:]:
                per_pair[(user_i, user_j)] = compute_pearson_correlation(reps[user_i], reps[user_j])
        correlations[day] = per_pair
    return correlations

# Pairwise arrival-time Pearson correlations, once per bin width
pearson_correlations_hourly, pearson_correlations_half_hourly = (
    compute_correlations(hourly_representations_by_day),
    compute_correlations(half_hourly_representations_by_day),
)

## Computing correlation for stay duration

Use the maximum stay duration to determine the vector length. Rather than using each user's own range (max − min), use the same length for all users, taken from the maximum over the entire `JPL_train` array.

In [None]:
# Largest value in column 4 of JPL_train (read as the stay duration by the
# cells below), rounded to an integer. It fixes the common length of every
# user's duration vector.
longest_stay = JPL_train[:, 4].astype(float).max()
max_duration = int(np.round(longest_stay))

In [None]:
import numpy as np
from collections import defaultdict

# Group stay durations (column 4) per user ID (column 3), separately for each day
user_durations_by_day = {str(day_number): defaultdict(list) for day_number in range(1, 8)}
for day, day_rows in datasets_by_day.items():
    durations_for_day = user_durations_by_day[day]
    for row in day_rows:
        durations_for_day[row[3]].append(float(row[4]))

# Function to create a representation based on 1-hour intervals
def create_hourly_representation_duration(times, max_duration):
    """Count how many values fall into each one-hour bin from 0 to max_duration.

    The returned list has max_duration + 1 entries. Every entry of `times` is
    converted with int(float(value)) to select the bin it is counted in.
    """
    counts_per_hour = [0 for _ in range(max_duration + 1)]
    for value in times:
        counts_per_hour[int(float(value))] += 1
    return counts_per_hour

# Function to create a representation based on 30-minute intervals
def create_half_hourly_representation_duration(times, max_duration):
    """Count how many values fall into each thirty-minute bin up to max_duration.

    The returned list has (max_duration + 1) * 2 entries. Every entry of `times`
    is converted to float, doubled and truncated to an int to select its bin.
    """
    counts_per_half_hour = [0 for _ in range((max_duration + 1) * 2)]
    for value in times:
        slot = int(float(value) * 2)  # two slots per hour
        counts_per_half_hour[slot] += 1
    return counts_per_half_hour

# Build, for every day, a {user_id: count vector} mapping of stay durations
# at both bin widths. All vectors share the length derived from max_duration.
hourly_representations_by_day_duration = {
    day: {
        user_id: create_hourly_representation_duration(times, max_duration)
        for user_id, times in user_times.items()
    }
    for day, user_times in user_durations_by_day.items()
}
half_hourly_representations_by_day_duration = {
    day: {
        user_id: create_half_hourly_representation_duration(times, max_duration)
        for user_id, times in user_times.items()
    }
    for day, user_times in user_durations_by_day.items()
}

### Compute cosine similarity for stay duration for both discretizations

In [None]:
# Function to compute cosine similarity
def compute_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance).

    NOTE(review): an earlier cell already defines a function with this name and
    the same computation; this later definition shadows it.
    """
    distance = cosine(v1, v2)
    return 1 - distance

# Compute cosine similarities for both representations
def compute_similarities(representations):
    """Cosine similarity for every unordered pair of users, per day.

    `representations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): similarity}, where user_i comes before user_j in
    that day's dict order, so each pair is stored exactly once.

    NOTE(review): an earlier cell already defines a function with this name and
    the same logic; this later definition shadows it.
    """
    similarities = {}
    for day, reps in representations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            # Only pair with users that come later, so (a, b) is never repeated as (b, a)
            for user_j in ordered_users[position + 1:]:
                per_pair[(user_i, user_j)] = compute_cosine_similarity(reps[user_i], reps[user_j])
        similarities[day] = per_pair
    return similarities

# Pairwise stay-duration cosine similarities, once per bin width
cosine_correlations_dynamic_hourly_durations, cosine_correlations_dynamic_half_hourly_durations = (
    compute_similarities(hourly_representations_by_day_duration),
    compute_similarities(half_hourly_representations_by_day_duration),
)

### Compute Pearson correlations for stay duration for both discretizations




In [None]:
#Function to compute the pearson correlation
def compute_pearson_correlation(v1, v2):
    """Return the Pearson correlation coefficient between two vectors.

    NOTE(review): an earlier cell already defines a function with this name and
    the same computation; this later definition shadows it.
    """
    correlation_matrix = np.corrcoef(v1, v2)
    # The off-diagonal entry of the 2x2 matrix is the v1-v2 correlation
    return correlation_matrix[0, 1]

# Compute pearson correlations for both discretizations
def compute_correlations(durations):
    """Pearson correlation for every unordered pair of users, per day.

    `durations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): correlation}, where user_i comes before user_j in
    that day's dict order.

    A pair is skipped unless both vectors have more than one element. This
    tests the length of the vector itself, not how many sessions produced it.

    NOTE(review): an earlier cell defines compute_correlations without this
    length guard; from this cell onwards the name refers to this version.
    """
    correlations = {}
    for day, reps in durations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            for user_j in ordered_users[position + 1:]:
                vec_i, vec_j = reps[user_i], reps[user_j]
                # Need at least 2 data points in each vector
                if len(vec_i) <= 1 or len(vec_j) <= 1:
                    continue
                per_pair[(user_i, user_j)] = compute_pearson_correlation(vec_i, vec_j)
        correlations[day] = per_pair
    return correlations

# Pairwise stay-duration Pearson correlations, once per bin width
pearson_correlations_dynamic_hourly_durations, pearson_correlations_dynamic_half_hourly_durations = (
    compute_correlations(hourly_representations_by_day_duration),
    compute_correlations(half_hourly_representations_by_day_duration),
)

## Calculate the average correlation (equal weight for arrival time and stay duration)
 - Cosine similarity, 30 minute intervals
 - Cosine similarity, 1 hour intervals
 - Pearson Correlation, 30 minute intervals
 - Pearson Correlation, 1 hour intervals

In [None]:
def average_correlations(dict1, dict2):
    """Average two nested {key: {subkey: value}} dictionaries entry by entry.

    Only the top-level keys of `dict1` appear in the result; a key missing from
    `dict2` is treated as an empty sub-dictionary. Within a key, the subkeys of
    both inputs are united, and a subkey missing on one side contributes 0 to
    the average (so the present value is halved).

    Returns a new nested dictionary with the same layout.
    """
    averaged_dict = {}
    for key, first in dict1.items():
        second = dict2.get(key, {})
        averaged_dict[key] = {
            subkey: (first.get(subkey, 0) + second.get(subkey, 0)) / 2.0
            for subkey in set(first.keys()) | set(second.keys())
        }
    return averaged_dict

# Average the arrival-time score with the stay-duration score for each metric.
# Each call pairs the two inputs that use the SAME bin width; the half-hour
# cosine average previously took the hourly arrival-time similarities by mistake.
cosine_corr_half_hour = average_correlations(cosine_similarities_half_hourly, cosine_correlations_dynamic_half_hourly_durations)
cosine_corr_hour = average_correlations(cosine_similarities_hourly, cosine_correlations_dynamic_hourly_durations)
pearson_corr_half_hour = average_correlations(pearson_correlations_half_hourly, pearson_correlations_dynamic_half_hourly_durations)
pearson_corr_hour = average_correlations(pearson_correlations_hourly, pearson_correlations_dynamic_hourly_durations)

# Approach 2: Computing correlation using both arrival time and stay duration
Concatenate each user's arrival-time vector and stay-duration vector into a single feature vector, so that the vector length is the same for all users.


In [None]:
def concatenate_dicts(dict1, dict2):
    """Join the lists of two nested {key: {subkey: list}} dictionaries.

    The result covers the union of top-level keys and, within each key, the
    union of subkeys. For every subkey the list from `dict1` is followed by the
    list from `dict2`; a side that lacks the subkey contributes an empty list,
    so the joined list is shorter in that case.

    Returns a new nested dictionary with the same layout.
    """
    concatenated = {}
    for key in set(dict1.keys()) | set(dict2.keys()):  # union of keys from both dicts
        first, second = dict1.get(key, {}), dict2.get(key, {})
        concatenated[key] = {
            subkey: first.get(subkey, []) + second.get(subkey, [])
            for subkey in set(first.keys()) | set(second.keys())
        }
    return concatenated

# Arrival-time vector followed by stay-duration vector, once per bin width
concatenated_hourly, concatenated_half_hourly = (
    concatenate_dicts(hourly_representations_by_day, hourly_representations_by_day_duration),
    concatenate_dicts(half_hourly_representations_by_day, half_hourly_representations_by_day_duration),
)


## Compute cosine similarities for concatenated features for both discretizations

In [None]:
# Function to compute cosine similarity
def compute_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance).

    NOTE(review): earlier cells already define a function with this name and
    the same computation; this later definition shadows them.
    """
    distance = cosine(v1, v2)
    return 1 - distance

# Compute cosine similarities for both representations
def compute_similarities(representations):
    """Cosine similarity for every unordered pair of users, per day.

    `representations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): similarity}, where user_i comes before user_j in
    that day's dict order, so each pair is stored exactly once.

    NOTE(review): earlier cells already define a function with this name and
    the same logic; this later definition shadows them.
    """
    similarities = {}
    for day, reps in representations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            # Only pair with users that come later, so (a, b) is never repeated as (b, a)
            for user_j in ordered_users[position + 1:]:
                per_pair[(user_i, user_j)] = compute_cosine_similarity(reps[user_i], reps[user_j])
        similarities[day] = per_pair
    return similarities

# Pairwise cosine similarities on the concatenated vectors, once per bin width
concatanated_cosine_corr_hourly, concatanated_cosine_corr_half_hourly = (
    compute_similarities(concatenated_hourly),
    compute_similarities(concatenated_half_hourly),
)

## Compute Pearson correlation for concatenated features for both discretizations

In [None]:
#Function to compute the pearson correlation
def compute_pearson_correlation(v1, v2):
    """Return the Pearson correlation coefficient between two vectors.

    NOTE(review): earlier cells already define a function with this name and
    the same computation; this later definition shadows them.
    """
    correlation_matrix = np.corrcoef(v1, v2)
    # The off-diagonal entry of the 2x2 matrix is the v1-v2 correlation
    return correlation_matrix[0, 1]

# Compute pearson correlations for both discretizations
def compute_correlations(durations):
    """Pearson correlation for every unordered pair of users, per day.

    `durations` maps day -> {user_id: vector}. The result maps
    day -> {(user_i, user_j): correlation}, where user_i comes before user_j in
    that day's dict order.

    A pair is skipped unless both vectors have more than one element. This
    tests the length of the vector itself, not how many sessions produced it.

    NOTE(review): an earlier cell already defines compute_correlations with
    this same length guard; this later definition shadows it.
    """
    correlations = {}
    for day, reps in durations.items():
        ordered_users = list(reps.keys())
        per_pair = {}
        for position, user_i in enumerate(ordered_users):
            for user_j in ordered_users[position + 1:]:
                vec_i, vec_j = reps[user_i], reps[user_j]
                # Need at least 2 data points in each vector
                if len(vec_i) <= 1 or len(vec_j) <= 1:
                    continue
                per_pair[(user_i, user_j)] = compute_pearson_correlation(vec_i, vec_j)
        correlations[day] = per_pair
    return correlations

# Pairwise Pearson correlations on the concatenated vectors, once per bin width
concatanated_pearson_corr_hourly, concatanated_pearson_corr_half_hourly = (
    compute_correlations(concatenated_hourly),
    compute_correlations(concatenated_half_hourly),
)