# Introduction

This project explores OCPP data. Open Charge Point Protocol (OCPP) is an open standard communication protocol for Electric Vehicle (EV) charging stations. It defines interactions between EV charging stations and a central system, helping to facilitate security, transactions, diagnostics, and more.

This dataset is from OCPP v1.6.

## Prepare Environment

Organization < Property < Location < Cluster < Station < UserID

A cluster is a grouping of chargers/stations. This is for convenience and load balancing.

Each circuit can have multiple clusters.

Each cluster has its own breaker


In [2]:
# Access to Google Drive: mount it at /content/drive so later cells can read
# and write the data files and pickles stored under that path.
# This seems to propagate credentials better from its own cell

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Packages and methods

!pip install PyGithub
from github import Github
import os
import datetime
from google.colab import userdata


!pip install pandas pyxlsb
import pandas as pd

import numpy as np

import os
import logging
import psycopg2

!pip install SQLAlchemy psycopg2-binary
import seaborn as sns
import matplotlib.pyplot as p

import json

import statsmodels.api as sm
from statsmodels.formula.api import ols

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

import matplotlib.pyplot as plt

from datetime import timedelta
import holidays

!pip install statsmodels
import statsmodels.api as sm





In [2]:
# Update github

def colab_to_github(notebook_path, github_repo, folder_path=None, commit_message=None, branch="main"):
    """Commit a notebook file to a GitHub repository, updating or creating it.

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable.
    Every failure is reported with ``print`` rather than raised, so a failed
    upload never interrupts the rest of the notebook.

    Args:
        notebook_path: Local path of the notebook file to upload.
        github_repo: Repository identifier passed to ``Github.get_repo``
            (e.g. ``"owner/name"``).
        folder_path: Optional folder inside the repository; when omitted the
            file is written at the repository root.
        commit_message: Optional commit message; defaults to a timestamped
            "Auto-commit from Colab" message.
        branch: Branch to commit to.

    Returns:
        None.
    """
    try:
        # Local import keeps this block self-contained; PyGithub is already a
        # dependency of the notebook (``from github import Github``).
        from github import UnknownObjectException

        print("Fetching GitHub token...")
        token = os.getenv('GITHUB_TOKEN')
        if not token:
            raise ValueError("GitHub token is missing or invalid. Ensure it is set as an environment variable.")

        # Add debug logging (only showing first few chars for security)
        print(f"Token format check - starts with: {token[:4]}")

        print("Token successfully retrieved.")
        g = Github(token)
        repo = g.get_repo(github_repo)
        print(f"Connected to repository: {github_repo}")

        if not commit_message:
            commit_message = f"Auto-commit from Colab: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        print(f"Using commit message: {commit_message}")

        # Notebooks are JSON documents; read them as UTF-8 regardless of locale.
        with open(notebook_path, 'r', encoding='utf-8') as file:
            notebook_content = file.read()
        print(f"Notebook content read from {notebook_path}")

        filename = os.path.basename(notebook_path)
        # Construct the full file path including the folder if specified
        file_path = f"{folder_path}/{filename}" if folder_path else filename
        print(f"Target file path in repo: {file_path}")

        # Only a "not found" answer means the file has to be created. Any
        # other failure (bad credentials, rate limit, rejected update) must
        # not fall through to create_file, which would hide the real error.
        print(f"Checking if file exists at {file_path}...")
        try:
            existing_file = repo.get_contents(file_path, ref=branch)
        except UnknownObjectException:
            existing_file = None

        if existing_file is None:
            print(f"File does not exist at {file_path}. Attempting to create...")
            repo.create_file(
                path=file_path,
                message=commit_message,
                content=notebook_content,
                branch=branch
            )
            print(f"File created successfully in branch '{branch}'.")
        else:
            repo.update_file(
                path=file_path,
                message=commit_message,
                content=notebook_content,
                sha=existing_file.sha,
                branch=branch
            )
            print(f"File updated successfully in branch '{branch}'.")

    except Exception as e:
        # Deliberate best-effort: report and carry on.
        print(f"Error occurred: {e}")

# Read the GitHub token from the Colab secrets store, strip an optional
# leading "token " marker and surrounding whitespace, and expose it through
# the GITHUB_TOKEN environment variable that colab_to_github() reads.
raw_token = userdata.get('GITHUB_TOKEN')
cleaned_token = raw_token.replace('token ', '').strip()
# Only the first four characters are printed, as a format check.
print(f"Cleaned token starts with: {cleaned_token[:4]}")

os.environ['GITHUB_TOKEN'] = cleaned_token

# Call the function with your parameters
notebook_path = "/content/drive/MyDrive/Colab Notebooks/OCCP.ipynb"
github_repo = "davidelgas/DataSciencePortfolio"  # This is the correct repository path
folder_path = "OCCP"  # This specifies the directory within the repository
commit_message = "Updated notebook from Colab"

# Upload the notebook; failures are printed by colab_to_github, not raised.
colab_to_github(notebook_path, github_repo, folder_path, commit_message)

Cleaned token starts with: ghp_
Fetching GitHub token...
Token format check - starts with: ghp_
Token successfully retrieved.
Connected to repository: davidelgas/DataSciencePortfolio
Using commit message: Updated notebook from Colab
Notebook content read from /content/drive/MyDrive/Colab Notebooks/OCCP.ipynb
Target file path in repo: OCCP/OCCP.ipynb
Checking if file exists at OCCP/OCCP.ipynb...
File updated successfully in branch 'main'.


## Ingest data

In [67]:
# import logs
# These are from Splunk logs and are a 1% sample due to size

import pandas as pd
import numpy as np
import json

def load_file(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def concatenate_files(file_paths):
    """Load several CSV files and stack them into a single DataFrame.

    Files that load as empty frames are left out of the concatenation.

    Args:
        file_paths: Iterable of CSV paths, each read with ``load_file``.

    Returns:
        One DataFrame with a fresh 0..n-1 index. When no file has any rows,
        an empty frame is returned instead of letting ``pd.concat`` raise
        ``ValueError`` on an empty list: the header of the first loaded file
        is kept if there is one, otherwise a column-less empty frame.
    """
    loaded = []
    for file_path in file_paths:
        loaded.append(load_file(file_path))

    non_empty = [frame for frame in loaded if not frame.empty]
    if non_empty:
        return pd.concat(non_empty, ignore_index=True)

    if loaded:
        # Header-only inputs: keep the column names so downstream column
        # lookups still resolve.
        return loaded[0].head(0).reset_index(drop=True)

    return pd.DataFrame()

def expand_json(df, json_column):
    """Expand the JSON payload in ``json_column`` into flat columns.

    For every row the JSON string is parsed and the first ``meterValue``
    entry is inspected: its ``timestamp`` plus the ``value`` and ``unit`` of
    its first ``sampledValue`` entry are extracted.

    Rows whose payload is missing, or whose ``meterValue`` / ``sampledValue``
    list is absent, null or empty, yield ``None`` for the affected fields
    rather than raising ``IndexError`` / ``TypeError``. Malformed JSON text
    still raises from ``json.loads``.

    Args:
        df: Frame containing ``property_id``, ``user_id`` and ``json_column``.
        json_column: Name of the column holding the JSON strings.

    Returns:
        A new DataFrame indexed like ``df`` with the columns
        ``property_id``, ``user_id``, ``timestamp``, ``value`` and ``unit``.
    """
    def _first_dict(items):
        """Return the first element of a non-empty list if it is a dict, else {}."""
        if isinstance(items, list) and items and isinstance(items[0], dict):
            return items[0]
        return {}

    def _extract(raw):
        """Return (timestamp, value, unit) for one raw JSON cell."""
        payload = json.loads(raw) if pd.notna(raw) else {}
        if not isinstance(payload, dict):
            return None, None, None
        meter = _first_dict(payload.get('meterValue'))
        sample = _first_dict(meter.get('sampledValue'))
        return meter.get('timestamp'), sample.get('value'), sample.get('unit')

    # Parse each payload exactly once and fan the three fields out afterwards.
    extracted = [_extract(raw) for raw in df[json_column]]

    expanded = pd.DataFrame(index=df.index)
    expanded['property_id'] = df['property_id']
    expanded['user_id'] = df['user_id']
    expanded['timestamp'] = pd.Series([row[0] for row in extracted], index=df.index)
    expanded['value'] = pd.Series([row[1] for row in extracted], index=df.index)
    expanded['unit'] = pd.Series([row[2] for row in extracted], index=df.index)

    return expanded

def clean_ids(df):
    """Keep only rows that have a ``property_id`` and renumber the index from 0."""
    has_property_id = df['property_id'].notna()
    kept = df.loc[has_property_id]
    return kept.reset_index(drop=True)

def save_df(df, filepath):
    """Pickle ``df`` to ``filepath`` with a ``.pkl`` suffix appended."""
    target = filepath + '.pkl'
    df.to_pickle(target)

def process_logs(file_paths, output_path):
    """Run the log pipeline: load, flatten the JSON message, filter, save.

    Args:
        file_paths: CSV files to load and concatenate.
        output_path: Destination path (without the ``.pkl`` suffix) for the
            pickled result.

    Returns:
        The processed log DataFrame that was saved.
    """
    combined = concatenate_files(file_paths)

    # Flatten the 'cleaned_message' JSON, then drop rows lacking a property_id.
    cleaned = clean_ids(expand_json(combined, 'cleaned_message'))

    save_df(cleaned, output_path)
    return cleaned

# Entry point for the log ingestion cell: processes the three monthly sample
# exports and leaves the result both on disk (df_logs.pkl) and in `df_logs`.
if __name__ == "__main__":
    # Monthly sample exports (September, October, November).
    file_paths = [
        '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/sept_100_sample.csv',
        '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/oct_100_sample.csv',
        '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/nov_100_sample.csv'
    ]

    # save_df() appends '.pkl' to this base path.
    output_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_logs'

    # Run complete workflow
    df_logs = process_logs(file_paths, output_path)


In [185]:
# Reload the processed logs from the pickle written by process_logs() and
# summarise the columns, non-null counts and dtypes.
df_logs = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_logs.pkl')
df_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319124 entries, 0 to 1319123
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   property_id  1319124 non-null  object
 1   user_id      1268308 non-null  object
 2   timestamp    1319124 non-null  object
 3   value        1319124 non-null  object
 4   unit         1319124 non-null  object
dtypes: object(5)
memory usage: 50.3+ MB


In [178]:
# import property table
import pandas as pd

def process_properties(file_path, output_path):
    """Load the properties CSV, clean it and pickle the result.

    Steps: read the CSV, drop rows without an ``id``, rename ``id`` to
    ``property_id`` and write the frame to ``output_path + '.pkl'``.

    Args:
        file_path: Path of the properties CSV.
        output_path: Destination path without the ``.pkl`` suffix.

    Returns:
        The cleaned properties DataFrame that was saved.
    """
    # Load CSV
    df_prop = pd.read_csv(file_path)

    # Clean IDs
    df_prop = df_prop[df_prop['id'].notna()].reset_index(drop=True)

    # Rename id column so it can be joined against df_logs.property_id
    df_prop = df_prop.rename(columns={'id': 'property_id'})

    # Save processed data to the path given by the caller (not a notebook global)
    df_prop.to_pickle(output_path + '.pkl')

    # Return the frame built here, not an unrelated global
    return df_prop


# Entry point for the property-table ingestion cell.
if __name__ == "__main__":
    input_path_3 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/properties.csv'
    # '.pkl' is appended inside process_properties().
    output_path_3 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop'

    # Run workflow
    df_prop = process_properties(input_path_3, output_path_3)


In [1]:
# Reload the cleaned property table from its pickle and summarise it.
# Requires pandas to be imported as `pd` in the current kernel; the recorded
# run of this cell failed with NameError because it was not.
df_prop = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop.pkl')
df_prop.info()

NameError: name 'pd' is not defined

In [189]:
# Diagnostic: can df_logs and df_prop be joined on property_id?
# A left merge with indicator=True adds a `_merge` column that marks each
# df_logs row as 'both' (matched in df_prop) or 'left_only' (no match).
df_merged = pd.merge(df_logs, df_prop,
                    left_on='property_id',
                    right_on='property_id',
                    how='left',
                    indicator=True)

# Count matches and non-matches
matches = df_merged[df_merged['_merge'] == 'both']
non_matches = df_merged[df_merged['_merge'] == 'left_only']

print("Total rows in df_log:", len(df_logs))
print("Rows that matched:", len(matches))
print("Rows that did not match:", len(non_matches))
print("\nPercentage matched: {:.2f}%".format(len(matches) / len(df_logs) * 100))

# Non-matching property_id values from df_logs
# NOTE(review): in the recorded output most unmatched values are two UUIDs
# joined by a newline character, which looks like a data-quality issue in the
# source logs rather than missing properties - confirm upstream.
unmatched_property_ids_logs = non_matches['property_id'].unique()
print("Unique unmatched property_id values in df_logs:")
print(unmatched_property_ids_logs)
print("Number of unique unmatched property_id values:", len(unmatched_property_ids_logs))

# Optional: Save unmatched property_ids to CSV
pd.DataFrame(unmatched_property_ids_logs, columns=['property_id']).to_csv(
    '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/unmatched_property_ids.csv',
    index=False
)



Total rows in df_log: 1319124
Rows that matched: 1318319
Rows that did not match: 805

Percentage matched: 99.94%
Unique unmatched property_id values in df_logs:
['9c1bc7da-9235-4b6a-92ed-51bbdad719db'
 'b5da81dd-e999-4e23-a8d0-466e92b34576\na7dc9a96-91f3-40ea-aed2-582f3099fb03'
 '4d8f5977-bc0c-4bed-a2b6-690d8d0f93b4\n8471ffd3-bfaf-44a7-9135-dca30b64b73a'
 '54e7058c-0a1f-4d5a-8c64-31b0e948ace4\n348c41a5-baaa-4275-b8c5-4d9e6ba8f97d'
 '2360d17a-1103-4f67-9419-0b016df70fe9\n060c8da3-e20e-4b49-89e2-f0bc228bc9c9'
 '8bbdcff9-06b0-44cf-8c9f-a9ddce51a317\n24d7e3f1-9d63-457c-9f0c-a50b0bc08ebc'
 '24d7e3f1-9d63-457c-9f0c-a50b0bc08ebc\ne95523e6-3470-4a60-b586-ee715cd9f34b'
 '7044cd75-52ab-4044-aeaa-e0fdb02bc4d8\n38b28445-1cdb-467c-b5be-628b2902d03a'
 '8f411e48-b09c-4834-88e9-4b7f59e86130\nc4ab1149-c4a9-46ab-8bfd-3b471b70d6d9'
 '53e9cab2-aed2-4c35-ab39-e8375825e6fb\n4cb8af8c-e0ed-41d9-8c08-6e66c4125cdd'
 'e95523e6-3470-4a60-b586-ee715cd9f34b\n7dc180ff-efbe-4c31-8d05-f452ec7db3b2'
 '6d5e9f52-8dcd-4e

In [None]:


# Here are a few property_id values that are in df_logs but not in df_prop:
#   9c1bc7da-9235-4b6a-92ed-51bbdad719db
#   b5da81dd-e999-4e23-a8d0-466e92b34576 a7dc9a96-91f3-40ea-aed2-582f3099fb03
#   4d8f5977-bc0c-4bed-a2b6-690d8d0f93b4 8471ffd3-bfaf-44a7-9135-dca30b64b73a


#Check

import os
import logging
import psycopg2
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Each non-blank line is split on its first ``=``; key and value are
    stripped of surrounding whitespace. Blank lines and lines starting with
    ``#`` are skipped silently. Lines without ``=`` or with an empty key are
    skipped with a warning that names the line number only; the line text is
    never logged because it may contain a secret.

    Any exception (for example a missing file) is logged as an error and
    swallowed, so the caller is never interrupted.

    Args:
        path_to_credentials: Path of the credentials file.

    Returns:
        None.
    """
    try:
        with open(path_to_credentials, 'r', encoding='utf-8') as file:
            for line_num, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue

                # Split only on the first '=' so values may contain '='.
                key, separator, value = line.partition('=')
                key = key.strip()
                if not separator or not key:
                    logging.warning(
                        f"Skipping malformed line {line_num} in {path_to_credentials} (expected KEY=VALUE)."
                    )
                    continue

                os.environ[key] = value.strip()
        logging.info("Credentials loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")

# Load the credentials file into os.environ before reading the DB_* settings.
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Create connection parameters from environment variables.
# os.getenv returns None for any variable that was not set, so a missing
# entry shows up as a None value here rather than as an error.
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')  # Fetch the port from environment variables
}

# List of tables to process. Entries are table names (strings) that are
# interpolated into the SELECT statement and the output file name below.
tables = ['properties']


# Connect to the PostgreSQL database and, for each table, write a small
# "<table>_fields.csv" describing its columns (name, pandas dtype, example).
# Both handles are initialised first so the `finally` clause closes exactly
# what this run opened, even if connect() or cursor() fails part-way through
# or a `connection` variable is left over from an earlier run of the cell.
connection = None
cursor = None
try:
    connection = psycopg2.connect(**connection_params)
    cursor = connection.cursor()

    # Loop through each table name
    for table in tables:
        logging.info(f"Processing table: {table}")

        # Query to fetch the first few rows from the current table.
        # NOTE(review): the table name is interpolated into the SQL text. That
        # is acceptable only while `tables` is a hard-coded list; never fill
        # it from user input without quoting the identifier.
        query = f"SELECT * FROM {table} LIMIT 10;"
        cursor.execute(query)

        # Fetch the rows
        rows = cursor.fetchall()
        # Fetch the column headers
        column_names = [desc[0] for desc in cursor.description]

        # Create a DataFrame from the fetched data
        df = pd.DataFrame(rows, columns=column_names)

        # Prepare the transposed DataFrame: one row per source column
        transposed_data = {
            'Header': column_names,
            'Data Type': [df[col].dtype.name for col in column_names],  # Get the data type
            'Example': [df[col].iloc[0] if not df[col].empty else None for col in column_names]  # Example from the first row
        }

        df_transposed = pd.DataFrame(transposed_data)

        # Write the DataFrame to CSV
        output_csv_path = f'/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/{table}_fields.csv'
        df_transposed.to_csv(output_csv_path, index=False)
        logging.info(f"Data written to {output_csv_path} successfully.")

except Exception as error:
    # Covers connection, query and CSV-writing failures alike.
    logging.error(f"Database metadata export failed: {error}")

finally:
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        logging.info("Connection closed.")






In [65]:
# Load property_type metadata
# This is from AWS
import pandas as pd

def process_property_types(file_path, output_path):
    """Load the property-types CSV, clean it and pickle the result.

    Steps: read the CSV, drop rows without an ``id``, rename ``id`` to
    ``property_id`` and ``name`` to ``prop_type``, then write the frame to
    ``output_path + '.pkl'``.

    Args:
        file_path: Path of the property-types CSV.
        output_path: Destination path without the ``.pkl`` suffix.

    Returns:
        The cleaned property-types DataFrame that was saved.
    """
    # Load CSV
    df_prop_type = pd.read_csv(file_path)

    # Clean IDs
    df_prop_type = df_prop_type[df_prop_type['id'].notna()].reset_index(drop=True)

    # Rename columns
    df_prop_type = df_prop_type.rename(columns={
        'id': 'property_id',
        'name': 'prop_type'
    })

    # Save processed data to the path given by the caller (not a notebook global)
    df_prop_type.to_pickle(output_path + '.pkl')

    return df_prop_type

# Example usage
# Entry point for the property-type ingestion cell.
if __name__ == "__main__":
    input_path_1 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/property_types.csv'
    # '.pkl' is appended inside process_property_types().
    output_path_1 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop_type'

    # Run workflow
    df_prop_type = process_property_types(input_path_1, output_path_1)

In [171]:
# Ingest prop size data
# This is from Salesforce

import pandas as pd

def clean_record_id(record_id):
    """Return ``record_id`` as a string with every ``'zcrm_'`` occurrence removed.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(record_id):
        return record_id
    return str(record_id).replace('zcrm_', '')

def process_property_sizes(file_path, output_path):
    """Load the property-size export, normalise its IDs and pickle the result.

    Args:
        file_path: Path of the CSV export (read as latin-1).
        output_path: Destination path without the ``.pkl`` suffix.

    Returns:
        The cleaned property-size DataFrame that was saved.
    """
    raw = pd.read_csv(file_path, encoding='latin-1')

    # Shorter, join-friendly names for the two record-id columns.
    column_aliases = {
        'Record Id': 'Record_id_lg',
        'Record Id (Managed Account)': 'Record_id_js'
    }
    sizes = raw.rename(columns=column_aliases)

    # Strip the 'zcrm_' marker from the managed-account id, in place in the column.
    sizes['Record_id_js'] = sizes['Record_id_js'].apply(clean_record_id)

    # Keep only rows that have a managed-account id, renumbering from 0.
    has_account_id = sizes['Record_id_js'].notna()
    sizes = sizes[has_account_id].reset_index(drop=True)

    # Store the id as a generic object column.
    sizes = sizes.astype({'Record_id_js': 'object'})

    sizes.to_pickle(output_path + '.pkl')

    return sizes

# Example usage
# Entry point for the property-size ingestion cell.
if __name__ == "__main__":
   input_path_2 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/All_Viable_Accounts_JS.csv'
   # '.pkl' is appended inside process_property_sizes().
   output_path_2 = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop_size'

   # Run workflow
   df_prop_size = process_property_sizes(input_path_2, output_path_2)

In [173]:
# Reload the property-size table from the pickle written by
# process_property_sizes() and preview the first rows.
df_prop_size = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop_size.pkl')
df_prop_size.head()

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,Record_id_lg,Record_id_js,ID Number,Project Name,Managed Account Name,Total Parking Space Count,Chargers Allowed,Chargers Allowed Number,Activated Date,Project Stage 2,Primary Funding Sources,Secondary Funding Source,Core Parent
0,zcrm_3436570000137786029,3436570000137658197,3.44e+18,"665 Butte Ave, Big Bear Lake, CA 92315 - EV","665 Butte Ave, Big Bear Lake, CA 92315 - EV",4.0,1,1.0,,0. Missing Documents /or/ Site Not Ready,Cash,,E & J Worldwide LLC
1,zcrm_3436570000081274491,3436570000088955132,3.44e+18,"18111 Nordhoff Street, Northridge CA 91330 - C...","18111 Nordhoff Street, Northridge CA 91330 - C...",200.0,8,8.0,,0. Missing Documents /or/ Site Not Ready,Cash,,
2,zcrm_3436570000076671363,3436570000121879092,3.44e+18,"520 Media Pl. Sacramento, CA 95815 - SMUD 5 ch...","520 Media Pl. Sacramento, CA 95815 - SMUD 5 ch...",200.0,5,5.0,,0. Missing Documents /or/ Site Not Ready,SMUD Rebate,,Redwood Residential
3,zcrm_3436570000062920079,3436570000072666096,3.44e+18,6545 N 19th Avenue Phoenix AZ 85015 - SRP EV,6545 N 19th Avenue Phoenix AZ 85015 - SRP EV,136.0,20,20.0,,0. Missing Documents /or/ Site Not Ready,SRP Rebate,,Macroreal Commercial Inc
4,zcrm_3436570000062906661,3436570000105470072,3.44e+18,340 Hauser Blvd Los Angeles CA 90036 (Palazzo ...,"346 Hauser Blvd, Los Angeles, CA 90036 (Palazz...",800.0,40,8.0,,0. Missing Documents /or/ Site Not Ready,LADWP Rebate,Cash,AIR Communities


In [162]:
# Here are the DataFrames I'll be working with, reloaded from their pickles.
# NOTE(review): df_lookup.pkl and df_logs_parking.pkl are not written by any
# cell visible in this part of the notebook - confirm where they come from.

df_logs = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_logs.pkl')
df_prop_type = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop_type.pkl')
df_prop_size = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_prop_size.pkl')
df_lookup = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_lookup.pkl')
df_logs_parking = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/df_logs_parking.pkl')



## Join Logic
df_logs.property_id == df_lookup.property_id

df_lookup.property_id != df_prop_type.property_id

df_prop_type.name is the type of property to extract


df_prop_size.Record_id_js == df_lookup.managed_account_id



In [169]:
# Summarise df_lookup: column names, non-null counts and dtypes.
df_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   property_id           797 non-null    object 
 1   organization_id       797 non-null    object 
 2   name                  797 non-null    object 
 3   phone                 725 non-null    object 
 4   contact               751 non-null    object 
 5   longitude             791 non-null    float64
 6   latitude              791 non-null    float64
 7   watts_soft_limit      797 non-null    int64  
 8   property_type         786 non-null    object 
 9   note                  245 non-null    object 
 10  utility_provider      791 non-null    object 
 11  gateway_type          681 non-null    object 
 12  carrier_name          598 non-null    object 
 13  address_1             794 non-null    object 
 14  address_2             10 non-null     object 
 15  city                  7

In [170]:
# Spot check: show the df_lookup row(s) whose name is 'Western Jet'.
western_jet_record = df_lookup[df_lookup['name'] == 'Western Jet']

print(western_jet_record)

                             property_id  \
32  907a7e25-d890-4953-97ea-9ee9b994bc21   

                         organization_id         name phone contact  \
32  cea56dbe-bde9-4675-92b8-975687b8d3ed  Western Jet   NaN     NaN   

     longitude   latitude  watts_soft_limit  \
32 -118.485946  34.209651              1000   

                           property_type note  ... state    zip email  \
32  b94d385c-cd17-4a0f-83ec-a2368baa8aba  NaN  ...    CA  91406   NaN   

   has_editable_penalty   managed_account_id  \
32                False  3436570000034423029   

                          created_at                        updated_at  \
32  2024-03-06 05:23:08.827026+00:00  2024-03-22 23:40:14.196047+00:00   

   reboot_cron_schedule    account_id  hidden  
32                  NaN  3.436570e+18   False  

[1 rows x 26 columns]


In [168]:
# Cross-check against the raw export: load properties.csv directly and show
# the same 'Western Jet' record for comparison with df_lookup.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/properties.csv')
western_jet_record = df[df['name'] == 'Western Jet']
print(western_jet_record)

                                      id  \
32  907a7e25-d890-4953-97ea-9ee9b994bc21   

                         organization_id         name phone contact  \
32  cea56dbe-bde9-4675-92b8-975687b8d3ed  Western Jet   NaN     NaN   

     longitude   latitude  watts_soft_limit  \
32 -118.485946  34.209651              1000   

                           property_type note  ... state    zip email  \
32  b94d385c-cd17-4a0f-83ec-a2368baa8aba  NaN  ...    CA  91406   NaN   

   has_editable_penalty   managed_account_id  \
32                False  3436570000034423029   

                          created_at                        updated_at  \
32  2024-03-06 05:23:08.827026+00:00  2024-03-22 23:40:14.196047+00:00   

   reboot_cron_schedule    account_id  hidden  
32                  NaN  3.436570e+18   False  

[1 rows x 26 columns]


In [167]:
# Same 'Western Jet' filter again. This cell relies on `df` having been
# loaded from properties.csv by another cell beforehand.
western_jet_record = df[df['name'] == 'Western Jet']

print(western_jet_record)

3436570000034420000
3436570000034423029

                                      id  \
32  907a7e25-d890-4953-97ea-9ee9b994bc21   

                         organization_id         name phone contact  \
32  cea56dbe-bde9-4675-92b8-975687b8d3ed  Western Jet   NaN     NaN   

     longitude   latitude  watts_soft_limit  \
32 -118.485946  34.209651              1000   

                           property_type note  ... state    zip email  \
32  b94d385c-cd17-4a0f-83ec-a2368baa8aba  NaN  ...    CA  91406   NaN   

   has_editable_penalty   managed_account_id  \
32                False  3436570000034423029   

                          created_at                        updated_at  \
32  2024-03-06 05:23:08.827026+00:00  2024-03-22 23:40:14.196047+00:00   

   reboot_cron_schedule    account_id  hidden  
32                  NaN  3.436570e+18   False  

[1 rows x 26 columns]


In [154]:
# How many managed-account ids can be joined between the two tables?
# Get unique managed_account_id values from df_lookup (missing values dropped)
lookup_managed_account_ids = set(df_lookup.managed_account_id.dropna())
# Unique Record_id_js values from df_prop_size
prop_size_record_ids = set(df_prop_size.Record_id_js)

# Find matching values
# NOTE(review): set intersection compares values exactly, so both columns
# must hold the ids as the same Python type - confirm.
matching_values = lookup_managed_account_ids.intersection(prop_size_record_ids)

print("Total unique values in df_lookup.managed_account_id:", len(lookup_managed_account_ids))
print("Total unique values in df_prop_size.Record_id_js:", len(prop_size_record_ids))
print("Number of matching values:", len(matching_values))
print("Number of non-matching values in df_lookup.managed_account_id:",
      len(lookup_managed_account_ids) - len(matching_values))

Total unique values in df_lookup.managed_account_id: 755
Total unique values in df_prop_size.Record_id_js: 1249
Number of matching values: 643
Number of non-matching values in df_lookup.managed_account_id: 112


In [157]:
# Get sets of values that are failing the join
# These records were sent to JS to analyze
# (the two sets are rebuilt here so the cell does not depend on the previous one)
lookup_managed_account_ids = set(df_lookup.managed_account_id.dropna())
prop_size_record_ids = set(df_prop_size.Record_id_js)

# Find values in lookup that are not in prop_size
non_matching_ids = lookup_managed_account_ids - prop_size_record_ids

# Get the full records from df_lookup for these non-matching IDs
non_matching_records = df_lookup[df_lookup.managed_account_id.isin(non_matching_ids)]

# Save to CSV
non_matching_records.to_csv('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/non_matching_managed_account_ids.csv', index=False)

# The count printed is of unique ids, not of rows written to the CSV.
print("Number of non-matching managed_account_id values:", len(non_matching_ids))
print("CSV saved to: /content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/non_matching_managed_account_ids.csv")

Number of non-matching managed_account_id values: 112
CSV saved to: /content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/non_matching_managed_account_ids.csv


## Clean data

In [None]:
import pandas as pd
import os

# UDF to clean the 'user_id' column
def clean_user_id(df, column_name="user_id"):
    """Return ``df`` without the rows whose ``column_name`` value is missing."""
    return df.dropna(subset=[column_name])

# UDF to clean the 'timestamp' column
def clean_timestamp(df, column_name="timestamp"):
    """Parse ``column_name`` as datetimes and drop rows that fail to parse.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``) and the
    corresponding rows are removed.

    The column is assigned on a copy with ``df[col] = ...`` rather than
    ``df.loc[:, col] = ...``: the latter writes into the existing column in
    place, which mutates the caller's frame and, on recent pandas versions,
    keeps the original object dtype instead of producing a datetime column.

    Args:
        df: Input frame; it is not modified.
        column_name: Name of the timestamp column.

    Returns:
        A new DataFrame with a datetime column and no unparseable rows.
    """
    df_parsed = df.copy()
    df_parsed[column_name] = pd.to_datetime(df_parsed[column_name], errors='coerce')  # Convert to datetime
    df_cleaned = df_parsed.dropna(subset=[column_name])
    return df_cleaned

# UDF to clean the 'property_id' column
def clean_property_id(df, column_name="property_id"):
    """Return ``df`` without the rows whose ``column_name`` value is missing."""
    return df.dropna(subset=[column_name])

# UDF to clean the 'value_0' column
def clean_value_0(df, column_name="value_0"):
    """Replace missing values in ``column_name`` with 0 and cast it to int.

    The column is overwritten on ``df`` itself, and that same frame is returned.
    """
    filled = df[column_name].fillna(0)
    df[column_name] = filled.astype(int)
    return df

# UDF to clean the 'value_1' column
def clean_value_1(df, column_name="value_1"):
    """Replace missing values in ``column_name`` with 0 and cast it to int.

    The column is overwritten on ``df`` itself, and that same frame is returned.
    """
    filled = df[column_name].fillna(0)
    df[column_name] = filled.astype(int)
    return df

# UDF to remove rows where both unit_0 and unit_1 are null
def remove_null_units(df):
    """Drop rows where ``unit_0`` and ``unit_1`` are both missing."""
    unit_columns = ['unit_0', 'unit_1']
    return df.dropna(how='all', subset=unit_columns)

# UDF to rename 'value_0' to 'Watthrs' and 'value_1' to 'Amps'
def rename_values(df):
    """Return a copy of ``df`` with ``value_0`` -> ``Watthrs`` and ``value_1`` -> ``Amps``."""
    new_names = {'value_0': 'Watthrs', 'value_1': 'Amps'}
    return df.rename(columns=new_names)

# UDF to drop 'unit_0' and 'unit_1' columns
def drop_units(df):
    """Return ``df`` without the ``unit_0`` and ``unit_1`` columns."""
    return df.drop(['unit_0', 'unit_1'], axis=1)

# UDF to set data types
def set_data_types(df):
    """Cast the known log columns to their target dtypes.

    ``timestamp`` is parsed with ``pd.to_datetime(errors='coerce')`` and a
    warning is printed when values could not be parsed; the other columns
    are cast with ``astype``. Missing columns and failed conversions are
    reported with ``print`` and skipped.

    Columns are replaced with ``df[col] = ...`` rather than
    ``df.loc[:, col] = ...``: the latter writes into the existing column in
    place and, on recent pandas versions, keeps the old dtype, which made
    the conversion a no-op.

    Args:
        df: Frame to convert; its columns are replaced on the frame itself.

    Returns:
        The same DataFrame, with converted columns.
    """
    dtype_mapping = {
        "property_id": "object",
        "user_id": "object",
        "timestamp": "datetime64[ns]",
        "Watthrs": "int64",  # updated to reflect the renamed column
        "Amps": "int64",     # updated to reflect the renamed column
    }

    for column, dtype in dtype_mapping.items():
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame.")
            continue
        try:
            if dtype == "datetime64[ns]":
                df[column] = pd.to_datetime(df[column], errors='coerce')
                invalid_rows = df[column].isna().sum()
                if invalid_rows > 0:
                    print(f"Warning: {invalid_rows} invalid timestamps found in '{column}' and coerced to NaT.")
            else:
                df[column] = df[column].astype(dtype)
        except Exception as e:
            print(f"Error converting column '{column}' to type '{dtype}': {e}")
    return df

# Function to compact the data and combine Amps and Watthrs in the same row
def compact_data(df):
    """Collapse rows sharing the same key columns into one row per key.

    Rows are grouped by property_id, user_id, timestamp and property_type;
    within each group the maximum ``Watthrs`` and the maximum ``Amps`` are
    kept, so both readings end up on the same row. The resulting shape is
    printed.
    """
    key_columns = ['property_id', 'user_id', 'timestamp', 'property_type']
    grouped = df.groupby(key_columns, as_index=False)
    df_compacted = grouped.agg(
        Watthrs=('Watthrs', 'max'),
        Amps=('Amps', 'max'),
    )

    print(f"After compacting data, data shape: {df_compacted.shape}")
    return df_compacted

# Generalized function to clean a DataFrame
def clean_data_with_udfs_and_dtypes(df, cleaning_functions):
    """Apply a sequence of cleaning rules to ``df``, then fix its dtypes.

    Args:
        df: Frame to clean.
        cleaning_functions: Iterable of ``(function, column)`` pairs. When
            ``column`` is truthy the function is called as
            ``function(df, column_name=column)``, otherwise as ``function(df)``.

    Returns:
        The cleaned frame after ``set_data_types`` has been applied.
    """
    for func, col in cleaning_functions:
        print(f"Applying cleaning rule: {func.__name__} on column: {col}")
        df = func(df, column_name=col) if col else func(df)

    print("Setting data types...")
    return set_data_types(df)

# File paths of the merged log/property CSVs to clean.
# NOTE(review): no cell visible in this part of the notebook writes
# logs_with_properties.csv - confirm where it is produced.
file_paths = [
    '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties.csv'
]

# Define cleaning rules as (function, column) pairs, applied in this order.
# A column of None means the function is called with the frame only.
# Order matters: value_0/value_1 and unit_0/unit_1 must be processed before
# rename_values / drop_units rename and remove those columns.
cleaning_rules = [
    (clean_user_id, "user_id"),
    (clean_timestamp, "timestamp"),
    (clean_property_id, "property_id"),
    (clean_value_0, "value_0"),
    (clean_value_1, "value_1"),
    (remove_null_units, None),
    (rename_values, None),  # Renaming values columns
    (drop_units, None),     # Dropping unit columns
]

# Process each file: load, clean, compact, report and save next to the input.
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        continue  # Skip to the next file if this one isn't found

    # Clean a copy so `df` keeps the raw rows for the before/after count below.
    df_cleaned = clean_data_with_udfs_and_dtypes(df.copy(), cleaning_rules)

    # Compact the data to ensure Watthrs and Amps are in the same row
    # (compact_data groups on 'property_type', so the input must carry that column)
    df_compacted = compact_data(df_cleaned)

    print("\nFinal DataFrame Info:")
    print(df_compacted.info())

    # Save the compacted DataFrame as <input name>_cleaned_compacted.csv
    output_path = file_path.replace(".csv", "_cleaned_compacted.csv")
    df_compacted.to_csv(output_path, index=False)

    print(f"Compacted file saved to: {output_path}")
    print(f"Rows before cleaning: {len(df)}, Rows after cleaning: {len(df_compacted)}\n")


In [None]:
# Data check

import pandas as pd
import numpy as np

# Function to count NaN and infinite values in the DataFrame
def count_nan_inf(df):
    """Report how many NaN and infinite values ``df`` contains.

    Prints the overall totals followed by the per-column breakdowns.

    Returns:
        A ``(nan_count, inf_count)`` tuple with the overall totals.
    """
    # Per-column tallies; the overall totals are their sums.
    nan_per_column = df.isna().sum()
    inf_per_column = (df == np.inf).sum() + (df == -np.inf).sum()

    nan_count = nan_per_column.sum()
    inf_count = inf_per_column.sum()

    print(f"NaN values: {nan_count}")
    print(f"Inf values: {inf_count}")

    print("\nNaN and Inf values per column:")
    print(nan_per_column)
    print("\nInfinite values per column:")
    print(inf_per_column)

    return nan_count, inf_count

# Load the cleaned, compacted CSV written by the cleaning cell
# (its output name is the input name with '_cleaned_compacted' appended).
file_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted.csv'
df = pd.read_csv(file_path)

# Run the check for NaN and Inf values; the per-column breakdown is printed.
count_nan_inf(df)



In [None]:
# Reload the cleaned, compacted logs and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted.csv')
df.head()

## Engineer Features

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

# Utility Functions
def add_day_info(df, timestamp_col='timestamp'):
    """Add ``day_of_week`` and ``day_weekend`` columns to ``df``.

    ``day_of_week`` is pandas' ``dayofweek`` (0 = Monday ... 6 = Sunday)
    shifted by one, i.e. 1 = Monday ... 7 = Sunday. ``day_weekend`` is 1 for
    Saturday (6) and Sunday (7) and 0 otherwise. Both columns are written
    onto ``df`` itself, which is also returned.
    """
    weekday_number = df[timestamp_col].dt.dayofweek + 1
    df['day_of_week'] = weekday_number
    df['day_weekend'] = (df['day_of_week'] >= 6).astype(int)
    return df

def calculate_days_to_nearest_holiday(df, date_col, holiday_dates):
    """Add a ``days_to_nearest_holiday`` column to ``df``.

    ``date_col`` is converted with ``pd.to_datetime``. The holidays are then
    aligned with that column's timezone-awareness: for a tz-aware column,
    naive holidays are localized to UTC; for a naive column, tz-aware
    holidays are converted to naive. For each row the result is the smallest
    absolute ``.days`` difference to any holiday.

    Args:
        df: Frame to annotate; modified on the frame itself and returned.
        date_col: Name of the datetime column.
        holiday_dates: Holiday timestamps exposing ``.tz``, ``.tz_localize``
            and ``.tz_convert`` (e.g. ``pd.Timestamp`` objects).
    """
    df[date_col] = pd.to_datetime(df[date_col])
    column_is_tz_aware = df[date_col].dt.tz is not None

    def _align(holiday):
        """Match one holiday's tz-awareness to that of the date column."""
        holiday_is_tz_aware = holiday.tz is not None
        if column_is_tz_aware:
            return holiday if holiday_is_tz_aware else holiday.tz_localize('UTC')
        return holiday.tz_convert(None) if holiday_is_tz_aware else holiday

    aligned_holidays = [_align(holiday) for holiday in holiday_dates]

    def _days_to_nearest(moment):
        """Smallest absolute whole-day offset between ``moment`` and any holiday."""
        return min(abs((moment - holiday).days) for holiday in aligned_holidays)

    df['days_to_nearest_holiday'] = df[date_col].apply(_days_to_nearest)
    return df

def add_datetime_components(df, timestamp_col='timestamp'):
    """Parse the timestamp column and add year / month / day / hour columns.

    Values that cannot be parsed become ``NaT`` and their rows are dropped.
    As before, the parsed timestamp column is also written back onto the
    frame that was passed in.

    Args:
        df: Frame holding ``timestamp_col``.
        timestamp_col: Name of the timestamp column.

    Returns:
        A new DataFrame without the unparseable rows and with the added
        ``year``, ``month``, ``day`` and ``hour`` columns.
    """
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')  # Coerce invalid datetime to NaT

    # Drop rows where the timestamp is NaT after coercion. The explicit copy
    # makes the result independent of the input frame, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    df = df.dropna(subset=[timestamp_col]).copy()

    # Extract the datetime components
    stamps = df[timestamp_col].dt
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['hour'] = stamps.hour

    return df

def encode_month_column(df, month_col='month'):
    """Add an integer ``month_encoded`` column derived from ``month_col``.

    The names September, October and November map to 9, 10 and 11; any value
    not in the mapping falls back to the original value, and the result is
    cast to int. The column is written onto ``df``, which is returned.
    """
    name_to_number = {'September': 9, 'October': 10, 'November': 11}
    original = df[month_col]
    mapped = original.map(name_to_number)
    df['month_encoded'] = mapped.fillna(original).astype(int)
    return df

def add_unique_user_counts(df, group_cols, user_col='user_id'):
    """Attach ``unique_user_count``: distinct ``user_col`` values per ``group_cols`` group.

    The per-group count is computed once and left-joined back, so every row of
    a group carries the same count. Returns a new DataFrame.
    """
    distinct_per_group = df.groupby(group_cols)[user_col].nunique()
    counts_frame = distinct_per_group.reset_index().rename(
        columns={user_col: 'unique_user_count'}
    )
    return df.merge(counts_frame, on=group_cols, how='left')

def add_usage_sums(df, group_cols, value_cols):
    """Attach per-group totals of the two columns named in ``value_cols``.

    The total of ``value_cols[0]`` is stored as ``hour_sum_value_Wh`` and the
    total of ``value_cols[1]`` as ``hour_sum_value_A``. Totals are left-joined
    back onto ``df`` by ``group_cols``. Returns a new DataFrame.
    """
    group_totals = df.groupby(group_cols)[value_cols].sum().reset_index()
    output_names = {
        value_cols[0]: 'hour_sum_value_Wh',
        value_cols[1]: 'hour_sum_value_A',
    }
    return df.merge(group_totals.rename(columns=output_names), on=group_cols, how='left')

# New function to encode 'property_type' column using label encoding
def encode_property_type(df):
    """Replace ``property_type`` with integer codes from a freshly fitted LabelEncoder.

    ``df`` is mutated in place and returned. The encoder is not kept, so the
    code-to-label mapping cannot be inverted afterwards.
    """
    encoded_types = LabelEncoder().fit_transform(df['property_type'])
    df['property_type'] = encoded_types
    return df

def engineer_data(df, timestamp_col, user_col, group_cols, value_cols, holiday_dates=None):
    """Run the full feature-engineering pipeline on one log DataFrame.

    Steps, in order: parse ``timestamp_col`` as UTC (invalid values become
    NaT), add weekday/weekend flags, optionally add the distance to the nearest
    holiday, add year/month/day/hour (rows without a valid timestamp are
    dropped there), add per-group unique-user counts and usage sums, and
    label-encode ``property_type``.

    ``group_cols`` may reference ``year``/``month``/``day``/``hour`` because
    those columns exist before the grouping steps run. The holiday step is
    skipped when ``holiday_dates`` is None or empty.
    """
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce', utc=True)

    enriched = add_day_info(df, timestamp_col)
    if holiday_dates:
        enriched = calculate_days_to_nearest_holiday(enriched, timestamp_col, holiday_dates)
    enriched = add_datetime_components(enriched, timestamp_col)
    enriched = add_unique_user_counts(enriched, group_cols, user_col)
    enriched = add_usage_sums(enriched, group_cols, value_cols)

    # Label-encode 'property_type' as the final step.
    return encode_property_type(enriched)

def process_files(file_paths, output_dir, group_cols, value_cols, timestamp_col='timestamp', user_col='user_id', holiday_dates=None):
    """Engineer features for each CSV in ``file_paths`` and save the results.

    Each input ``<name>.csv`` is read, passed through ``engineer_data`` and
    written to ``<output_dir>/<name>_eng_features.csv``.

    Returns the list of engineered DataFrames, in input order.
    """
    engineered_frames = []
    for source_path in file_paths:
        stem, _extension = os.path.splitext(os.path.basename(source_path))
        destination = os.path.join(output_dir, f"{stem}_eng_features.csv")
        print(f"Processing file: {source_path}")

        raw_frame = pd.read_csv(source_path)
        engineered = engineer_data(raw_frame, timestamp_col, user_col, group_cols, value_cols, holiday_dates)

        # Repeat the NaT filter as a safeguard before writing the file.
        engineered = engineered.dropna(subset=[timestamp_col])

        engineered.to_csv(destination, index=False)
        print(f"Processed file saved to: {destination}")
        engineered_frames.append(engineered)

    return engineered_frames

# Pipeline configuration and run.
# Input extract(s) and the folder that receives "<name>_eng_features.csv".
file_paths = ['/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted.csv']
output_dir = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/'
# Aggregation grain: one group per property per clock hour. year/month/day/hour
# are created inside engineer_data before the grouping steps use them.
group_cols = ['property_id', 'year', 'month', 'day', 'hour']
value_cols = ['Watthrs', 'Amps']  # Order matters: first -> hour_sum_value_Wh, second -> hour_sum_value_A
# NOTE(review): these dates do not match the 2024 US federal holidays
# (Labor Day 2 Sep, Columbus Day 14 Oct, Thanksgiving 28 Nov) - confirm which
# calendar they are meant to represent.
holiday_dates = [
    pd.Timestamp("2024-09-04", tz='UTC'),
    pd.Timestamp("2024-10-09", tz='UTC'),
    pd.Timestamp("2024-11-23", tz='UTC')
]

# Run the pipeline; each engineered frame is also written to output_dir.
processed_dfs = process_files(
    file_paths=file_paths,
    output_dir=output_dir,
    group_cols=group_cols,
    value_cols=value_cols,
    timestamp_col='timestamp',
    user_col='user_id',
    holiday_dates=holiday_dates
)


In [None]:

import pandas as pd
import numpy as np

# Function to count NaN and infinite values in the DataFrame
def count_nan_inf(df):
    """Print and return how many NaN and +/-infinity values ``df`` contains.

    Prints the two grand totals followed by a per-column breakdown of each.

    Returns
    -------
    tuple
        ``(nan_count, inf_count)`` totals over the whole frame.
    """
    # Per-column tallies are computed once and reused for totals and printing.
    nan_per_column = df.isna().sum()
    inf_per_column = (df == np.inf).sum() + (df == -np.inf).sum()

    nan_count = nan_per_column.sum()
    inf_count = inf_per_column.sum()

    print(f"NaN values: {nan_count}")
    print(f"Inf values: {inf_count}")

    print("\nNaN and Inf values per column:")
    print(nan_per_column)
    print("\nInfinite values per column:")
    print(inf_per_column)

    return nan_count, inf_count

# Load the engineered-feature dataset (replace with the correct file path).
# NOTE(review): this cell defines count_nan_inf and loads df but never calls
# count_nan_inf(df) - confirm whether the call was dropped by accident.
file_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted_eng_features.csv'
df = pd.read_csv(file_path)



## Check for collinearity

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the engineered-feature dataset. The feature-engineering step names its
# output "<input stem>_eng_features.csv", so for the compacted extract the file
# is "..._cleaned_compacted_eng_features.csv" (the same file the other analysis
# cells read).
file_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted_eng_features.csv'
df = pd.read_csv(file_path)

# Numerical features to include in the VIF calculation
numerical_columns = [
    "day_of_week",
    "day_weekend",
    "days_to_nearest_holiday",
    "year",
    "month",
    "day",
    "hour",
    "unique_user_count",
    "hour_sum_value_A"
]

# Work on a copy so the loaded frame is left untouched
X = df[numerical_columns].copy()

# Report NaN and inf values before cleaning
print(f"NaN values before VIF calculation: {X.isna().sum().sum()}")
print(f"Inf values before VIF calculation: {((X == np.inf) | (X == -np.inf)).sum().sum()}")

# Replace NaN and +/-inf with 0 (a simple strategy; median/mean are alternatives)
X = X.fillna(0).replace([np.inf, -np.inf], 0)

# Confirm the cleaning worked. These counts are taken after cleaning but
# still before the VIF computation below.
print(f"NaN values after cleaning: {X.isna().sum().sum()}")
print(f"Inf values after cleaning: {((X == np.inf) | (X == -np.inf)).sum().sum()}")

# Add a constant column so each auxiliary regression includes an intercept
X['intercept'] = 1

# Calculate VIF for each column (including the intercept, removed below)
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# The intercept's own VIF is not of interest
vif_data = vif_data[vif_data["Feature"] != "intercept"]

# Display the VIF values
print(vif_data)


In [None]:
# Reload the engineered-feature dataset and print its column names, dtypes
# and non-null counts (replace with the correct file path if it moves).
file_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted_eng_features.csv'
df = pd.read_csv(file_path)
df.info()

# Prep df for regression/ANOVA

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load your dataset (replace with the correct file path)
file_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/logs_with_properties_cleaned_compacted_eng_features.csv'
df = pd.read_csv(file_path)

# Dependent variable
target_col = 'hour_sum_value_Wh'

# Identifier, raw-measurement and sibling-aggregate columns that must not be
# used as predictors
fields_to_drop = ['property_id', 'user_id', 'timestamp', 'hour_sum_value_A','Watthrs','Amps']

# Predictors: everything except the dropped fields AND the target itself.
# Leaving the target inside X would regress y on itself and produce a
# meaningless perfect fit.
X = df.drop(columns=fields_to_drop + [target_col])
y = df[target_col]

# Check types of X and y
print("X dtypes:\n", X.dtypes)
print("y dtype:", y.dtypes)

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression model
print(model.summary())


In [None]:
import numpy as np
# Simple linear regression of hourly amps on the unique-user count.
# NOTE(review): df_sampled is not created in any cell visible here - confirm
# where it is defined before relying on Restart & Run All.
X = sm.add_constant(df_sampled['unique_user_count'])  # Add intercept
model = sm.OLS(df_sampled['hour_sum_value_A'], X).fit()

# Fitted values, used to draw the regression line
predictions = model.predict(X)

# Plot scatter with regression line
plt.figure(figsize=(8, 6))
plt.scatter(df_sampled['unique_user_count'], df_sampled['hour_sum_value_A'], alpha=0.6, label='Data Points')
plt.plot(df_sampled['unique_user_count'], predictions, color='red', label='Regression Line')
plt.title('Regression Plot: Unique User Count vs Hour Sum Value 0')
plt.xlabel('Unique User Count')
plt.ylabel('Hour hour_sum_value_A')
plt.legend()
plt.show()


# Ensure property_id is treated as a categorical variable
# (this changes the dtype on df_sampled itself)
df_sampled['property_id'] = df_sampled['property_id'].astype('category')

# Build the ANOVA formula: user count, property, and every column whose name
# starts with 'day_'
independent_vars = ['unique_user_count', 'property_id'] + [col for col in df_sampled.columns if col.startswith('day_')]
formula = 'hour_sum_value_A ~ ' + ' + '.join(independent_vars)

# Fit the model (rebinds `model`, replacing the simple regression above)
model = ols(formula, data=df_sampled).fit()

# Perform ANOVA (typ=2 selects Type II sums of squares)
anova_results = sm.stats.anova_lm(model, typ=2)

# Display the ANOVA results
print(anova_results)


In [None]:
# Calculate the IQR for the column with potential outliers
Q1 = df_sampled['hour_sum_value_A'].quantile(0.25)
Q3 = df_sampled['hour_sum_value_A'].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: values beyond 1.5 * IQR from the quartiles count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows inside the fences (both bounds inclusive). The explicit
# copy makes df_filtered an independent frame, so later column assignments on
# it do not raise SettingWithCopyWarning or touch df_sampled.
df_filtered = df_sampled[df_sampled['hour_sum_value_A'].between(lower_bound, upper_bound)].copy()


In [None]:
## Create a property lookup

import os
import logging
import psycopg2
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Each line is split on the first ``=`` only, so values may themselves
    contain ``=``. Blank lines and lines starting with ``#`` are skipped
    silently. Any other malformed line is reported by line number only - its
    content is never logged, because it may hold a secret.

    Errors (for example a missing file) are logged rather than raised, so the
    caller must be prepared for the variables to be absent.
    """
    try:
        with open(path_to_credentials, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')  # Split only on the first '='
                key = key.strip()
                if not separator or not key:
                    logging.warning(f"Issue with line {line_num} in {path_to_credentials}: expected KEY=VALUE")
                    continue
                os.environ[key] = value.strip()
        logging.info("Credentials loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")

# Populate os.environ from the credentials file kept on Drive
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Build the psycopg2 connection arguments from the environment.
# os.getenv returns None for any variable that is not set.
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')  # Fetch the port from environment variables
}

# Function to fetch non-transposed data
def fetch_non_transposed_data(cursor, table):
    """Return every row of ``table`` as a DataFrame with the table's column names.

    ``table`` is interpolated directly into the SQL text, so only pass trusted,
    hard-coded table names.
    """
    cursor.execute(f"SELECT * FROM {table};")
    records = cursor.fetchall()
    # The first item of each description entry is the column name.
    headers = [column[0] for column in cursor.description]
    return pd.DataFrame(records, columns=headers)

# List of tables to process
tables = [
    "location",
    "properties"
]

# Start from a known state. Checking `'connection' in locals()` in a notebook
# can pick up a connection left behind by an earlier cell run, and `cursor`
# would be undefined if connection.cursor() itself failed.
connection = None
cursor = None

# Connect to the PostgreSQL database
try:
    connection = psycopg2.connect(**connection_params)
    cursor = connection.cursor()

    # Loop through each table name
    for table in tables:
        logging.info(f"Processing table: {table}")

        # Fetch the full table
        df_non_transposed = fetch_non_transposed_data(cursor, table)

        # Write the DataFrame to CSV as "<table>_table_extract.csv"
        output_csv_path = f'/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/{table}_table_extract.csv'
        df_non_transposed.to_csv(output_csv_path, index=False)
        logging.info(f"Data written to {output_csv_path} successfully.")

except Exception as error:
    # Covers connection, query and file-write failures alike
    logging.error(f"Error connecting to the database: {error}")

finally:
    # Close only what this run actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        logging.info("Connection closed.")



In [None]:
import numpy as np
# Fit the regression on the outlier-filtered data. The model, its predictions
# and the plotted x-values must all come from the same frame: predictions made
# on df_sampled have a different length from df_filtered's columns once any
# outlier has been removed, and plt.plot then fails.
X = sm.add_constant(df_filtered['unique_user_count'])  # Add intercept
model = sm.OLS(df_filtered['hour_sum_value_A'], X).fit()

# Predict values for regression line
predictions = model.predict(X)

# Plot scatter with regression line
plt.figure(figsize=(8, 6))
plt.scatter(df_filtered['unique_user_count'], df_filtered['hour_sum_value_A'], alpha=0.6, label='Data Points')
plt.plot(df_filtered['unique_user_count'], predictions, color='red', label='Regression Line')
plt.title('Regression Plot: Unique User Count vs Hour Sum Value 0')
plt.xlabel('Unique User Count')
plt.ylabel('Hour hour_sum_value_A')
plt.legend()
plt.show()


# Treat property_id as a categorical variable. assign() builds a new frame, so
# this works even when df_filtered is a slice of another DataFrame.
df_filtered = df_filtered.assign(property_id=df_filtered['property_id'].astype('category'))

# Build the ANOVA formula: user count, property, and every column whose name
# starts with 'day_'
independent_vars = ['unique_user_count', 'property_id'] + [col for col in df_filtered.columns if col.startswith('day_')]
formula = 'hour_sum_value_A ~ ' + ' + '.join(independent_vars)

# Fit the model
model = ols(formula, data=df_filtered).fit()

# Perform ANOVA (typ=2 selects Type II sums of squares)
anova_results = sm.stats.anova_lm(model, typ=2)

# Display the ANOVA results
print(anova_results)

In [None]:
# Decorate data with engineered values

from datetime import datetime
import pytz

# Function to convert to PST and extract datetime
def convert_to_pst_as_datetime(timestamp):
    """Convert a UTC timestamp string to US/Pacific time, truncated to the hour.

    ``timestamp`` must look like ``2024-08-01T12:34:56.789Z`` (fractional
    seconds and the trailing ``Z`` are required by the format string).
    Returns a timezone-aware ``datetime`` with minutes, seconds and
    microseconds set to zero.
    """
    # The input carries no offset information beyond the literal 'Z', so mark it as UTC.
    parsed_utc = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)
    pacific_zone = pytz.timezone('US/Pacific')
    local_time = parsed_utc.astimezone(pacific_zone)
    # Keep year, month, day and hour only.
    return local_time.replace(minute=0, second=0, microsecond=0)

# Convert every raw timestamp to an hour-truncated US/Pacific datetime
df_a_s_o['time_sample'] = df_a_s_o['timestamp'].apply(convert_to_pst_as_datetime)

# Add a column for day of the week (0 = Monday, 6 = Sunday)
df_a_s_o['day_of_week'] = df_a_s_o['time_sample'].dt.dayofweek

# Add a column for hour of the day (24hr format)
df_a_s_o['hour_of_day'] = df_a_s_o['time_sample'].dt.hour

# Add a column for ISO week number
df_a_s_o['week_number'] = df_a_s_o['time_sample'].dt.isocalendar().week

# The three aggregates below share one grain: (week, weekday, hour). transform()
# broadcasts each group's value back onto every row of that group.

# Distinct users seen in the (week, weekday, hour) slot
df_a_s_o['unique_user_count'] = (
    df_a_s_o
    .groupby(['week_number', 'day_of_week', 'hour_of_day'])['user_id']
    .transform('nunique')
)

# Total of unit_a in the slot
df_a_s_o['sum_of_unit_a'] = (
    df_a_s_o
    .groupby(['week_number', 'day_of_week', 'hour_of_day'])['unit_a']
    .transform('sum')
)

# Total of unit_wh in the slot
df_a_s_o['sum_of_unit_wh'] = (
    df_a_s_o
    .groupby(['week_number', 'day_of_week', 'hour_of_day'])['unit_wh']
    .transform('sum')
)

# Print the updated DataFrame
print(df_a_s_o)

In [None]:
# Data check: which ISO weeks are present, plus overall totals to compare
# against later aggregates.
print(df_a_s_o['week_number'].unique())


# Calculate the overall count of unique user IDs
unique_user_count = df_a_s_o['user_id'].nunique()

# Calculate the sum of unit_a
sum_of_unit_a = df_a_s_o['unit_a'].sum()

# Calculate the sum of unit_wh
sum_of_unit_wh = df_a_s_o['unit_wh'].sum()

# Print the results
print(f"Unique User Count: {unique_user_count}")
print(f"Sum of unit_a: {sum_of_unit_a}")
print(f"Sum of unit_wh: {sum_of_unit_wh}")

# Values recorded from an earlier run:
# Unique User Count: 1028
# Sum of unit_a: 84714332.39000002
# Sum of unit_wh: 57182938816884.78

In [None]:
# Column names, dtypes and non-null counts after adding the engineered columns
df_a_s_o.info()

In [None]:
# Collapse the row-level frame to one row per distinct combination of the time
# slot and its broadcast aggregates, keeping only those columns.
summary_columns = ['day_of_week', 'hour_of_day', 'week_number', 'unique_user_count', 'sum_of_unit_a', 'sum_of_unit_wh']
reduced_df = df_a_s_o[summary_columns].drop_duplicates()

# Display the resulting DataFrame
print(reduced_df.info())
print(reduced_df.head())


In [None]:

# Total of the per-slot unit_a sums in the reduced frame
sum_of_unit_a = reduced_df['sum_of_unit_a'].sum()

# Total of the per-slot unit_wh sums in the reduced frame
sum_of_unit_wh = reduced_df['sum_of_unit_wh'].sum()

# Print the results

print(f"Sum of unit_a: {sum_of_unit_a}")
print(f"Sum of unit_wh: {sum_of_unit_wh}")

# NOTE(review): the figures below are identical to the ones recorded for the
# row-level frame - confirm they were actually produced by this cell.
# Unique User Count: 1028
# Sum of unit_a: 84714332.39000002
# Sum of unit_wh: 57182938816884.78



In [None]:
# Write the decorated frame to Drive for manual inspection (index omitted)

df_a_s_o.to_csv('/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/df_a_s_o.csv', index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of per-slot unique users against per-slot Wh with a fitted regression
# line. Every row of df_a_s_o is plotted, so each slot's point is drawn once per
# underlying log row.
sns.regplot(x='unique_user_count', y='sum_of_unit_wh', data=df_a_s_o, scatter_kws={'alpha': 0.3}, line_kws={'color': 'red'})
plt.xlabel('User unique_user_count Count')
plt.ylabel('Total Unit WH')
plt.title('Regression Plot: User ID Count vs. Total Unit WH')
plt.show()

In [None]:
# Re-check the frame's columns and dtypes before the weekly-peak analysis
df_a_s_o.info()

In [None]:
# Data is week 32 through week 44 (13 ISO weeks)
# So below, there is no lag_1 value for week 32 because it is the first,
# leaving at most 12 usable rows.

# Identify the peak sum_of_unit_wh slot for each week. The per-slot aggregates
# live in reduced_df (one row per week/day/hour slot); the generic `df` name is
# rebound by several other cells and does not carry these columns.
peak_weekly_data = reduced_df.loc[reduced_df.groupby('week_number')['sum_of_unit_wh'].idxmax()]

# Sort by week number to ensure correct lagging
peak_weekly_data = peak_weekly_data.sort_values('week_number')

# Add only lag_1 features: the previous week's peak day and hour
peak_weekly_data['lag_1_day_of_week'] = peak_weekly_data['day_of_week'].shift(1)
peak_weekly_data['lag_1_hour'] = peak_weekly_data['hour_of_day'].shift(1)

# Drop only the rows that lack a lag (the first week). Restricting dropna to
# the lag columns avoids discarding weeks because of NaN in unrelated columns.
peak_weekly_data = peak_weekly_data.dropna(subset=['lag_1_day_of_week', 'lag_1_hour'])

# Retain only relevant columns
peak_weekly_data = peak_weekly_data[['week_number', 'day_of_week', 'hour_of_day', 'lag_1_day_of_week', 'lag_1_hour']]

print("Updated DataFrame:")
print(peak_weekly_data)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features: previous week's peak day and hour. Target: this week's peak day.
X = peak_weekly_data[['lag_1_day_of_week', 'lag_1_hour']]
y = peak_weekly_data['day_of_week']  # Target: Day of the week

# Train-test split (80% train, 20% test); random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier (seeded for reproducibility)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
# NOTE(review): peak_weekly_data holds one row per week, so the test set is
# very small - treat this accuracy as indicative only.
accuracy = accuracy_score(y_test, y_pred)
print("Day of Week Prediction Accuracy:", accuracy)

# Display true vs predicted values
results = pd.DataFrame({'True Day': y_test, 'Predicted Day': y_pred})
print("\nTrue vs Predicted Days of the Week:")
print(results)


In [None]:


# Feature importance for day_of_week classification
clf_importances = clf.feature_importances_
plt.barh(X.columns, clf_importances)
plt.title("Feature Importance for Day of Week Prediction")
plt.show()

# Feature importance for hour regression
# NOTE(review): no cell visible here trains a model named `reg`; on a fresh
# kernel this line raises NameError - confirm where the hour regressor is fit.
reg_importances = reg.feature_importances_
plt.barh(X.columns, reg_importances)
plt.title("Feature Importance for Hour Prediction")
plt.show()


## Extract from Eddie

In [None]:
# Daily extract for 2024-08-01.
# NOTE(review): read_csv uses the file's first row as the header by default; a
# later cell overwrites the column names, so if the file has no header row its
# first record is lost - confirm.
file_path_a = '/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/2024-08-01.csv'

df_big = pd.read_csv(file_path_a)



### Data Exploration

In [None]:
# Column names, dtypes and non-null counts of the raw daily extract
df_big.info()

In [None]:
# Column names for the daily extract, in file order. Assigning to .columns
# requires exactly as many names as the frame has columns (ten here).
header = [
    "qrcode",  # Column 0
    "connector",  # Column 1
    "serial_num",  # Column 2
    "org_id",  # Column 3
    "property_id",  # Column 4
    "station_id",  # Column 5
    "transaction_id",  # Column 6
    "metered_type",  # Column 7
    "timestamp",  # Column 8
    "metered_value"   # Column 9
]

df_big.columns = header

In [None]:
# Parse the timestamp column in place. The column is named 'timestamp'
# (lower case) by the header assignment; 'Timestamp' does not exist and
# raises KeyError.
df_big['timestamp'] = pd.to_datetime(df_big['timestamp'])


## Appendix

### Tables I can access

In [190]:

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Each line is split on the first ``=`` only, so values may themselves
    contain ``=``. Blank lines and lines starting with ``#`` are skipped
    silently. Any other malformed line is reported by line number only - its
    content is never logged, because it may hold a secret.

    Errors (for example a missing file) are logged rather than raised, so the
    caller must be prepared for the variables to be absent.
    """
    try:
        with open(path_to_credentials, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')  # Split only on the first '='
                key = key.strip()
                if not separator or not key:
                    logging.warning(f"Issue with line {line_num} in {path_to_credentials}: expected KEY=VALUE")
                    continue
                os.environ[key] = value.strip()
        logging.info("Credentials loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")

# Populate os.environ from the credentials file kept on Drive
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Build the psycopg2 connection arguments from the environment.
# os.getenv returns None for any variable that is not set.
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')  # Fetch the port from environment variables
}

# Start from a known state. Checking `'connection' in locals()` in a notebook
# can pick up a connection left behind by an earlier cell run, and `cursor`
# would be undefined if connection.cursor() itself failed.
connection = None
cursor = None

# Connect to the PostgreSQL database
try:
    connection = psycopg2.connect(**connection_params)
    cursor = connection.cursor()

    # Execute a query to fetch all table names in the 'public' schema
    query = """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public';
    """

    cursor.execute(query)
    tables = cursor.fetchall()

    # Each fetched row is a one-element tuple; print the table name
    for table in tables:
        print(table[0])

except Exception as error:
    # Covers connection and query failures alike
    print(f"Error connecting to the database: {error}")

finally:
    # Close only what this run actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("Connection closed.")


properties
adjustment
group_discount
group_discount_properties
payment
roles
station_credit_program
subscription
user_discount_properties
pos_device
users
location
station_history
vehicle
router
stripe_payment_intent
cluster_name
rfid_user
station_logs
global_setting
api_token
station_model
awsdms_ddl_audit
user_access
adr
audit
pricing
stations
credit_program
errors
gateway
gateway_ip_lease
refresh_token
net_device_ip_lease
ocpp_sub_session
property_types
maintenance_window
transaction
user_device
accounts
address
clusters
connectors
flyway_schema_history
net_devices
ocpp_session
organizations
panels
Connection closed.


In [None]:
# This creates a table of field names and sample values
import os
import logging
import psycopg2
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Each line is split on the first ``=`` only, so values may themselves
    contain ``=``. Blank lines and lines starting with ``#`` are skipped
    silently. Any other malformed line is reported by line number only - its
    content is never logged, because it may hold a secret.

    Errors (for example a missing file) are logged rather than raised, so the
    caller must be prepared for the variables to be absent.
    """
    try:
        with open(path_to_credentials, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')  # Split only on the first '='
                key = key.strip()
                if not separator or not key:
                    logging.warning(f"Issue with line {line_num} in {path_to_credentials}: expected KEY=VALUE")
                    continue
                os.environ[key] = value.strip()
        logging.info("Credentials loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")

# Populate os.environ from the credentials file kept on Drive
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Build the psycopg2 connection arguments from the environment.
# os.getenv returns None for any variable that is not set.
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')  # Fetch the port from environment variables
}

# List of tables to process
tables = [
    "group_discount_properties", "adjustment", "payment", "group_discount",
    "roles", "pos_device", "location", "station_credit_program", "station_history",
    "subscription", "user_discount_properties", "users", "router", "vehicle",
    "properties", "rfid_user", "stripe_payment_intent", "adr", "audit",
    "cluster_name", "global_setting", "station_logs", "station_model",
    "awsdms_ddl_audit", "user_access", "pricing", "stations", "gateway",
    "gateway_ip_lease", "errors", "credit_program", "maintenance_window",
    "refresh_token", "net_device_ip_lease", "ocpp_sub_session", "property_types",
    "user_device", "transaction", "address", "accounts", "net_devices",
    "organizations", "ocpp_session", "panels", "flyway_schema_history",
    "connectors", "clusters"
]


# "station_logs" is the big one. Would have the same fields/data as MeterValues data in Splunk.


# Start from a known state. Checking `'connection' in locals()` in a notebook
# can pick up a connection left behind by an earlier cell run, and `cursor`
# would be undefined if connection.cursor() itself failed.
connection = None
cursor = None

# Connect to the PostgreSQL database
try:
    connection = psycopg2.connect(**connection_params)
    cursor = connection.cursor()

    # Loop through each table name
    for table in tables:
        logging.info(f"Processing table: {table}")

        # Query to fetch the first few rows from the current table
        query = f"SELECT * FROM {table} LIMIT 10;"
        cursor.execute(query)

        # Fetch the rows
        rows = cursor.fetchall()
        # Fetch the column headers
        column_names = [desc[0] for desc in cursor.description]

        # Create a DataFrame from the fetched data
        df = pd.DataFrame(rows, columns=column_names)

        # One output row per source column: its name, the dtype pandas
        # inferred from the sample, and the first row's value (None if the
        # table is empty)
        transposed_data = {
            'Header': column_names,
            'Data Type': [df[col].dtype.name for col in column_names],  # Get the data type
            'Example': [df[col].iloc[0] if not df[col].empty else None for col in column_names]  # Example from the first row
        }

        df_transposed = pd.DataFrame(transposed_data)

        # Write the DataFrame to CSV as "<table>_fields.csv"
        output_csv_path = f'/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/{table}_fields.csv'
        df_transposed.to_csv(output_csv_path, index=False)
        logging.info(f"Data written to {output_csv_path} successfully.")

except Exception as error:
    # Covers connection, query and file-write failures alike; the first
    # failure aborts the remaining tables
    logging.error(f"Error connecting to the database: {error}")

finally:
    # Close only what this run actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        logging.info("Connection closed.")

In [None]:
# This creates a table of sample records

import os
import logging
import psycopg2
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Each line is split on the first ``=`` only, so values may themselves
    contain ``=``. Blank lines and lines starting with ``#`` are skipped
    silently. Any other malformed line is reported by line number only - its
    content is never logged, because it may hold a secret.

    Errors (for example a missing file) are logged rather than raised, so the
    caller must be prepared for the variables to be absent.
    """
    try:
        with open(path_to_credentials, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')  # Split only on the first '='
                key = key.strip()
                if not separator or not key:
                    logging.warning(f"Issue with line {line_num} in {path_to_credentials}: expected KEY=VALUE")
                    continue
                os.environ[key] = value.strip()
        logging.info("Credentials loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")

# Populate os.environ from the credentials file kept on Drive
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Build the psycopg2 connection arguments from the environment.
# os.getenv returns None for any variable that is not set.
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')  # Fetch the port from environment variables
}

# Function to fetch non-transposed data
def fetch_non_transposed_data(cursor, table):
    """Return up to ten sample rows of ``table`` as a DataFrame with its column names.

    ``table`` is interpolated directly into the SQL text, so only pass trusted,
    hard-coded table names.
    """
    cursor.execute(f"SELECT * FROM {table} LIMIT 10;")
    records = cursor.fetchall()
    # The first item of each description entry is the column name.
    headers = [column[0] for column in cursor.description]
    return pd.DataFrame(records, columns=headers)

# List of tables to process
tables = [
    "group_discount_properties", "adjustment", "payment", "group_discount",
    "roles", "pos_device", "location", "station_credit_program", "station_history",
    "subscription", "user_discount_properties", "users", "router", "vehicle",
    "properties", "rfid_user", "stripe_payment_intent", "adr", "audit",
    "cluster_name", "global_setting", "station_logs", "station_model",
    "awsdms_ddl_audit", "user_access", "pricing", "stations", "gateway",
    "gateway_ip_lease", "errors", "credit_program", "maintenance_window",
    "refresh_token", "net_device_ip_lease", "ocpp_sub_session", "property_types",
    "user_device", "transaction", "address", "accounts", "net_devices",
    "organizations", "ocpp_session", "panels", "flyway_schema_history",
    "connectors", "clusters"
]

# Start from a known state. Checking `'connection' in locals()` in a notebook
# can pick up a connection left behind by an earlier cell run, and `cursor`
# would be undefined if connection.cursor() itself failed.
connection = None
cursor = None

# Connect to the PostgreSQL database
try:
    connection = psycopg2.connect(**connection_params)
    cursor = connection.cursor()

    # Loop through each table name
    for table in tables:
        logging.info(f"Processing table: {table}")

        # Fetch the sample rows
        df_non_transposed = fetch_non_transposed_data(cursor, table)

        # Write the DataFrame to CSV as "<table>_example_data.csv"
        output_csv_path = f'/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/{table}_example_data.csv'
        df_non_transposed.to_csv(output_csv_path, index=False)
        logging.info(f"Data written to {output_csv_path} successfully.")

except Exception as error:
    # Covers connection, query and file-write failures alike; the first
    # failure aborts the remaining tables
    logging.error(f"Error connecting to the database: {error}")

finally:
    # Close only what this run actually opened
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        logging.info("Connection closed.")


### Create a table for all property info

In [None]:
import pandas as pd
import os

# Inputs (exported table CSVs) and the output lookup file
properties_file = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/properties.csv'
property_types_file = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/property_types.csv'
output_file = '/content/drive/MyDrive/Colab Notebooks/Data_sets/OCCP/property_lookup_2.csv'

# Fail early with a clear message if either input is missing
if not os.path.exists(properties_file):
    raise FileNotFoundError(f"File not found: {properties_file}")
if not os.path.exists(property_types_file):
    raise FileNotFoundError(f"File not found: {property_types_file}")

properties = pd.read_csv(properties_file)
property_types = pd.read_csv(property_types_file)

# Normalize column names to lowercase and strip whitespace
properties.columns = properties.columns.str.strip().str.lower()
property_types.columns = property_types.columns.str.strip().str.lower()

# Left join: keep every property and attach its type record where one exists.
# Columns present in both frames get the '_property' / '_type' suffixes.
property_lookup = properties.merge(
    property_types,
    how='left',  # Use 'left' join to keep all rows from properties and add property_type name where available
    left_on='property_type',  # Assuming 'property_type' is the column in properties.csv
    right_on='id',  # Assuming 'id' is the column in property_types.csv
    suffixes=('_property', '_type')
)

# Replace the numeric type key with the type's name.
# NOTE(review): 'name_type' exists only if both inputs have a 'name' column
# (that is what triggers the suffix) - confirm against the CSVs.
property_lookup['property_type'] = property_lookup['name_type']

# Drop the 'name_type' column, since we already added it as 'property_type'
property_lookup = property_lookup.drop(columns=['name_type'])

# Rename 'id_property' column to 'property_id'
# (rename silently does nothing if 'id_property' is absent)
property_lookup = property_lookup.rename(columns={'id_property': 'property_id'})

# Save the resulting DataFrame to CSV
property_lookup.to_csv(output_file, index=False)
print(f"Property lookup table saved to {output_file}")


# Now I need to build the correct table directly from RS

In [None]:
import os
import pandas as pd
import logging
from itertools import combinations

# Configure logging
logging.basicConfig(level=logging.INFO)

# Path to the directory containing the "<table>_example_data.csv" files.
# NOTE(review): the sample-export cell writes those files under
# Data_sets/OCCP/, not Data_sets/Chargie/ - confirm this directory is correct,
# otherwise every load below fails and no joins are reported.
data_dir = '/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/'

# Tables to analyse (same list as the export cells)
tables = [
    "group_discount_properties", "adjustment", "payment", "group_discount",
    "roles", "pos_device", "location", "station_credit_program", "station_history",
    "subscription", "user_discount_properties", "users", "router", "vehicle",
    "properties", "rfid_user", "stripe_payment_intent", "adr", "audit",
    "cluster_name", "global_setting", "station_logs", "station_model",
    "awsdms_ddl_audit", "user_access", "pricing", "stations", "gateway",
    "gateway_ip_lease", "errors", "credit_program", "maintenance_window",
    "refresh_token", "net_device_ip_lease", "ocpp_sub_session", "property_types",
    "user_device", "transaction", "address", "accounts", "net_devices",
    "organizations", "ocpp_session", "panels", "flyway_schema_history",
    "connectors", "clusters"
]

# Function to load CSV files into DataFrames
def load_dataframes(tables):
    """Read ``<data_dir>/<table>_example_data.csv`` for every name in ``tables``.

    Returns a dict mapping table name to DataFrame. A table whose file cannot
    be read is logged as an error and left out of the dict.
    """
    loaded = {}
    for table_name in tables:
        source = os.path.join(data_dir, f"{table_name}_example_data.csv")
        try:
            loaded[table_name] = pd.read_csv(source)
        except Exception as e:
            logging.error(f"Error loading data for table {table_name}: {e}")
        else:
            logging.info(f"Loaded data for table: {table_name}")
    return loaded

# Function to find strict join matches
def find_strict_joins(df1, df2, table1_name, table2_name):
    """Find column pairs on which every row of df1 matches exactly one row of df2.

    A pair ``(col1, col2)`` is reported when both columns share a dtype, no
    value of ``df1[col1]`` is null, and each value occurs exactly once in
    ``df2[col2]``. This is the condition under which an inner join returns
    ``len(df1)`` rows *because every row matched*. Comparing row counts of an
    actual merge can be fooled when duplicate keys on the right compensate for
    unmatched rows on the left, and merging on low-cardinality columns can
    blow up memory.

    Args:
        df1: Left DataFrame (the one whose rows must all match).
        df2: Right DataFrame.
        table1_name: Name of the left table, used in log messages only.
        table2_name: Name of the right table, used in log messages only.

    Returns:
        list of ``(col1, col2)`` tuples, in column order.
    """
    strict_joins = []
    if df1.empty:
        # With no rows every pair would match vacuously; report nothing.
        return strict_joins

    # Per right-hand column: the non-null values that occur exactly once.
    single_match_keys = {}

    for col1 in df1.columns:
        left = df1[col1]
        left_complete = not left.isna().any()
        for col2 in df2.columns:
            right = df2[col2]
            if left.dtype != right.dtype:
                continue
            if col2 not in single_match_keys:
                present = right.dropna()
                single_match_keys[col2] = present[~present.duplicated(keep=False)]
            if left_complete and left.isin(single_match_keys[col2]).all():
                strict_joins.append((col1, col2))
                logging.info(f"Strict join success: {table1_name}.{col1} <-> {table2_name}.{col2}")
    return strict_joins

# Main function to perform the strict join analysis
def analyze_strict_joins(tables):
    """Run find_strict_joins over every unordered pair of tables.

    Args:
        tables: List of table names to load and compare.

    Returns:
        dict keyed by ``"<table1> <-> <table2>"`` whose values are the
        non-empty lists of matching column pairs. Pairs with a table that
        failed to load are skipped with a warning.
    """
    frames = load_dataframes(tables)
    results = {}

    for left_name, right_name in combinations(tables, 2):
        left_df = frames.get(left_name)
        right_df = frames.get(right_name)

        # Guard clause: at least one side did not load
        if left_df is None or right_df is None:
            logging.warning(f"Data for {left_name} or {right_name} is missing. Skipping.")
            continue

        logging.info(f"Analyzing strict joins between {left_name} and {right_name}")
        matched_columns = find_strict_joins(left_df, right_df, left_name, right_name)
        if matched_columns:
            results[f"{left_name} <-> {right_name}"] = matched_columns

    return results

# Execute the pairwise analysis over every table in `tables`
strict_join_results = analyze_strict_joins(tables)

# Report each matching column pair, qualified with the table it belongs to
for table_pair, joins in strict_join_results.items():
    print(f"\nStrict joins for {table_pair}:")
    left_table, right_table = table_pair.split(' <-> ')[:2]
    for col1, col2 in joins:
        print(f"Columns: {left_table}.{col1} <-> {right_table}.{col2}")

if len(strict_join_results) == 0:
    print("No strict joins found.")


In [None]:
import os
import logging
import pandas as pd
from sqlalchemy import create_engine

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load credentials from file
def load_credentials(path_to_credentials):
    """Load ``KEY=VALUE`` pairs from a text file into ``os.environ``.

    Blank lines and lines starting with ``#`` are ignored. Every other line is
    split on its first ``=``; key and value are stripped of surrounding
    whitespace. Lines without ``=`` or with an empty key are skipped with a
    warning that names only the line number, never the content, because the
    line may hold a secret.

    Args:
        path_to_credentials: Path of the credentials file.

    Returns:
        int: Number of environment variables set. Read errors are logged, not
        raised, so the count may be partial (0 if the file cannot be opened).
    """
    loaded = 0
    try:
        with open(path_to_credentials, 'r') as file:
            for line_num, line in enumerate(file, start=1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')  # Split only on the first '='
                key = key.strip()
                if not separator or not key:
                    logging.warning(f"Skipping malformed line {line_num} in {path_to_credentials}")
                    continue
                os.environ[key] = value.strip()
                loaded += 1
    except Exception as e:
        logging.error(f"Error loading credentials: {str(e)}")
        return loaded
    logging.info("Credentials loaded successfully.")
    return loaded

# Load the DB_* settings from the credentials file into the environment
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/aws_credentials.txt'
load_credentials(path_to_credentials)

# Create connection string for SQLAlchemy.
# User name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside a password cannot corrupt the URL.
from urllib.parse import quote_plus

_db_settings = {
    name: os.getenv(name)
    for name in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT', 'DB_NAME')
}
_missing_settings = [name for name, value in _db_settings.items() if not value]
if _missing_settings:
    # Only the setting names are logged, never their values.
    logging.error(f"Missing database settings: {', '.join(_missing_settings)}")

connection_string = (
    f"postgresql+psycopg2://{quote_plus(str(_db_settings['DB_USER']))}"
    f":{quote_plus(str(_db_settings['DB_PASSWORD']))}"
    f"@{_db_settings['DB_HOST']}:{_db_settings['DB_PORT']}/{_db_settings['DB_NAME']}"
)
engine = create_engine(connection_string)

# Function to fetch column names for a table
def get_columns(table_name):
    """Return the column names and data types of a table.

    The table name is passed as a bound parameter (psycopg2 ``%(name)s``
    style) instead of being interpolated into the SQL text.

    NOTE(review): the lookup is not restricted to a schema, so same-named
    tables in different schemas would all contribute rows. Confirm whether a
    ``table_schema`` filter is needed.

    Args:
        table_name: Table name as stored in ``information_schema.columns``.

    Returns:
        list of ``{'column_name': ..., 'data_type': ...}`` dicts; an empty
        list if the query fails (the error is logged).
    """
    query = """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = %(table_name)s;
        """
    try:
        with engine.connect() as connection:
            df = pd.read_sql_query(query, connection, params={'table_name': table_name})
        return df[['column_name', 'data_type']].to_dict('records')
    except Exception as e:
        logging.error(f"Error fetching columns for table {table_name}: {e}")
        return []

# Function to test join logic between two tables
def test_joins(table1, table2, attempts=3):
    """Find column pairs on which an inner join of two tables returns a row.

    For every pair of columns with the same ``data_type`` a
    ``SELECT 1 ... INNER JOIN ... LIMIT 1`` probe is executed. Identifiers are
    double-quoted so mixed-case names and reserved words work. A pair counts
    as successful only if all ``attempts`` probes return a row; probing stops
    at the first failure because later attempts cannot change the outcome.

    Args:
        table1: Name of the left table.
        table2: Name of the right table.
        attempts: Number of times each probe must succeed.

    Returns:
        list of ``(column_in_table1, column_in_table2)`` tuples.
    """
    def quote_ident(identifier):
        # Standard SQL identifier quoting: wrap in double quotes, double any embedded quote.
        return '"' + str(identifier).replace('"', '""') + '"'

    columns_table1 = get_columns(table1)
    columns_table2 = get_columns(table2)
    successful_joins = []

    for col1 in columns_table1:
        for col2 in columns_table2:
            # Only test joins on matching data types
            if col1['data_type'] != col2['data_type']:
                continue

            name1, name2 = col1['column_name'], col2['column_name']
            # Only existence matters, so select a constant instead of every column.
            query = f"""
                SELECT 1
                FROM {quote_ident(table1)} t1
                INNER JOIN {quote_ident(table2)} t2
                ON t1.{quote_ident(name1)} = t2.{quote_ident(name2)}
                LIMIT 1;
                """

            success_count = 0
            for _ in range(attempts):
                try:
                    # A fresh connection per probe keeps a failed statement from
                    # affecting the next one.
                    with engine.connect() as connection:
                        df = pd.read_sql_query(query, connection)
                except Exception as e:
                    logging.debug(f"Join failed for {table1}.{name1} = {table2}.{name2}: {e}")
                    break
                if df.empty:
                    break
                success_count += 1

            if success_count == attempts:  # Only count as successful if all attempts work
                successful_joins.append((name1, name2))
                logging.info(f"Successful join: {table1}.{name1} = {table2}.{name2}")

    return successful_joins

# Tables to cross-check; every unordered pair is tested once
tables = [
    "users", "ocpp_sub_session"
]

results = {}

for i in range(len(tables)):
    table1 = tables[i]
    for table2 in tables[i + 1:]:
        logging.info(f"Testing joins between {table1} and {table2}")
        joins = test_joins(table1, table2)
        if not joins:
            logging.info(f"No join found between {table1} and {table2}")
            continue
        results[f"{table1} <-> {table2}"] = joins

# Report what was found
for table_pair in results:
    joins = results[table_pair]
    print(f"Successful joins for {table_pair}: {joins}")

if len(results) == 0:
    print("No successful joins found.")


In [None]:
import pandas as pd
import json

# Example dataframe (assuming df['message'] contains the raw strings)
# Clean the 'message' column by removing the prefix 'OCPP : MeterValues '
def clean_message(msg):
    """Strip the 'OCPP : MeterValues' prefix from a log message and parse the rest as JSON.

    The prefix is matched as a whole (tolerating whitespace variations) rather
    than with ``str.lstrip``, which removes any leading characters from a
    character set and can eat into the payload.

    Args:
        msg: Raw message string. Non-string values (None, NaN, ...) are
            treated as unparseable.

    Returns:
        The decoded JSON value, or None if ``msg`` is not a string or the
        remainder is not valid JSON.
    """
    import re

    if not isinstance(msg, str):
        return None

    # Remove the prefix (only when it is actually at the start of the message)
    msg_cleaned = re.sub(r'^\s*OCPP\s*:\s*MeterValues\s*', '', msg, count=1)

    try:
        return json.loads(msg_cleaned)
    except json.JSONDecodeError:
        # If the message cannot be decoded as JSON, return None
        return None

# Parse every message WITHOUT overwriting the raw 'message' column, so this
# cell can be re-run and other cells can still read the original strings.
parsed_messages = df['message'].apply(clean_message)

# Keep only rows whose payload parsed to a JSON object (dict); parse failures
# (None) and non-object JSON values are left out.
is_meter_payload = parsed_messages.apply(lambda payload: isinstance(payload, dict))
meter_rows = df.loc[is_meter_payload]
meter_payloads = parsed_messages[is_meter_payload]

# Step 1: Extract top-level fields and keep 'meterValue' as is (a list of dicts)
flattened_rows = [
    {
        '_time': time_value,  # Original timestamp from the dataframe
        'user_id': user_id,
        'connectorId': payload.get('connectorId'),
        'meterValue': payload.get('meterValue', [])  # Nested list, expanded in a later step
    }
    for time_value, user_id, payload in zip(meter_rows['time'], meter_rows['user_id'], meter_payloads)
]

# Step 2: Create a new DataFrame from the flattened rows.
# Explicit columns keep the schema stable even when no message parsed.
flattened_df = pd.DataFrame(flattened_rows, columns=['_time', 'user_id', 'connectorId', 'meterValue'])

# Display the resulting DataFrame
print(flattened_df.head())


In [None]:
import pandas as pd

# Show column contents untruncated so the nested 'meterValue' list is fully visible.
# Note: pd.set_option changes a global display setting that stays in effect for later cells.
pd.set_option('display.max_colwidth', None)

# Display the full content of the 'meterValue' column for the first row only (head(1))
print(flattened_df['meterValue'].head(1))

In [None]:
import pandas as pd

# Map each unit of measure found in 'sampledValue' to its output column
unit_to_column = {'Wh': 'WattHours', 'A': 'Amps', 'V': 'Voltage'}
expanded_columns = ['_time', 'user_id', 'connectorId', 'timestamp', 'WattHours', 'Amps', 'Voltage']

# Create a list to hold the expanded rows (one per meter value entry)
expanded_rows = []

for idx, row in flattened_df.iterrows():
    meter_values = row['meterValue']  # List of meter readings (list of dicts)
    if not isinstance(meter_values, list):
        continue

    for meter in meter_values:
        if not isinstance(meter, dict):
            continue

        # Start with no reading for any measurement type
        readings = dict.fromkeys(unit_to_column.values())

        # .get() is used throughout so an entry lacking 'timestamp',
        # 'sampledValue', 'unit' or 'value' yields missing values instead of a KeyError.
        # NOTE(review): if several samples share a unit, the last one wins.
        for sample in meter.get('sampledValue') or []:
            column = unit_to_column.get(sample.get('unit'))
            if column is not None:
                readings[column] = sample.get('value')

        expanded_rows.append({
            '_time': row['_time'],  # Original timestamp from the dataframe
            'user_id': row['user_id'],
            'connectorId': row['connectorId'],
            'timestamp': meter.get('timestamp'),  # Timestamp from the meter value
            **readings
        })

# Explicit columns keep the schema stable even when there are no rows
expanded_df = pd.DataFrame(expanded_rows, columns=expanded_columns)

# Convert the measurement columns to numeric; unparseable values become NaN
for column in unit_to_column.values():
    expanded_df[column] = pd.to_numeric(expanded_df[column], errors='coerce')

# Display the resulting DataFrame
print(expanded_df.head())


In [None]:
# Ensure all columns are numeric (in case there are any string values left)
measurement_columns = ['WattHours', 'Amps', 'Voltage']
for column in measurement_columns:
    expanded_df[column] = pd.to_numeric(expanded_df[column], errors='coerce')


def _classify_reading(value):
    """Label a reading as 'missing', '0', '>0' or '<0'.

    NaN (e.g. produced by errors='coerce') and negative values get their own
    labels instead of being counted as '>0'.
    """
    if pd.isna(value):
        return 'missing'
    if value == 0:
        return '0'
    return '>0' if value > 0 else '<0'


# Classify each measurement
for column in measurement_columns:
    expanded_df[f'{column}_Class'] = expanded_df[column].apply(_classify_reading)

# One count plot per measurement, with a consistent class order across panels
class_order = ['0', '>0', '<0', 'missing']
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, column in zip(axes, measurement_columns):
    class_column = f'{column}_Class'
    present_classes = [label for label in class_order if (expanded_df[class_column] == label).any()]
    sns.countplot(data=expanded_df, x=class_column, order=present_classes or None, ax=ax)
    ax.set_title(f'Count of Rows with {column}: 0 vs > 0')
    ax.set_xlabel(f'{column} Class')
    ax.set_ylabel('Count')

# Display the plots
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Ensure that the 'timestamp' column (the plot's x-axis) is in datetime format
expanded_df['timestamp'] = pd.to_datetime(expanded_df['timestamp'], errors='coerce')

# Convert 'Amps', 'WattHours', and 'Voltage' to numeric (unparseable values become NaN)
for column in ('Amps', 'WattHours', 'Voltage'):
    expanded_df[column] = pd.to_numeric(expanded_df[column], errors='coerce')

# Drop rows where the plotted timestamp or any of the values is missing
expanded_df = expanded_df.dropna(subset=['timestamp', 'Amps', 'WattHours', 'Voltage'])

# Plot in chronological order so the line does not jump back and forth in time;
# expanded_df itself keeps its row order.
plot_df = expanded_df.sort_values('timestamp')

# Set the style for the plots
sns.set(style="whitegrid")

# Create the figure and axes for the plots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, (column, colour) in zip(axes, [('Amps', 'b'), ('WattHours', 'g'), ('Voltage', 'r')]):
    ax.plot(plot_df['timestamp'], plot_df[column], label=column, color=colour, alpha=0.7)
    ax.set_title(f'{column} over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', labelrotation=45)

# Adjust layout to avoid overlap of labels
plt.tight_layout()
plt.show()


In [None]:
# Print a summary of expanded_df (columns, non-null counts, dtypes)
expanded_df.info()

In [None]:
# Summary statistics for the three measurement columns
measurement_columns = ['Amps', 'WattHours', 'Voltage']
descriptive_stats = expanded_df.loc[:, measurement_columns].describe()

# Show the table
print(descriptive_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's whitegrid theme for the figure
sns.set(style="whitegrid")

# One row of three panels, one box plot per measurement
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panel_specs = [
    ('Amps', 'skyblue'),
    ('WattHours', 'lightgreen'),
    ('Voltage', 'lightcoral'),
]

for ax, (column, colour) in zip(axes, panel_specs):
    sns.boxplot(data=expanded_df[column], color=colour, ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_ylabel(column)

# Prevent titles and labels from overlapping
plt.tight_layout()
plt.show()


In [None]:
# count of propertyIDs

import os
import psycopg2

# Connection settings are read from environment variables
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')
}

# Table to process (the 'stations' table)
table = "stations"

# Query to count unique 'property_id' values
query = f"SELECT COUNT(DISTINCT property_id) FROM {table};"

# Connect and run the query; try/finally guarantees the connection is closed
# even if the query raises (psycopg2's `with connection` would not close it).
connection = psycopg2.connect(**connection_params)
try:
    with connection.cursor() as cursor:
        cursor.execute(query)
        result = cursor.fetchone()
finally:
    connection.close()

# Extract and print the count
unique_property_id_count = result[0] if result else 0
print(f"Number of unique 'property_id' values: {unique_property_id_count}")

In [None]:
#Count of cluster IDs

import os
import psycopg2

# Connection settings are read from environment variables
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')
}

# Table to process (the 'stations' table)
table = "stations"

# Query to count unique 'cluster_id' values
query = f"SELECT COUNT(DISTINCT cluster_id) FROM {table};"

# Connect and run the query; try/finally guarantees the connection is closed
# even if the query raises (psycopg2's `with connection` would not close it).
connection = psycopg2.connect(**connection_params)
try:
    with connection.cursor() as cursor:
        cursor.execute(query)
        result = cursor.fetchone()
finally:
    connection.close()

# Extract and print the count. Stored under its own name so it does not
# overwrite the property count computed by the property_id query.
unique_cluster_id_count = result[0] if result else 0
print(f"Number of unique 'cluster_id' values: {unique_cluster_id_count}")

In [None]:
# Counts of distinct clusterIDs per propertyID
import os
import psycopg2

# Connection settings are read from environment variables
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')
}

# Table to process (the 'stations' table)
table = "stations"

# Query to count unique 'cluster_id' values for each 'property_id'
query = f"""
SELECT property_id, COUNT(DISTINCT cluster_id)
FROM {table}
GROUP BY property_id
;
"""

# Connect and run the query; try/finally guarantees the connection is closed
# even if the query raises (psycopg2's `with connection` would not close it).
connection = psycopg2.connect(**connection_params)
try:
    with connection.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
finally:
    connection.close()

# Print the results
for row in results:
    property_id, cluster_count = row
    print(f"Property ID: {property_id}, Unique Cluster ID Count: {cluster_count}")

import os
import psycopg2

# Connection settings are read from environment variables
connection_params = {
    'host': os.getenv('DB_HOST'),
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'port': os.getenv('DB_PORT')
}

# Table to process (the 'stations' table)
table = "stations"

# Query to count unique 'property_id' values for each 'cluster_id'
query = f"""
SELECT cluster_id, COUNT(DISTINCT property_id)
FROM {table}
GROUP BY cluster_id;
"""

# Connect and run the query; try/finally guarantees the connection is closed
# even if the query raises (psycopg2's `with connection` would not close it).
connection = psycopg2.connect(**connection_params)
try:
    with connection.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
finally:
    connection.close()

# Print the results
for row in results:
    cluster_id, property_count = row
    print(f"Cluster ID: {cluster_id}, Unique Property ID Count: {property_count}")


OCPP_SessionID has a userID and a TransactionID.
I need to map each session to its cluster and property.

The stations table has both property_id and cluster_id.




In [None]:
# Clean message field and port to a df
import json
import pandas as pd

# Function to clean up the 'message' field by removing the prefix and parsing JSON
def clean_and_parse_message(message):
    """Parse the JSON object embedded in a log message.

    Everything before the first '{' is treated as a non-JSON prefix and
    discarded; the remainder is decoded with ``json.loads``.

    Args:
        message: Raw message string. Non-string values (None, NaN, ...) and
            strings without any '{' are treated as unparseable.

    Returns:
        The decoded JSON value, or None when nothing could be parsed. Invalid
        JSON after the prefix is reported with ``print``.
    """
    if not isinstance(message, str):
        return None

    # str.find returns -1 when there is no '{'; slicing from -1 would keep
    # only the last character, so handle that case explicitly.
    json_start = message.find('{')
    if json_start == -1:
        return None

    try:
        return json.loads(message[json_start:])
    except json.JSONDecodeError as e:
        print(f"Invalid JSON in message: {message}\nError: {e}")
        return None

# Function to flatten nested JSON
def flatten_json(y):
    """Flatten nested dicts/lists into a single-level dict.

    Keys of the result are the path to each leaf joined with '_': dict keys
    are used as-is and list elements contribute their index, e.g.
    ``{"a": [{"b": 1}]}`` becomes ``{"a_0_b": 1}``. Empty dicts and lists
    produce no entries; a bare scalar is stored under the empty key ''.
    """
    def walk(node, path):
        # Yield (flattened_key, leaf_value) pairs in document order
        if isinstance(node, dict):
            for key, child in node.items():
                yield from walk(child, path + (key,))
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from walk(child, path + (str(position),))
        else:
            yield '_'.join(path), node

    return dict(walk(y, ()))

# Apply the cleaning and parsing function to all rows in the 'message' field
df['parsed_message'] = df['message'].apply(clean_and_parse_message)

# Drop rows where parsing failed (invalid JSON) or was not cleaned properly
valid_df = df[df['parsed_message'].notnull()]

# Flatten all the JSON objects and build the DataFrame in ONE constructor call.
# (A per-row `.apply(pd.Series)` builds one Series per row and becomes
# extremely slow on large logs.) The original index is kept so the columns
# line up with valid_df in the concat below.
flattened_records = [flatten_json(parsed) for parsed in valid_df['parsed_message']]
flattened_data = pd.DataFrame(flattened_records, index=valid_df.index)

# Combine the flattened JSON fields with the original DataFrame (excluding the original 'message' and 'parsed_message' fields)
new_df = pd.concat([valid_df.drop(columns=['message', 'parsed_message']), flattened_data], axis=1)

# Write the DataFrame to CSV
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/new_df.csv'
new_df.to_csv(output_csv_path, index=False)
logging.info(f"Data written to {output_csv_path} successfully.")




In [None]:
# Print the column summary (the call parentheses are required; a bare
# `new_df.info` only references the method), then show the first rows.
new_df.info()
new_df.head()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Work on a copy: the conversions below must not alter new_df, which later
# cells read again (including its raw 'meterValue_0_timestamp' strings).
df = new_df.copy()

# Step 1: Convert the time columns to datetime (invalid values become NaT)
df['time'] = pd.to_datetime(df['time'], errors='coerce')
df['meterValue_0_timestamp'] = pd.to_datetime(df['meterValue_0_timestamp'], errors='coerce')

# Step 2: Convert 'meterValue_0_sampledValue_0_value' to numeric (invalid values become NaN)
df['meterValue_0_sampledValue_0_value'] = pd.to_numeric(df['meterValue_0_sampledValue_0_value'], errors='coerce')

# Step 3: Drop rows that cannot be plotted
df = df.dropna(subset=['time', 'meterValue_0_timestamp', 'meterValue_0_sampledValue_0_value'])

# Step 4: Plot the time series in chronological order
plot_df = df.sort_values('meterValue_0_timestamp')
plt.figure(figsize=(10, 6))
plt.plot(plot_df['meterValue_0_timestamp'], plot_df['meterValue_0_sampledValue_0_value'], label='Meter Value', color='b')
plt.xlabel('Time')
plt.ylabel('Meter Value')
plt.title('Meter Value Over Time')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count rows (one per parsed message) for each user_id.
# size() is used instead of counting the 'message' column, because that column
# is dropped when new_df is built and `df` is derived from new_df at this point.
hist_data = df.groupby('user_id').size().reset_index(name='count')

# Sort the data by user_id for better visualization
hist_data = hist_data.sort_values('user_id')

# Plotting the normal line plot
plt.figure(figsize=(10, 6))
plt.plot(hist_data['user_id'], hist_data['count'], marker='o', linestyle='-', color='skyblue')
plt.xlabel('User ID')
plt.ylabel('Count of message')
plt.title('Count of message per User ID')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# grid(True) turns the grid on; a bare grid() toggles it and would switch it
# OFF when a grid style (e.g. seaborn "whitegrid") is already active.
plt.grid(True)
plt.show()

In [None]:
import pandas as pd

# new_df must already exist and contain every column listed below

# Columns carried over into the reduced frame
# ('time' and 'station_id' are intentionally left out)
columns_to_keep = [
    'user_id',
    'property_id',
    'connectorId',
    'meterValue_0_timestamp',
    'meterValue_0_sampledValue_1_value',
    'meterValue_0_sampledValue_1_context',
    'meterValue_0_sampledValue_1_format',
    'meterValue_0_sampledValue_1_measurand',
    'meterValue_0_sampledValue_1_phase',
    'meterValue_0_sampledValue_1_location',
    'meterValue_0_sampledValue_1_unit'
]

# Independent copy restricted to the selected columns
new_df_2 = new_df.loc[:, columns_to_keep].copy()

# Parse the meter timestamp; values that cannot be parsed become NaT
parsed_timestamps = pd.to_datetime(new_df_2['meterValue_0_timestamp'], errors='coerce')
new_df_2['meterValue_0_timestamp'] = parsed_timestamps

# Report if any timestamp failed to parse
if parsed_timestamps.isnull().any():
    print("Some values could not be converted to datetime.")

# Derive calendar day and hour-of-day from the parsed timestamp
new_df_2['meterValue_0_day'] = parsed_timestamps.dt.date
new_df_2['meterValue_0_hour'] = parsed_timestamps.dt.hour



In [None]:
# Print a summary of new_df_2 (columns, non-null counts, dtypes)
new_df_2.info()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = new_df_2

# Group by user_id and count the non-null meterValue_0_timestamp values
hist_data = df.groupby('user_id')['meterValue_0_timestamp'].count().reset_index()

# Rename the columns for clarity
hist_data.columns = ['user_id', 'count']

# Sort the data by user_id for better visualization
hist_data = hist_data.sort_values('user_id')

# Plotting the normal line plot
plt.figure(figsize=(10, 6))
plt.plot(hist_data['user_id'], hist_data['count'], marker='o', linestyle='-', color='skyblue')
plt.xlabel('User ID')
plt.ylabel('Count of message')
plt.title('Count of message per User ID')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# grid(True) turns the grid on; a bare grid() toggles it and would switch it
# OFF when a grid style (e.g. seaborn "whitegrid") is already active.
plt.grid(True)
plt.show()

In [None]:
# Works on the reduced frame new_df_2
user_ids = new_df_2['user_id']

# Distinct user IDs, in order of first appearance
unique_values = user_ids.unique()
print(unique_values)

# Number of distinct (non-null) user IDs
unique_count = user_ids.nunique()
print(f"Number of unique user_id values: {unique_count}")

In [None]:
import pandas as pd


# Convert the meter reading to numeric; unparseable values become NaN
new_df_2['meterValue_0_sampledValue_1_value'] = pd.to_numeric(new_df_2['meterValue_0_sampledValue_1_value'], errors='coerce')

# idxmax cannot select a row for a (user, day) group whose readings are all
# NaN, and the resulting label would break the .loc lookup, so rows without a
# usable reading are excluded first.
rows_with_reading = new_df_2.dropna(subset=['meterValue_0_sampledValue_1_value'])

# Row label of the maximum reading per user and day
max_row_labels = rows_with_reading.groupby(['user_id', 'meterValue_0_day'])['meterValue_0_sampledValue_1_value'].idxmax()
max_values = rows_with_reading.loc[max_row_labels]

# .copy() makes result_df an independent frame rather than a view of max_values
result_df = max_values[['user_id', 'meterValue_0_day', 'meterValue_0_sampledValue_1_value', 'meterValue_0_timestamp']].copy()

print(result_df)
result_df.info()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
# Group by user_id and count the non-null meterValue_0_timestamp values
hist_data = df.groupby('user_id')['meterValue_0_timestamp'].count().reset_index()

# Rename the columns for clarity
hist_data.columns = ['user_id', 'count']

# Sort the data by user_id for better visualization
hist_data = hist_data.sort_values('user_id')

# Plotting the normal line plot
plt.figure(figsize=(10, 6))
plt.plot(hist_data['user_id'], hist_data['count'], marker='o', linestyle='-', color='skyblue')
plt.xlabel('User ID')
plt.ylabel('Count of meterValue_0_timestamp')
plt.title('Count of meterValue_0_timestamp per User ID')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# grid(True) turns the grid on; a bare grid() toggles it and would switch it
# OFF when a grid style (e.g. seaborn "whitegrid") is already active.
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming result_df is your DataFrame

# Print column names to verify
print("Column names in DataFrame:")
print(result_df.columns.tolist())

# Specify the user_id you're interested in
specific_user_id = '013f0335-da69-4fdd-b378-b6a9a8cfc8a8'  # replace with the actual user_id

# Filter the DataFrame for the specific user_id
filtered_df = result_df[result_df['user_id'] == specific_user_id]

# Check if there are any rows for the specified user_id
if not filtered_df.empty:
    timestamp_col = 'meterValue_0_timestamp'  # Update if necessary
    value_col = 'meterValue_0_sampledValue_1_value'

    # Ensure the column names are correct
    print("Filtered DataFrame columns:")
    print(filtered_df.columns.tolist())

    # Plotting
    plt.figure(figsize=(12, 6))
    plt.plot(filtered_df[timestamp_col], filtered_df[value_col], marker='o')
    plt.title(f'Meter Values for User ID: {specific_user_id}')
    plt.xlabel('Timestamp')
    plt.ylabel('Meter Value')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
    # grid(True) turns the grid on; a bare grid() toggles it and would switch it
    # OFF when a grid style (e.g. seaborn "whitegrid") is already active.
    plt.grid(True)
    plt.tight_layout()  # Adjust layout to make room for the rotated x-axis labels
    plt.show()
else:
    print(f"No data found for user_id: {specific_user_id}")


In [None]:

# Export result_df to a CSV file on Drive (row index omitted)
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data_sets/Chargie/result_df_exported.csv'
result_df.to_csv(path_or_buf=output_csv_path, index=False)
logging.info(f"Data written to {output_csv_path} successfully.")


