<div style="background-color:green; color:white; padding:10px; font-size:20px">
1 - Setup
</div>

In [1]:
# Shell out to conda to install TensorFlow; -y answers the confirmation prompt non-interactively.
!conda install tensorflow -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 23.7.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Retrieving notices: ...working... done


In [2]:
!pip install boto3

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

In [3]:
# (VGG 16) model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Plotting/Visualization
from matplotlib import pyplot as plt
from matplotlib import cm

# for everything else
import numpy as np
from random import randint
import time
import io # File handling
from io import BytesIO
from tqdm import tqdm # Progress bar
from PIL import Image

In [4]:
import boto3
import pandas as pd


# Project Scripts
import data_handling

In [5]:
# Development aid: re-import data_handling so that edits made to the module on disk
# are picked up without restarting the kernel.
from importlib import reload
reload(data_handling)

<module 'data_handling' from '/root/projects/behavior_project/data_handling.py'>

#### SageMaker User & IAM Credentials

In [6]:
# Create a boto3 session; no credentials are passed explicitly here
sess = boto3.Session()
# Ask STS which identity this session is acting as
sts = sess.client('sts')
response = sts.get_caller_identity()

# NOTE(review): the first line prints a hard-coded account ID for visual comparison;
# only the next two lines come from the STS response.
print("Your AWS Account:  717145514721")
print("Logged-in Account:", response['Account'])
print("User ARN:", response['Arn'])

Your AWS Account:  717145514721
Logged-in Account: 717145514721
User ARN: arn:aws:sts::717145514721:assumed-role/AWS-SageMakerFullAccess-Role/SageMaker


### S3 Connection

In [7]:
import boto3

# Create an S3 client (a client, not a session; no credentials are passed explicitly).
# `s3` is reused by the later cells that download and upload CSV files.
s3 = boto3.client('s3')

# List the buckets; note this rebinds `response`, which previously held the STS reply
response = s3.list_buckets()

# Keep only the bucket names from the response
buckets = [bucket['Name'] for bucket in response['Buckets']]

# Print out the bucket list
print("Bucket List: %s" % buckets)

Bucket List: ['behavior-project', 'sagemaker-us-east-1-717145514721']




<div style="background-color:green; color:white; padding:10px; font-size:20px">
2 - Data Import
</div>

In [8]:
from io import StringIO

#### mPFC Trace Info

#### mPFC s3 Image Paths

#### FR Trace Info

In [15]:
# Load the FR trace-info table from S3 into a DataFrame.
# Uses the `s3` client and the StringIO import from earlier cells.
bucket_name = "behavior-project"
file_key = "processed-data/processed-data/trace-info-dfs/fr-trace-info-df.csv"

# Download the object and decode the whole payload as UTF-8 text
csv_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
body = csv_obj["Body"].read().decode("utf-8")

# Parse the CSV text with pandas and preview the first three rows
FR_trace_info_df = pd.read_csv(StringIO(body))
FR_trace_info_df.head(3)

Unnamed: 0,session_id,trace_num,start_time,end_time,s3_url
0,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,0,1970-01-01 00:00:45.854000091,1970-01-01 00:00:50.800998687,behavior-project/processed-data/trace-images/f...
1,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,1,1970-01-01 00:01:03.058998107,1970-01-01 00:01:08.055000305,behavior-project/processed-data/trace-images/f...
2,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,2,1970-01-01 00:01:33.384002685,1970-01-01 00:01:38.366996765,behavior-project/processed-data/trace-images/f...


#### FR s3 Image Paths

In [10]:
bucket_name = 'behavior-project'
prefix = 'processed-data/trace-images/fr-icss/'

s3_client = boto3.client('s3')
paginator = s3_client.get_paginator('list_objects_v2')

# The listing comes back in pages, so walk every page and keep only the keys
# ending in '.png'. A page without a 'Contents' entry contributes nothing.
FR_png_paths = [
    item['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for item in page.get('Contents', [])
    if item['Key'].endswith('.png')
]

print(len(FR_png_paths))



13469


<div style="background-color:green; color:white; padding:10px; font-size:20px">
3 - Model Import & Embedding Creation
</div>

#### Creating DataFrames to Hold Data

In [16]:
def extract_animal_id(session_id):
    """Return the animal ID embedded at the start of a session ID.

    The animal ID is the first two underscore-separated fields of ``session_id``
    joined back together with '_', e.g. 'C1000_C42A_ChR2_...' -> 'C1000_C42A'.

    Raises IndexError when ``session_id`` contains no underscore.
    """
    # Only the first two fields are needed, so stop splitting after them
    fields = session_id.split('_', 2)
    return '_'.join((fields[0], fields[1]))

In [26]:
# Seed the embedding/cluster table with the trace identifiers, then derive the
# animal ID of every row from its session ID.
FR_emb_cluster_df = FR_trace_info_df[['session_id', 'trace_num', 's3_url']].assign(
    animal_id=lambda frame: frame['session_id'].apply(extract_animal_id)
)

#### Embedding

In [13]:
# Build VGG16, then wrap it as a feature extractor: same inputs, but the output is
# taken from the second-to-last layer, so the final (output) layer is cut off.
base_model = VGG16()
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

In [9]:
from data_handling import extract_features_batch, create_dataframe
from data_handling import optimize_dataframe_dtypes

#### Reloading Saved Embeddings

In [11]:
def process_chunk(chunk, n_features=4095):
    """Pack the leading feature columns of ``chunk`` into one list-valued column.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows of the saved features table. Must contain an ``'s3_path'`` column; the
        first ``n_features`` columns (by position) are taken as the embedding values.
    n_features : int, default 4095
        Number of leading columns packed into each embedding list.
        NOTE(review): the default preserves the previously hard-coded slice. If the
        saved table actually holds 4096 feature columns, this drops the last one —
        confirm against the CSV schema.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``['s3_path', 'embedding']`` and the same index as
        ``chunk``. The input ``chunk`` is not modified.
    """
    # Work on a copy so the caller's frame does not silently gain an extra column
    packed = chunk[['s3_path']].copy()
    # .values.tolist() yields one plain Python list per row
    packed['embedding'] = chunk.iloc[:, :n_features].values.tolist()
    return packed

In [12]:
bucket_name = "behavior-project"
file_key = "processed-data/processed-data/feats-dfs/fr-feats-df.csv"

# Fetch the saved features CSV from S3 and decode it as UTF-8 text
csv_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
body = csv_obj["Body"].read().decode("utf-8")

# Rows parsed per chunk; adjust to the memory available on the instance
chunksize = 1000

# Parse the text chunk by chunk, collapsing each chunk's feature columns into a
# single list-valued 'embedding' column as soon as it is read.
chunks_processed = [
    process_chunk(chunk)
    for chunk in pd.read_csv(StringIO(body), chunksize=chunksize)
]

# Stitch the processed chunks back together under a fresh 0..n-1 index
FR_feats_df = pd.concat(chunks_processed, ignore_index=True)
# optimize_dataframe_dtypes(FR_feats_df, inplace=True)

FR_feats_df.head(3)

Unnamed: 0,s3_path,embedding
0,processed-data/trace-images/fr-icss/C1000_C42A...,"[0.0, 0.0, 0.89690435, 0.0, 0.076437354, 0.0, ..."
1,processed-data/trace-images/fr-icss/C1000_C42A...,"[0.0, 0.0, 1.7854563, 0.0, 0.0, 0.0, 1.2393667..."
2,processed-data/trace-images/fr-icss/C1000_C42A...,"[0.0, 0.0, 0.6444253, 0.0, 0.0, 0.0, 2.2935755..."


In [25]:
# Sanity check: report how many rows were reloaded, then preview the first three
print(f"num_rows = {len(FR_feats_df)}")
FR_feats_df.head(3)

num_rows = 13469


'processed-data/trace-images/fr-icss/C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON_0.png'

In [34]:
# Merge embeddings into FR_emb_cluster_df.
#
# Each s3_path looks like "<key prefix>/<session_id>_<trace_num>.png". The key prefix
# must be stripped before splitting: otherwise the extracted id still carries the
# directory part and never equals the bare session_id on the left side of the merge.
file_stem = (
    FR_feats_df['s3_path']
    .str.rsplit('/', n=1).str[-1]              # file name only (no-op if there is no '/')
    .str.replace('.png', '', regex=False)      # literal match: '.' must not act as a regex wildcard
)
stem_parts = file_stem.str.rsplit('_', n=1)    # split on the last underscore only
FR_feats_df['extracted_session_id'] = stem_parts.str[0]
FR_feats_df['trace_num'] = stem_parts.str[1].astype(int)

# Left merge on (session id, trace number). Dropping any 'embedding' column left by a
# previous run keeps the cell re-runnable (a second run would otherwise produce
# embedding_x / embedding_y).
FR_emb_cluster_df = (
    FR_emb_cluster_df
    .drop(columns=['embedding'], errors='ignore')
    .merge(
        FR_feats_df[['extracted_session_id', 'trace_num', 'embedding']],
        left_on=['session_id', 'trace_num'],
        right_on=['extracted_session_id', 'trace_num'],
        how='left',
    )
    .drop(columns=['extracted_session_id'])
)

# A left merge leaves NaN wherever no embedding matched; report that here rather than
# letting it surface later as an obscure failure during PCA.
n_unmatched = int(FR_emb_cluster_df['embedding'].isna().sum())
if n_unmatched:
    print(f"WARNING: {n_unmatched} of {len(FR_emb_cluster_df)} traces have no matching embedding")

  """


In [35]:
# Preview the table after the embeddings have been merged in
FR_emb_cluster_df.head(3)

Unnamed: 0,session_id,trace_num,s3_url,animal_id,embedding
0,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,0,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 0.89690435, 0.0, 0.076437354, 0.0, ..."
1,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,1,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 1.7854563, 0.0, 0.0, 0.0, 1.2393667..."
2,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,2,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 1.8289622, 0.0, 0.0, 0.0, 1.802902,..."


<div style="background-color:green; color:white; padding:10px; font-size:20px">
4 - PCA & Clustering
</div>

In [36]:
# Check which columns are available before clustering
FR_emb_cluster_df.columns

Index(['session_id', 'trace_num', 's3_url', 'animal_id', 'embedding'], dtype='object')

In [37]:
# Estimators shared by the per-animal clustering cell.
# Both are seeded so that Restart & Run All reproduces the same cluster labels:
# KMeans initialises its centroids randomly unless random_state is fixed.
# n_init is set explicitly because scikit-learn changed its default from 10 to 'auto'.
pca = PCA(n_components=4, random_state=22)
kmeans = KMeans(n_clusters=6, n_init=10, random_state=22)

In [39]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Cluster each animal's traces independently: PCA on that animal's embeddings, then
# KMeans on the reduced features. Labels are therefore only comparable within one
# animal_id (cluster 0 of one animal is unrelated to cluster 0 of another).
groups = FR_emb_cluster_df.groupby('animal_id')
labels_per_animal = []

for animal_id, group_data in groups:
    # Expand the list-valued 'embedding' column into one column per feature
    embeddings = pd.DataFrame(group_data['embedding'].tolist())

    # pca and kmeans are re-fit from scratch for every animal
    pca_features = pca.fit_transform(embeddings)
    clusters = kmeans.fit_predict(pca_features)

    # Keep each label attached to the row it was computed from. groupby iterates in
    # sorted-key order, so a flat list of labels only lines up with the DataFrame
    # when its rows already happen to be contiguous and sorted by animal_id.
    labels_per_animal.append(pd.Series(clusters, index=group_data.index))

# Index-aligned assignment puts every label on its own row regardless of row order
# (assumes FR_emb_cluster_df has a unique index, as produced by the preceding merge).
FR_emb_cluster_df['cluster_label'] = pd.concat(labels_per_animal)

In [41]:
# Show the distinct cluster labels that were assigned, then preview the labelled table
print(FR_emb_cluster_df.cluster_label.unique())
FR_emb_cluster_df.head(3)

[0 1 5 3 2 4]


Unnamed: 0,session_id,trace_num,s3_url,animal_id,embedding,cluster_label
0,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,0,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 0.89690435, 0.0, 0.076437354, 0.0, ...",0
1,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,1,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 1.7854563, 0.0, 0.0, 0.0, 1.2393667...",1
2,C1000_C42A_ChR2_xPVT_F00_S01_P05_N_NON_NON,2,behavior-project/processed-data/trace-images/f...,C1000_C42A,"[0.0, 0.0, 1.8289622, 0.0, 0.0, 0.0, 1.802902,...",5


<div style="background-color:green; color:white; padding:10px; font-size:20px">
5 - Save
</div>

In [42]:
# Persist the FR embedding/cluster table to S3 as a CSV file.
FILE = FR_emb_cluster_df
S3_DIR_PATH = "s3://behavior-project/processed-data/processed-data/embed-cluster-dfs/fr-icss"
FILE_NAME = 'fr-embed-cluster-df.csv'

# "s3://<bucket>/<prefix>" -> bucket name and key prefix; the object key is prefix + file name
bucket_and_prefix = S3_DIR_PATH.replace("s3://", "")
bucket_name, prefix = bucket_and_prefix.split("/", 1)
file_path = "/".join((prefix, FILE_NAME))

# Serialise the DataFrame (without its index) into an in-memory text buffer
csv_buffer = StringIO()
FILE.to_csv(csv_buffer, index=False)

# Upload the buffered CSV text; the response is left as the cell's displayed output
s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=file_path)

{'ResponseMetadata': {'RequestId': 'ND5E4V8BRNNC5ZDA',
  'HostId': 'r8oLKKU6rhxDGQuZTRt0zgT2TLFwWJ7rq9zSF5RIz8adgqyuycftW0Sy/0ZOPbxCO1R2YzfcXt4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'r8oLKKU6rhxDGQuZTRt0zgT2TLFwWJ7rq9zSF5RIz8adgqyuycftW0Sy/0ZOPbxCO1R2YzfcXt4=',
   'x-amz-request-id': 'ND5E4V8BRNNC5ZDA',
   'date': 'Tue, 08 Aug 2023 22:36:13 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"8f82bb65158fd5c48f0223f0c9f6dd48"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"8f82bb65158fd5c48f0223f0c9f6dd48"',
 'ServerSideEncryption': 'AES256'}