In [364]:
import boto3
import pandas as pd
from datetime import datetime

In [365]:
# Specify your DynamoDB table details
table_name = 'DeviceTimeStamp-zvvc26kofnethfd47fmdbb2skm-dev'

# Create a DynamoDB client
dynamodb = boto3.client('dynamodb')

# Scan the DynamoDB table to retrieve all data.
# A single Scan call returns only one page of results (up to 1 MB); the
# response carries a LastEvaluatedKey while more pages remain. Keep requesting
# pages until that key is absent, otherwise larger tables are silently
# truncated to their first page.
scan_kwargs = {'TableName': table_name}
items = []
while True:
    response = dynamodb.scan(**scan_kwargs)
    items.extend(response.get('Items', []))
    last_evaluated_key = response.get('LastEvaluatedKey')
    if not last_evaluated_key:
        break
    # Resume the scan right after the last item of the previous page
    scan_kwargs['ExclusiveStartKey'] = last_evaluated_key

# Convert DynamoDB items to a Pandas DataFrame
df = pd.DataFrame(items)

# Function to convert DynamoDB item to a dictionary of values
def parse_dynamodb_item(item):
    """Flatten one DynamoDB-typed record into a plain ``{attribute: value}`` dict.

    Each attribute arrives wrapped in a single-entry type descriptor such as
    ``{'S': 'abc'}`` or ``{'BOOL': True}``; the wrapper is removed and only the
    inner value is kept (no further type conversion is performed).

    Args:
        item: A mapping-like object exposing ``.items()`` (a dict or a
            DataFrame row) whose values are DynamoDB type descriptors.

    Returns:
        dict: Attribute name -> unwrapped value. Cells that are not type
        descriptors (e.g. the NaN that pandas inserts when an item lacks an
        attribute) are passed through unchanged instead of raising
        ``AttributeError``; an empty descriptor maps to ``None``.
    """
    return {
        key: next(iter(value.values()), None) if isinstance(value, dict) else value
        for key, value in item.items()
    }

# Unwrap the DynamoDB type descriptors row by row, then expand the resulting
# per-row dictionaries into proper DataFrame columns
parsed_rows = df.apply(parse_dynamodb_item, axis=1)
df = parsed_rows.apply(pd.Series)

# When a 'timestamp' column is present, turn its ISO-style strings into datetimes
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(
        df['timestamp'],
        format='%Y-%m-%dT%H:%M:%S.%fZ',
    )

# Show the cleaned DataFrame as the cell output
df

Unnamed: 0,__typename,deviceID,userID,updatedAt,eventStatus,timestamp,createdAt,owner,id
0,DeviceTimeStamp,6y2e0q34-992e-901d-2r24-4t99e27q9789,aye06b84-ceb1-4f65-8407-13cd12663818,2023-11-14T05:00:00.001Z,False,2023-11-14 05:00:00.001,2023-11-14T05:00:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2b61
1,DeviceTimeStamp,111j6h66-938q-278r-8e24-4t99e27q8371,fat06b84-ceb1-4f65-8407-13cd12663818,2023-12-01T23:17:00.001Z,True,2023-12-01 23:17:00.001,2023-12-01T23:17:00.001Z,fat06b84-ceb1-4f65-8407-13cd12663818::fat06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2j67
2,DeviceTimeStamp,7a2e3c32-888a-446c-9d15-8f79f40a7331,b3f06b84-ceb1-4f65-8407-13cd12663818,2023-12-05T14:00:00.001Z,True,2023-12-05 14:00:00.001,2023-12-05T14:00:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2j86
3,DeviceTimeStamp,6y2e0q34-992e-901d-2r24-4t99e27q9789,aye06b84-ceb1-4f65-8407-13cd12663818,2023-11-22T14:30:00.001Z,True,2023-11-22 14:30:00.001,2023-11-22T14:30:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2f65
4,DeviceTimeStamp,ab4454c7-1d75-4328-af9c-f03a12ccea2b,b3f06b84-ceb1-4f65-8407-13cd12663818,2023-10-27T21:00:00.001Z,False,2023-10-27 21:00:00.001,2023-10-27T21:00:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2c08
...,...,...,...,...,...,...,...,...,...
955,DeviceTimeStamp,9c1j6h66-938q-278r-8e24-4t99e27q8371,b3f06b84-ceb1-4f65-8407-13cd12663818,2023-11-02T08:33:00.001Z,False,2023-11-02 08:33:00.001,2023-11-02T08:33:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,hij6d759-1240-4d42-ab68-827b9bf22acd
956,DeviceTimeStamp,6y2e0q34-992e-901d-2r24-4t99e27q9789,aye06b84-ceb1-4f65-8407-13cd12663818,2023-11-18T14:00:00.001Z,True,2023-11-18 14:00:00.001,2023-11-18T14:00:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2d38
957,DeviceTimeStamp,9c1j6h66-938q-278r-8e24-4t99e27q8371,b3f06b84-ceb1-4f65-8407-13cd12663818,2023-11-24T06:00:00.001Z,False,2023-11-24 06:00:00.001,2023-11-24T06:00:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2h39
958,DeviceTimeStamp,ab4454c7-1d75-4328-af9c-f03a12ccea2b,b3f06b84-ceb1-4f65-8407-13cd12663818,2023-12-04T08:30:00.001Z,False,2023-12-04 08:30:00.001,2023-12-04T08:30:00.001Z,b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...,jkl4b9d8-dbf4-44fa-b3f8-c01dd3ce2j38


In [366]:
# Count the null entries in every column and report them
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

# Discard every row that still contains at least one null value
df = df.dropna(axis=0, how='any')

Missing Values:
 __typename     0
deviceID       0
userID         0
updatedAt      0
eventStatus    0
timestamp      0
createdAt      0
owner          0
id             0
dtype: int64


In [367]:
from sklearn.preprocessing import LabelEncoder

# Specify columns to be label encoded
columns_to_encode = ['userID', 'deviceID']

# One fitted encoder per column. Re-fitting a single shared encoder overwrites
# its learned classes, so only the mapping of the last column would survive and
# the others could never be applied to new data or inverted.
# NOTE: re-running this cell once the columns are already numeric resets this
# dict without re-fitting; re-run from the data-loading cell instead.
label_encoders = {}

# Kept for backward compatibility: ends up bound to the most recently fitted
# encoder, exactly like the previous shared instance did.
label_encoder = LabelEncoder()

# Apply label encoding to the specified columns
for column in columns_to_encode:
    if column not in df.columns:
        continue
    # Encode textual columns only; accept both plain object columns and
    # dedicated pandas string dtypes so the step is not silently skipped.
    is_textual = (
        pd.api.types.is_object_dtype(df[column])
        or pd.api.types.is_string_dtype(df[column])
    )
    if is_textual:
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])
        label_encoders[column] = label_encoder

# Display the DataFrame after label encoding
print(df.head())


        __typename  deviceID  userID                 updatedAt  eventStatus  \
0  DeviceTimeStamp         1       0  2023-11-14T05:00:00.001Z        False   
1  DeviceTimeStamp         0       2  2023-12-01T23:17:00.001Z         True   
2  DeviceTimeStamp         2       1  2023-12-05T14:00:00.001Z         True   
3  DeviceTimeStamp         1       0  2023-11-22T14:30:00.001Z         True   
4  DeviceTimeStamp         4       1  2023-10-27T21:00:00.001Z        False   

                timestamp                 createdAt  \
0 2023-11-14 05:00:00.001  2023-11-14T05:00:00.001Z   
1 2023-12-01 23:17:00.001  2023-12-01T23:17:00.001Z   
2 2023-12-05 14:00:00.001  2023-12-05T14:00:00.001Z   
3 2023-11-22 14:30:00.001  2023-11-22T14:30:00.001Z   
4 2023-10-27 21:00:00.001  2023-10-27T21:00:00.001Z   

                                               owner  \
0  b3f06b84-ceb1-4f65-8407-13cd12663818::b3f06b84...   
1  fat06b84-ceb1-4f65-8407-13cd12663818::fat06b84...   
2  b3f06b84-ceb1-4f65-8407

In [368]:
# Columns carried forward into the modelling steps
selected_columns = ['userID', 'deviceID', 'eventStatus', 'timestamp']

# Narrow the frame down to just those columns
df_selected = df[selected_columns]

# Split the narrowed frame by the value of 'eventStatus'
event_status_column = df_selected['eventStatus']
df_true_selected = df_selected[event_status_column == True]
df_false_selected = df_selected[event_status_column == False]

# Show both partitions
print("True Records (Selected Columns):")
print(df_true_selected)

print("\nFalse Records (Selected Columns):")
print(df_false_selected)


True Records (Selected Columns):
     userID  deviceID  eventStatus               timestamp
1         2         0         True 2023-12-01 23:17:00.001
2         1         2         True 2023-12-05 14:00:00.001
3         0         1         True 2023-11-22 14:30:00.001
6         1         4         True 2023-11-28 05:00:00.001
10        1         4         True 2023-10-27 14:01:00.001
..      ...       ...          ...                     ...
949       1         4         True 2023-11-07 14:00:00.001
951       2         0         True 2023-11-26 23:01:00.001
954       0         1         True 2023-11-24 20:00:00.001
956       0         1         True 2023-11-18 14:00:00.001
959       1         2         True 2023-11-25 20:00:00.001

[480 rows x 4 columns]

False Records (Selected Columns):
     userID  deviceID  eventStatus               timestamp
0         0         1        False 2023-11-14 05:00:00.001
4         1         4        False 2023-10-27 21:00:00.001
5         2         0  

In [369]:
# Shared ordering for both partitions: newest timestamp first
newest_first = {'by': 'timestamp', 'ascending': False}

# Reorder the 'true' and 'false' records with that ordering
df_true_selected = df_true_selected.sort_values(**newest_first)
df_false_selected = df_false_selected.sort_values(**newest_first)

# Show the reordered partitions
print("True Records (Selected Columns - Sorted by Timestamp):")
print(df_true_selected)

print("\nFalse Records (Selected Columns - Sorted by Timestamp):")
print(df_false_selected)


True Records (Selected Columns - Sorted by Timestamp):
     userID  deviceID  eventStatus               timestamp
124       2         0         True 2023-12-30 23:30:00.001
126       1         2         True 2023-12-30 23:10:00.001
710       1         2         True 2023-12-30 20:10:00.001
367       1         4         True 2023-12-07 05:00:00.001
598       1         4         True 2023-12-06 14:00:00.001
..      ...       ...          ...                     ...
504       1         4         True 2023-10-21 07:12:00.001
677       1         4         True 2023-10-21 05:04:00.001
188       1         4         True 2023-10-20 14:01:00.001
133       1         4         True 2023-10-20 07:03:00.001
240       1         4         True 2023-10-20 05:10:00.001

[480 rows x 4 columns]

False Records (Selected Columns - Sorted by Timestamp):
     userID  deviceID  eventStatus               timestamp
567       1         2        False 2023-12-30 22:00:00.001
721       0         1        False 202

In [370]:
# Function to extract hours and minutes from timestamp
def extract_hours_minutes(df):
    """Return a copy of ``df`` with ``hours`` and ``minutes`` columns added.

    Args:
        df: DataFrame with a datetime-typed ``timestamp`` column.

    Returns:
        A new DataFrame holding every original column plus ``hours`` and
        ``minutes`` taken from ``timestamp``. The input frame is left
        untouched, so calling this on a frame that other cells still use
        (or on a slice of one) no longer modifies it behind their back.
    """
    return df.assign(
        hours=df['timestamp'].dt.hour,
        minutes=df['timestamp'].dt.minute,
    )

def _latest_records_per_pair(frame, n=10):
    """Keep the ``n`` newest rows of every ('userID', 'deviceID') pair.

    Rows are ordered newest-first with a stable sort (an already sorted frame
    keeps its order), so the result does not depend on an earlier cell having
    sorted the data. Pairs appear in ascending key order, newest row first
    within each pair, and the index is renumbered from 0.

    Iterating over the groups keeps the grouping columns in each piece, which
    avoids ``groupby.apply`` operating on grouping columns (deprecated in
    recent pandas). ``head(n)`` already returns the whole group when it has
    fewer than ``n`` rows, so no length check is needed.
    """
    newest_first = frame.sort_values(by='timestamp', ascending=False, kind='stable')
    pieces = [
        group.head(n)
        for _, group in newest_first.groupby(['userID', 'deviceID'])
    ]
    # pd.concat rejects an empty list; an empty input simply stays empty
    latest = pd.concat(pieces) if pieces else newest_first.head(0)
    return (
        latest
        .pipe(extract_hours_minutes)  # Extract hours and minutes
        .reset_index(drop=True)
    )

# Select the latest 10 records for each unique combination of 'userID' and 'deviceID' where 'eventStatus' is True
df_true_selected_latest_10 = _latest_records_per_pair(df_true_selected, n=10)

# Select the latest 10 records for each unique combination of 'userID' and 'deviceID' where 'eventStatus' is False
df_false_selected_latest_10 = _latest_records_per_pair(df_false_selected, n=10)

# Display the DataFrames
print("Latest 10 True Records (Selected Columns - Sorted by Timestamp - with Hours and Minutes):")
print(df_true_selected_latest_10)

print("\nLatest 10 False Records (Selected Columns - Sorted by Timestamp - with Hours and Minutes):")
print(df_false_selected_latest_10)


Latest 10 True Records (Selected Columns - Sorted by Timestamp - with Hours and Minutes):
    userID  deviceID  eventStatus               timestamp  hours  minutes
0        0         1         True 2023-11-29 20:00:00.001     20        0
1        0         1         True 2023-11-29 14:00:00.001     14        0
2        0         1         True 2023-11-28 23:00:00.001     23        0
3        0         1         True 2023-11-28 20:00:00.001     20        0
4        0         1         True 2023-11-28 14:00:00.001     14        0
5        0         1         True 2023-11-27 23:00:00.001     23        0
6        0         1         True 2023-11-27 20:00:00.001     20        0
7        0         1         True 2023-11-27 14:00:00.001     14        0
8        0         1         True 2023-11-26 23:00:00.001     23        0
9        0         1         True 2023-11-26 20:00:00.001     20        0
10       1         2         True 2023-12-30 23:10:00.001     23       10
11       1         2  

In [373]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# from sklearn.externals import joblib  # Use joblib for saving models

# Concatenate the 'Latest 10 True Records' and 'Latest 10 False Records' DataFrames
df_combined = pd.concat([df_true_selected_latest_10, df_false_selected_latest_10], ignore_index=True)

# Separate features (X) and target variable (y)
X = df_combined[['userID', 'deviceID', 'hours', 'minutes']]
y = df_combined['eventStatus']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize the features using StandardScaler (fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models: the models were trained on
# scaled features, so new data must go through this exact scaler before
# prediction. A scaler re-fitted on new data would produce different inputs.
joblib.dump(scaler, 'scaler.joblib')

# The three classifiers, each with its report heading and output file
logreg_model = LogisticRegression(random_state=42)
svm_model = SVC(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
model_specs = [
    ("Logistic Regression:", logreg_model, 'logreg_model.joblib'),
    ("\nSupport Vector Machine (SVM):", svm_model, 'svm_model.joblib'),
    ("\nRandom Forest:", rf_model, 'rf_model.joblib'),
]

# Fit each model, predict on the held-out split and save it to a .joblib file
test_predictions = []
for _heading, model, model_path in model_specs:
    model.fit(X_train_scaled, y_train)
    test_predictions.append(model.predict(X_test_scaled))
    joblib.dump(model, model_path)

# Keep the individual prediction names available for later cells
y_pred_logreg, y_pred_svm, y_pred_rf = test_predictions

# Evaluate the models
for (heading, _model, _model_path), y_pred in zip(model_specs, test_predictions):
    print(heading)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression:
Accuracy: 0.5333333333333333
Classification Report:
               precision    recall  f1-score   support

       False       0.50      0.36      0.42        14
        True       0.55      0.69      0.61        16

    accuracy                           0.53        30
   macro avg       0.53      0.52      0.51        30
weighted avg       0.53      0.53      0.52        30


Support Vector Machine (SVM):
Accuracy: 0.6333333333333333
Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.21      0.35        14
        True       0.59      1.00      0.74        16

    accuracy                           0.63        30
   macro avg       0.80      0.61      0.55        30
weighted avg       0.78      0.63      0.56        30


Random Forest:
Accuracy: 0.9666666666666667
Classification Report:
               precision    recall  f1-score   support

       False       0.93      1.00      0.97        14
        Tr

In [None]:
# import pandas as pd
# import joblib
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# # from sklearn.externals import joblib  # Use joblib for saving models

# # Concatenate the 'Latest 10 True Records' and 'Latest 10 False Records' DataFrames
# df_combined = pd.concat([df_true_selected_latest_10, df_false_selected_latest_10], ignore_index=True)

# # Separate features (X) and target variable (y)
# X = df_combined[['userID', 'deviceID', 'hours', 'minutes']]
# y = df_combined['eventStatus']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# # Standardize the features using StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Logistic Regression
# logreg_model = LogisticRegression(random_state=42)
# logreg_model.fit(X_train_scaled, y_train)
# y_pred_logreg = logreg_model.predict(X_test_scaled)

# # Save Logistic Regression model to a .joblib file
# joblib.dump(logreg_model, 'logreg_model.joblib')

# # SVM
# svm_model = SVC(random_state=42)
# svm_model.fit(X_train_scaled, y_train)
# y_pred_svm = svm_model.predict(X_test_scaled)

# # Save SVM model to a .joblib file
# joblib.dump(svm_model, 'svm_model.joblib')

# # Random Forest
# rf_model = RandomForestClassifier(random_state=42)
# rf_model.fit(X_train_scaled, y_train)
# y_pred_rf = rf_model.predict(X_test_scaled)

# # Save Random Forest model to a .joblib file
# joblib.dump(rf_model, 'rf_model.joblib')

# # Evaluate the models
# print("Logistic Regression:")
# print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
# print("Classification Report:\n", classification_report(y_test, y_pred_logreg))

# print("\nSupport Vector Machine (SVM):")
# print("Accuracy:", accuracy_score(y_test, y_pred_svm))
# print("Classification Report:\n", classification_report(y_test, y_pred_svm))

# print("\nRandom Forest:")
# print("Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Classification Report:\n", classification_report(y_test, y_pred_rf))


# **FOR TESTING**

In [374]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime

# Restore the three persisted classifiers from their .joblib files.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files that
# were produced by a trusted run of this notebook.
logreg_model, svm_model, rf_model = (
    joblib.load(model_path)
    for model_path in ('logreg_model.joblib', 'svm_model.joblib', 'rf_model.joblib')
)

# Function to collect data, perform calculations, and make predictions
def predict_event_status(user_id, device_id, timestamp_str,
                         user_encoder=None, device_encoder=None, scaler=None):
    """Predict ``eventStatus`` for one user/device/time triple with all three models.

    Args:
        user_id: Raw user identifier.
        device_id: Raw device identifier.
        timestamp_str: Timestamp string in any form ``pd.to_datetime`` can
            parse, e.g. ``'2023-11-14 23:10:34'`` or
            ``'2023-11-14T23:10:34.685496'``. Only hour and minute are used.
        user_encoder: Fitted label encoder for ``userID`` from training.
        device_encoder: Fitted label encoder for ``deviceID`` from training.
        scaler: Fitted feature scaler from training.

    Returns:
        dict: One prediction per model, keyed by
        ``'Logistic Regression Prediction'``, ``'SVM Prediction'`` and
        ``'Random Forest Prediction'``.

    Raises:
        ValueError: If ``timestamp_str`` cannot be parsed, or a supplied
            encoder has never seen the given identifier.

    Warns:
        UserWarning: If any fitted transformer is omitted. The missing ones are
        then fitted on this single row (the legacy behaviour), which encodes
        every ID as 0 and scales every feature to 0, so the predictions do not
        depend on the input at all.
    """
    import warnings

    # Let pandas detect the layout: the previous fixed '%Y-%m-%d %H:%M:%S'
    # format rejected ISO strings using a 'T' separator or fractional seconds.
    timestamp = pd.to_datetime(timestamp_str)

    # Extract hour and minute from the timestamp
    hour = timestamp.hour
    minute = timestamp.minute

    # Create a DataFrame with the collected data
    feature_columns = ['userID', 'deviceID', 'hours', 'minutes']
    new_data = pd.DataFrame({
        'userID': [user_id],
        'deviceID': [device_id],
        'hours': [hour],
        'minutes': [minute]
    })

    print('hours', hour, 'minutes', minute)

    if user_encoder is None or device_encoder is None or scaler is None:
        warnings.warn(
            "predict_event_status called without the fitted training "
            "encoders/scaler; transformers fitted on a single row map every "
            "input to the same all-zero feature vector, so the predictions "
            "are not meaningful.",
            UserWarning,
            stacklevel=2,
        )

    # Apply label encoding to 'userID' and 'deviceID' with the training
    # mapping when available (legacy single-row fit otherwise)
    if user_encoder is None:
        user_encoder = LabelEncoder().fit(new_data['userID'])
    if device_encoder is None:
        device_encoder = LabelEncoder().fit(new_data['deviceID'])
    new_data['userID'] = user_encoder.transform(new_data['userID'])
    new_data['deviceID'] = device_encoder.transform(new_data['deviceID'])

    # Scale with the training statistics when available (legacy single-row
    # fit otherwise)
    if scaler is None:
        scaler = StandardScaler().fit(new_data[feature_columns])
    new_data_scaled = scaler.transform(new_data[feature_columns])

    # Make predictions using the loaded models
    logreg_prediction = logreg_model.predict(new_data_scaled)
    svm_prediction = svm_model.predict(new_data_scaled)
    rf_prediction = rf_model.predict(new_data_scaled)

    # Return the predictions
    return {
        'Logistic Regression Prediction': logreg_prediction[0],
        'SVM Prediction': svm_prediction[0],
        'Random Forest Prediction': rf_prediction[0]
    }

# Example usage: one user/device pair and a timestamp string
user_id = 'fat06b84-ceb1-4f65-8407-13cd12663818'
device_id = '111j6h66-938q-278r-8e24-4t99e27q8371'
timestamp_str = '2023-11-14T23:10:34.685496'

predictions = predict_event_status(
    user_id=user_id,
    device_id=device_id,
    timestamp_str=timestamp_str,
)
print(predictions)


FileNotFoundError: [Errno 2] No such file or directory: 'logistic regression_trained_model.joblib'

In [None]:
# import awswrangler as wr
# import pandas as pd
# from pathlib import Path

# wr.dynamodb.put_csv(path="results.csv", table_name="DeviceTimeStamp-zvvc26kofnethfd47fmdbb2skm-dev")