In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone

# Load the dataset
data = pd.read_csv('sensor_data_1000.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 0', 'timestamp'])

# Encode the 'location' column
label_encoder = LabelEncoder()
data['location'] = label_encoder.fit_transform(data['location'])

# Normalize the features
scaler = StandardScaler()
features = data.drop(columns=['anomalized'])
scaled_features = scaler.fit_transform(features)
scaled_data = pd.DataFrame(scaled_features, columns=features.columns)
scaled_data['anomalized'] = data['anomalized']

# Split the dataset by location
locations = scaled_data['location'].unique()
location_data = {loc: scaled_data[scaled_data['location'] == loc].drop(columns=['location']) for loc in locations}

# Create training and test sets for each location
train_data = {}
test_data = {}
for loc in locations:
    loc_data = location_data[loc]
    train, test = train_test_split(loc_data, test_size=0.2, random_state=42)
    train_data[loc] = train
    test_data[loc] = test

# Define the list of models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Training function for each model
def train_models(data, models):
    trained_models = {}
    for name, model in models.items():
        model_clone = clone(model)
        X_train = data.drop(columns=['anomalized']).values
        y_train = data['anomalized'].values
        model_clone.fit(X_train, y_train)
        trained_models[name] = model_clone
    return trained_models

# Evaluate models and print metrics
def evaluate_models(test_data, test_labels, models, location):
    metrics = {}
    for name, model in models.items():
        y_pred = model.predict(test_data)
        accuracy = accuracy_score(test_labels, y_pred)
        precision = precision_score(test_labels, y_pred)
        recall = recall_score(test_labels, y_pred)
        f1 = f1_score(test_labels, y_pred)
        metrics[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        print(f'Location: {location}, Model: {name}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')
    return metrics

# Aggregate predictions function
def aggregate_predictions(test_data, models):
    predictions = np.array([model.predict_proba(test_data)[:, 1] for model in models.values()])
    avg_prediction = np.mean(predictions, axis=0)
    final_prediction = (avg_prediction > 0.5).astype(int)
    return final_prediction

# Train models on each client's data, evaluate them, and aggregate results
global_models = {}
all_test_data = np.vstack([test_data[loc].drop(columns=['anomalized']).values for loc in locations])
all_test_labels = np.hstack([test_data[loc]['anomalized'].values for loc in locations])

for loc in locations:
    print(f'\nTraining and evaluating models for location: {loc}')
    global_models[loc] = train_models(train_data[loc], models)
    evaluate_models(test_data[loc].drop(columns=['anomalized']).values, test_data[loc]['anomalized'].values, global_models[loc], loc)

# Aggregate predictions from all models of all locations
final_predictions = aggregate_predictions(all_test_data, {name: global_models[loc][name] for loc in locations for name in models})
accuracy = accuracy_score(all_test_labels, final_predictions)
print(f'\nFinal Aggregated Model Accuracy: {accuracy}')



Training and evaluating models for location: -1.697749375254331
Location: -1.697749375254331, Model: Logistic Regression, Accuracy: 0.8383333333333334, Precision: 0.8798076923076923, Recall: 0.8218562874251497, F1 Score: 0.8498452012383901
Location: -1.697749375254331, Model: Random Forest, Accuracy: 0.9991666666666666, Precision: 1.0, Recall: 0.9985029940119761, F1 Score: 0.9992509363295881
Location: -1.697749375254331, Model: SVM, Accuracy: 0.9316666666666666, Precision: 0.9521604938271605, Recall: 0.9236526946107785, F1 Score: 0.9376899696048633
Location: -1.697749375254331, Model: KNN, Accuracy: 0.9066666666666666, Precision: 0.954248366013072, Recall: 0.874251497005988, F1 Score: 0.9125

Training and evaluating models for location: -0.9354945537115701
Location: -0.9354945537115701, Model: Logistic Regression, Accuracy: 0.8291666666666667, Precision: 0.8438538205980066, Recall: 0.8206785137318255, F1 Score: 0.8321048321048321
Location: -0.9354945537115701, Model: Random Forest, Ac

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone

# Load the dataset
data = pd.read_csv('sensor_data_1000.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 0', 'timestamp'])

# Encode the 'location' column
label_encoder = LabelEncoder()
data['location_encoded'] = label_encoder.fit_transform(data['location'])

# Normalize the features
scaler = StandardScaler()
features = data.drop(columns=['anomalized', 'location'])
scaled_features = scaler.fit_transform(features)
scaled_data = pd.DataFrame(scaled_features, columns=features.columns)
scaled_data['anomalized'] = data['anomalized']
scaled_data['location'] = data['location']
scaled_data['location_encoded'] = data['location_encoded']

# Split the dataset by location
locations = data['location'].unique()
location_data = {loc: scaled_data[scaled_data['location'] == loc].drop(columns=['location_encoded', 'location']) for loc in locations}

# Create training and test sets for each location
train_data = {}
test_data = {}
for loc in locations:
    loc_data = location_data[loc]
    train, test = train_test_split(loc_data, test_size=0.2, random_state=42)
    train_data[loc] = train
    test_data[loc] = test

# Define the list of models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Training function for each model
def train_models(data, models):
    trained_models = {}
    for name, model in models.items():
        model_clone = clone(model)
        X_train = data.drop(columns=['anomalized']).values
        y_train = data['anomalized'].values
        model_clone.fit(X_train, y_train)
        trained_models[name] = model_clone
    return trained_models

# Evaluate models and print metrics
def evaluate_models(test_data, test_labels, models, location):
    metrics = {}
    for name, model in models.items():
        y_pred = model.predict(test_data)
        accuracy = accuracy_score(test_labels, y_pred)
        precision = precision_score(test_labels, y_pred)
        recall = recall_score(test_labels, y_pred)
        f1 = f1_score(test_labels, y_pred)
        metrics[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        print(f'Location: {location}, Model: {name}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')
    return metrics

# Aggregate predictions function
def aggregate_predictions(test_data, models):
    predictions = np.array([model.predict_proba(test_data)[:, 1] for model in models.values()])
    avg_prediction = np.mean(predictions, axis=0)
    final_prediction = (avg_prediction > 0.5).astype(int)
    return final_prediction

# Train models on each client's data, evaluate them, and aggregate results
global_models = {}
all_test_data = np.vstack([test_data[loc].drop(columns=['anomalized']).values for loc in locations])
all_test_labels = np.hstack([test_data[loc]['anomalized'].values for loc in locations])

for loc in locations:
    print(f'\nTraining and evaluating models for location: {loc}')
    global_models[loc] = train_models(train_data[loc], models)
    evaluate_models(test_data[loc].drop(columns=['anomalized']).values, test_data[loc]['anomalized'].values, global_models[loc], loc)

# Aggregate predictions from all models of all locations
all_global_models = {f'{loc}_{name}': global_models[loc][name] for loc in locations for name in models}
final_predictions = aggregate_predictions(all_test_data, all_global_models)
accuracy = accuracy_score(all_test_labels, final_predictions)
print(f'\nFinal Aggregated Model Accuracy: {accuracy}')



Training and evaluating models for location: Point 1
Location: Point 1, Model: Logistic Regression, Accuracy: 0.8383333333333334, Precision: 0.8798076923076923, Recall: 0.8218562874251497, F1 Score: 0.8498452012383901
Location: Point 1, Model: Random Forest, Accuracy: 0.9991666666666666, Precision: 1.0, Recall: 0.9985029940119761, F1 Score: 0.9992509363295881
Location: Point 1, Model: SVM, Accuracy: 0.9316666666666666, Precision: 0.9521604938271605, Recall: 0.9236526946107785, F1 Score: 0.9376899696048633
Location: Point 1, Model: KNN, Accuracy: 0.9066666666666666, Precision: 0.954248366013072, Recall: 0.874251497005988, F1 Score: 0.9125

Training and evaluating models for location: Point 2
Location: Point 2, Model: Logistic Regression, Accuracy: 0.8291666666666667, Precision: 0.8438538205980066, Recall: 0.8206785137318255, F1 Score: 0.8321048321048321
Location: Point 2, Model: Random Forest, Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Location: Point 2, Model: SVM, Accu