# Gerekli Kütüphaneler Yükleniyor

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd
import numpy as np
import os
from datetime import datetime
from collections import Counter
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from matplotlib import pyplot

from scipy.fftpack import fft

import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE, RFECV, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

import xgboost as xgb
import lightgbm as lgb

import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised
# by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')
# Let pandas display up to 100 rows and 100 columns before truncating.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [None]:
# Report how many logical CPUs the operating system exposes
# (os.cpu_count() may return None when this cannot be determined).
n_cpu = os.cpu_count()
print("Number of CPUs in the system:", n_cpu)

In [None]:
#NUM_OF_FEATURES=int(input("Number of Features. If you want to continue without feature selection, enter 0."))

# Veriler okunuyor

In [None]:
# Root folder that holds the whitespace-separated data files.
anaKlasor = "C:\\Users\\cihan.aksop\\Desktop\\"

# Read the four splits in a fixed order; none of the files has a header row.
X_train, X_test, y_train, y_test = (
    pd.read_csv(os.path.join(anaKlasor, file_name), sep='\\s+', header=None)
    for file_name in ('X_train.txt', 'X_test.txt', 'y_train.txt', 'y_test.txt')
)

print("Data Loaded Successfully")
for split_message in (f"Shape of train data: {X_train.shape}",
                      f"Shape of test data: {X_test.shape}"):
    print(split_message)

In [None]:
# Stack the train and test splits (fresh 0..n-1 index) for exploratory analysis
X = pd.concat([X_train, X_test], ignore_index=True)
# The single label column is named "activity"
y = pd.concat([y_train, y_test], ignore_index=True).set_axis(["activity"], axis=1)

In [None]:
# Read the id -> name table of activities and attach the readable name to y
activity_labels = pd.read_csv(
    os.path.join(anaKlasor, 'activity_labels.txt'),
    sep=' ',
    header=None,
    names=['id', 'activity_name'],
)
activity_name_by_id = activity_labels.set_index('id')['activity_name']
y['activity_name'] = y['activity'].map(activity_name_by_id)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Show pandas' summary statistics for the training features.
display(X_train.describe())

In [None]:
# Count the missing entries per column. The previous `.mean()` printed the
# fraction of missing values, which did not match the "Number of" header.
print("Number of missing values")
print(X_train.isnull().sum())

In [None]:
# List the dtype pandas inferred for every training column.
print("Data types:")
print(X_train.dtypes)

In [None]:
# Remove constant columns: a column with a single distinct value carries no
# information. The same columns are dropped from X_test so both splits keep
# an identical feature layout for the scaler and models fitted later.
tekil_deger_sayisi = X_train.nunique()
tekil_degerli_sutunlar = tekil_deger_sayisi[tekil_deger_sayisi == 1].index
print("Number of Constant Columns: " + str(len(tekil_degerli_sutunlar)))
X_train.drop(tekil_degerli_sutunlar, axis=1, inplace=True)
X_test.drop(tekil_degerli_sutunlar, axis=1, inplace=True)

In [None]:
# Remove duplicated feature rows (keeping the first occurrence) and drop the
# matching labels as well, so X_train and y_train stay row-aligned.
# A positional boolean mask is used so both frames are filtered identically.
benzersiz_satir_maskesi = ~X_train.duplicated().to_numpy()
X_train = X_train.loc[benzersiz_satir_maskesi].reset_index(drop=True)
y_train = y_train.loc[benzersiz_satir_maskesi].reset_index(drop=True)

In [None]:
# Report the training-feature shape after the clean-up steps above.
print(f"Shape of the new dataset:{X_train.shape}")

In [None]:
# Show how many training samples each label value has.
print("Frequencies of y train values")
print(y_train.value_counts())

In [None]:
# Draw the correlation matrix between the features and the label.
# The label is joined as an extra, explicitly named column (axis=1); the
# default axis=0 would append the labels as extra rows under feature column 0.
pd.concat([X_train, y_train.set_axis(["activity"], axis=1)], axis=1).corr().style.background_gradient()

In [None]:
def add_fft_features(X):
    """Append the FFT magnitude of every row to the original columns.

    Args:
        X: 2-D array-like; the transform is taken along axis 1.

    Returns:
        A NumPy array with twice as many columns as ``X``: the original
        values followed by the absolute values of the row-wise FFT.
    """
    spectrum_magnitude = np.abs(fft(X, axis=1))
    return np.concatenate((X, spectrum_magnitude), axis=1)
#X_train = add_fft_features(X_train)
#X_test = add_fft_features(X_test)

In [None]:
# Rank the features of the combined data with the ANOVA F-test, keep the top 10
selector = SelectKBest(score_func=f_classif, k=10).fit(X, y['activity'])
X_selected = selector.transform(X)
# Column positions of the kept features, and a display label for each of them
selected_feature_indices = selector.get_support(indices=True)
selected_features = [f'Feature {i}' for i in selected_feature_indices]

In [None]:
# Wrap the selected columns in a labelled DataFrame (plus the activity name)
# so the plots below can address them by name
X_selected_df = pd.DataFrame(X_selected, columns=selected_features).assign(
    activity_name=y['activity_name']
)

In [None]:
# Report the shape of the selected-feature matrix (X_train is not changed by
# the selection step, so printing its shape here was misleading).
print(f"Data size after feature selection: {X_selected.shape}")

In [None]:
# 1. Activity Distribution
# Share of each activity in the combined data, expressed in percent.
activity_distribution = y['activity_name'].value_counts(normalize=True) * 100
plt.figure(figsize=(8, 5))
activity_distribution.plot(kind="bar", color="skyblue", edgecolor="black")
plt.title("Activity Distribution")
plt.xlabel("Activity")
plt.ylabel("Percentage (%)")
# Rotate the tick labels so the activity names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# 2. Correlation Matrix for Selected Features
# Pairwise correlation of the selected feature columns only.
correlation_matrix = X_selected_df[selected_features].corr()
plt.figure(figsize=(8, 6))
# annot/fmt print every coefficient with two decimals inside its cell.
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix of Selected Features")
plt.tight_layout()
plt.show()

In [None]:
# 3. Pairplot of Selected Features by Activity
# Only the first three selected features are plotted, coloured by activity.
sns.pairplot(X_selected_df, vars=selected_features[:3], hue="activity_name", palette="Set2")
# y=1.02 places the title slightly above the grid of axes.
plt.suptitle("Pairplot of Selected Features by Activity", y=1.02)
plt.show()

In [None]:
# 4. Feature Trends (Selected Features)
plt.figure(figsize=(8, 5))
# One line per (feature, activity) pair for the first three selected features.
for feature in selected_features[:3]:
    for activity in X_selected_df['activity_name'].unique():
        # First 50 rows of this activity, smoothed with a 5-sample rolling mean.
        subset = X_selected_df[X_selected_df['activity_name'] == activity].head(50)
        plt.plot(subset.index, subset[feature].rolling(5).mean(), label=f"{activity} ({feature})", alpha=0.7)

plt.title("Feature Trends: Smoothed Selected Features")
plt.xlabel("Sample Index")
plt.ylabel("Feature Value")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# 5. Clustering Example with Selected Features
# K-means with three clusters on the selected features; the cluster id is
# stored as a new 'cluster' column of X_selected_df.
kmeans = KMeans(n_clusters=3, random_state=42)
X_selected_df['cluster'] = kmeans.fit_predict(X_selected_df[selected_features])
plt.figure(figsize=(8, 5))
# Scatter of the first two selected features, coloured by cluster id.
sns.scatterplot(
    x=selected_features[0], y=selected_features[1],
    hue="cluster", palette="viridis", data=X_selected_df, legend="full")
plt.title("Clustering Results Based on Selected Features")
plt.xlabel(selected_features[0])
plt.ylabel(selected_features[1])
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

In [None]:
# 6. Feature Importance Visualization
# F-scores computed by the selector, restricted to the features it kept.
feature_scores = selector.scores_[selected_feature_indices]
plt.figure(figsize=(10, 5))
plt.barh(selected_features, feature_scores, color="skyblue")
plt.title("Feature Importance Based on ANOVA F-Test")
plt.xlabel("F-Score")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

In [None]:
# 7. Boxplot for Each Selected Feature by Activity
plt.figure(figsize=(12, 8))
# One panel per feature (first three selected features) in a 1x3 grid.
for i, feature in enumerate(selected_features[:3]):
    plt.subplot(1, 3, i+1)
    sns.boxplot(x="activity_name", y=feature, data=X_selected_df, palette="Set3")
    plt.title(f"{feature} Distribution by Activity")
    plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# 8. Histogram of Selected Features
plt.figure(figsize=(10, 6))
# Overlay the histograms (with KDE curve) of the first three selected
# features on the same axes.
for feature in selected_features[:3]:
    sns.histplot(X_selected_df[feature], kde=True, label=feature, alpha=0.7)
plt.title("Histogram of Selected Features")
plt.xlabel("Feature Value")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
def preprocess_data(X_train, X_test, y_train, y_test, sample_size=100000):
    """Scale, balance, resample and label-encode the train/test splits.

    Steps, in order:
      1. Standardise the features (the scaler is fitted on the training
         split only and then applied to the test split).
      2. Balance the training classes with SMOTE.
      3. Draw ``sample_size`` training rows with ``sklearn.utils.resample``.
      4. Encode the labels as consecutive integers with ``LabelEncoder``.

    Args:
        X_train: Training features (array-like, 2-D).
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels; a single-column frame or a 1-D array.
        y_test: Test labels; a single-column frame or a 1-D array.
        sample_size: Number of training rows to draw after SMOTE. Pass
            ``None`` to skip the resampling step and keep the SMOTE output.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Flatten the labels up front so every later step receives plain 1-D
    # arrays, whatever container the caller passed in.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    if sample_size is not None:
        # Honour the caller's sample_size (it used to be ignored in favour of
        # a hard-coded 100000).
        # NOTE(review): resample draws with replacement by default, so rows
        # repeat and can end up on both sides of a cross-validation split.
        X_train, y_train = resample(X_train, y_train, n_samples=sample_size, random_state=42)

    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)
    print(f"Data Preprocessing Completed. Training samples: {len(X_train)}")
    return X_train, X_test, y_train, y_test

# Replace the raw splits with their scaled / resampled / label-encoded versions.
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

# Makine Öğrenmesi Algoritmaları Uygulanıyor

In [None]:
# Define the machine-learning methods and their hyper-parameter grids
accuracies = {}
parameter_group = {}
parameter_group["Decision Tree"] = {'function': DecisionTreeClassifier(random_state=1),
                                    'parameters': {'criterion': ["entropy", "log_loss"],
                                                   # 'auto' was only an alias of 'sqrt' for classifiers and is
                                                   # rejected by current scikit-learn releases, so it is left out.
                                                   'max_features': ['sqrt', 'log2'],
                                                   'ccp_alpha': [.001, .0001, .00001],
                                                   'max_depth': [5, 10, 15]}}

parameter_group["K-Neightbors"] = {'function': KNeighborsClassifier(),
                                   'parameters': {'n_neighbors': [2, 5],
                                                  'weights': ["uniform", "distance"],
                                                  'algorithm': ["ball_tree", "kd_tree"],
                                                  'metric': ['euclidean', 'manhattan'],
                                                  'p': [1]}}
# `degree` only affects the polynomial kernel, so it is searched for that
# kernel alone; crossing it with the other kernels just repeats identical fits.
parameter_group["SVC"] = {'function': SVC(),
                          'parameters': [{'C': [1, 10],
                                          'kernel': ["linear", "rbf", "sigmoid"]},
                                         {'C': [1, 10],
                                          'kernel': ["poly"],
                                          'degree': [5, 7, 10]}]}
parameter_group["Random Forest"] = {'function': RandomForestClassifier(),
                                    'parameters': {'n_estimators': [5, 7, 10],
                                                   'criterion': ["gini", "entropy", "log_loss"],
                                                   'max_features': ["sqrt", "log2"]}}
parameter_group["Extra Trees"] = {'function': ExtraTreesClassifier(),
                                  'parameters': {'criterion': ["gini", "entropy", "log_loss"],
                                                 'max_features': ["sqrt", "log2"]}}

parameter_group["XGB"] = {'function': xgb.XGBClassifier(objective='multi:softmax', seed=42, num_class=len(np.unique(y_train)), max_depth=3),
                          'parameters': {'min_child_weight': [1, 3],
                                         'gamma': [0.5, 1],
                                         'subsample': [0.6, 0.8],
                                         'colsample_bytree': [0.6, 0.8]}}

parameter_group["LGBM"] = {'function': lgb.LGBMClassifier(objective='multiclass', boosting_type="dart", num_class=len(np.unique(y_train)), verbose=0, random_state=42),
                           'parameters': {'learning_rate': [0.01, .1],
                                          'n_estimators': [8, 16],
                                          'reg_lambda': [1, 1.2]}}

parameter_group["Logistic"] = {'function': LogisticRegression(),
                               'parameters': {"penalty": ["l2"]}}

#parameter_group["Gradient Boosting"] = {'function': GradientBoostingClassifier(random_state=42),
#                                        'parameters': {"learning_rate": [0.01, 0.1],
#                                                        "max_depth":[3,8]
#                                                        }}


def _report_split(fitted_model, features, targets, split_name):
    """Print the metrics of one data split, plot its confusion matrix and return the accuracy.

    ``split_name`` is the capitalised label of the split ("Train" or "Test");
    it is used in the printed headers.
    """
    predictions = fitted_model.predict(features)
    split_accuracy = accuracy_score(targets, predictions)

    print('{} RESULTS'.format(split_name.upper()))
    print('{} Accuracy: {:.3f}'.format(split_name, split_accuracy))
    print(classification_report(targets, predictions))
    print('Confusion matrix:')
    ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(targets, predictions)).plot()
    plt.show()
    return split_accuracy


# Tune every method with a 5-fold grid search, then report train/test metrics.
for method, spec in parameter_group.items():
    print(method)
    model = GridSearchCV(spec["function"], spec["parameters"], n_jobs=-1, verbose=1, cv=5)
    model.fit(X_train, y_train)
    print(model.best_params_)

    _report_split(model, X_train, y_train, "Train")
    accuracy = _report_split(model, X_test, y_test, "Test")

    # Keep the refitted best estimator together with its test accuracy.
    accuracies[method] = {'model': model.best_estimator_,
                          'accuracy': accuracy}

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Number of distinct (encoded) training labels
num_classes = len(np.unique(y_train))

# Turn the NumPy splits into PyTorch tensors: float32 features, int64 labels
X_train, X_test = (torch.tensor(split, dtype=torch.float32) for split in (X_train, X_test))
y_train, y_test = (torch.tensor(split, dtype=torch.long) for split in (y_train, y_test))

# Wrap the tensors in datasets first ...
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# ... then in mini-batch loaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM

In [None]:
class LSTM_MulticlassClassifier(nn.Module):
    """Three-layer LSTM encoder followed by a two-layer classification head.

    Args:
        input_size: Number of features per time step.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super(LSTM_MulticlassClassifier, self).__init__()
        # Attribute names (fc1/fc2/fc3) are kept so the parameter names of
        # the module do not change.
        self.fc1 = nn.LSTM(input_size, 64, 3, batch_first=True)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the class logits for ``x``.

        Args:
            x: Either ``(batch, features)`` — every row is an independent
                sample and is treated as a sequence of length one — or
                ``(batch, seq, features)``.

        Returns:
            Logits of shape ``(batch, num_classes)`` for 2-D input and
            ``(batch, seq, num_classes)`` for 3-D input.
        """
        # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence, which
        # would let the hidden state flow from one sample of the mini-batch
        # to the next. Add an explicit length-1 time axis instead.
        is_flat_batch = x.dim() == 2
        if is_flat_batch:
            x = x.unsqueeze(1)
        x, _ = self.fc1(x)
        if is_flat_batch:
            x = x.squeeze(1)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Build the network together with its loss and optimiser
input_size = X_train.shape[1]

model = LSTM_MulticlassClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fit: one optimiser step per mini-batch, for a fixed number of epochs
num_epochs = 10
for epoch in range(num_epochs):
    print(epoch)
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Score on the test loader without tracking gradients
with torch.no_grad():
    correct, total = 0, 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # The index of the largest logit is the predicted class
        predicted = torch.max(outputs.data, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracies["LSTM"] = {'model': model, 'accuracy': correct / total}
    print(f'Accuracy of the network on the test data: {100 * correct / total} %')

# GRU

In [None]:
class GRU_MulticlassClassifier(nn.Module):
    """Single-layer GRU encoder followed by a two-layer classification head.

    Args:
        input_size: Number of features per time step.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super(GRU_MulticlassClassifier, self).__init__()
        # Attribute names (fc1/fc2/fc3) are kept so the parameter names of
        # the module do not change. The GRU uses the default sequence-first
        # layout (batch_first=False).
        self.fc1 = nn.GRU(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the class logits for ``x``.

        Args:
            x: Either ``(batch, features)`` — every row is an independent
                sample and is treated as a sequence of length one — or
                ``(seq, batch, features)``.

        Returns:
            Logits of shape ``(batch, num_classes)`` for 2-D input and
            ``(seq, batch, num_classes)`` for 3-D input.
        """
        # nn.GRU interprets a 2-D tensor as ONE unbatched sequence, which
        # would let the hidden state flow from one sample of the mini-batch
        # to the next. Add an explicit length-1 time axis in front instead.
        is_flat_batch = x.dim() == 2
        if is_flat_batch:
            x = x.unsqueeze(0)
        x, _ = self.fc1(x)
        if is_flat_batch:
            x = x.squeeze(0)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Build the network together with its loss and optimiser
input_size = X_train.shape[1]

model = GRU_MulticlassClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fit: one optimiser step per mini-batch, for a fixed number of epochs
num_epochs = 10
for epoch in range(num_epochs):
    print(epoch)
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Score on the test loader without tracking gradients
with torch.no_grad():
    correct, total = 0, 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # The index of the largest logit is the predicted class
        predicted = torch.max(outputs.data, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracies["GRU"] = {'model': model, 'accuracy': correct / total}
    print(f'Accuracy of the network on the test data: {100 * correct / total} %')

In [None]:
class BiLSTM_MulticlassClassifier(nn.Module):
    """Three-layer bidirectional LSTM encoder with a two-layer classification head.

    Args:
        input_size: Number of features per time step.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super(BiLSTM_MulticlassClassifier, self).__init__()
        # Attribute names (fc1/fc2/fc3) are kept so the parameter names of
        # the module do not change.
        self.fc1 = nn.LSTM(input_size, 64, 3, bidirectional=True, batch_first=True)
        self.relu = nn.ReLU()
        # 128 = 2 directions x 64 hidden units
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the class logits for ``x``.

        Args:
            x: Either ``(batch, features)`` — every row is an independent
                sample and is treated as a sequence of length one — or
                ``(batch, seq, features)``.

        Returns:
            Logits of shape ``(batch, num_classes)`` for 2-D input and
            ``(batch, seq, num_classes)`` for 3-D input.
        """
        # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence; with a
        # bidirectional network every sample of the mini-batch would then
        # influence every other one. Add an explicit length-1 time axis.
        is_flat_batch = x.dim() == 2
        if is_flat_batch:
            x = x.unsqueeze(1)
        x, _ = self.fc1(x)
        if is_flat_batch:
            x = x.squeeze(1)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Build the network together with its loss and optimiser
input_size = X_train.shape[1]

model = BiLSTM_MulticlassClassifier(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fit: one optimiser step per mini-batch, for a fixed number of epochs
num_epochs = 10
for epoch in range(num_epochs):
    print(epoch)
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Score on the test loader without tracking gradients
with torch.no_grad():
    correct, total = 0, 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # The index of the largest logit is the predicted class
        predicted = torch.max(outputs.data, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracies["BiLSTM"] = {'model': model, 'accuracy': correct / total}
    print(f'Accuracy of the network on the test data: {100 * correct / total} %')

# Meta Learner

In [None]:
# Base models whose predicted class labels become the meta-learner's features.
_TRADITIONAL_MODELS = ("Decision Tree", "Random Forest", "Extra Trees", "XGB", "LGBM", "Logistic")
_NEURAL_MODELS = ("LSTM", "GRU", "BiLSTM")


def _as_numpy(data):
    """Return ``data`` as a NumPy array (tensors are detached and converted)."""
    if isinstance(data, torch.Tensor):
        return data.detach().cpu().numpy()
    return np.asarray(data)


def _traditional_predictions(data):
    """Return an (n_samples, n_models) array of labels predicted by the tuned estimators."""
    # The estimators were fitted on NumPy arrays; hand them NumPy arrays again
    # instead of relying on each library's implicit tensor conversion.
    features = _as_numpy(data)
    return np.column_stack([accuracies[name]["model"].predict(features)
                            for name in _TRADITIONAL_MODELS])


def _neural_predictions(data):
    """Return an (n_samples, n_models) array of labels predicted by the recurrent networks."""
    inputs = torch.as_tensor(data, dtype=torch.float32)
    columns = []
    # Inference only: skip autograd bookkeeping for the full-dataset pass.
    with torch.no_grad():
        for name in _NEURAL_MODELS:
            logits = accuracies[name]["model"](inputs)
            # The index of the largest logit is the predicted class.
            columns.append(torch.max(logits, 1).indices.cpu().numpy())
    return np.column_stack(columns)


X_traditional_train = _traditional_predictions(X_train)
X_neural_train = _neural_predictions(X_train)

X_traditional_test = _traditional_predictions(X_test)
X_neural_test = _neural_predictions(X_test)

# One column per base model: traditional estimators first, networks last.
X_meta_train = np.c_[X_traditional_train, X_neural_train]
X_meta_test = np.c_[X_traditional_test, X_neural_test]

In [None]:
# Fit the stacking model on the base models' predicted labels
meta_learner = LogisticRegression(max_iter=1000, random_state=42).fit(X_meta_train, y_train)

# --- Training split ---
y_predict = meta_learner.predict(X_meta_train)
accuracy = accuracy_score(y_train, y_predict)

print('TRAIN RESULTS')
print('Train Accuracy: {:.3f}'.format(accuracy))
print(classification_report(y_train, y_predict))
print('Confusion matrix:')
conf_matrix = confusion_matrix(y_train, y_predict)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()
plt.show()

# --- Test split ---
y_predict = meta_learner.predict(X_meta_test)
accuracy = accuracy_score(y_test, y_predict)

print('TEST RESULTS')
print('Test Accuracy: {:.3f}'.format(accuracy))
print(classification_report(y_test, y_predict))
print('Confusion matrix:')
conf_matrix = confusion_matrix(y_test, y_predict)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()
plt.show()