In [1]:
import os
from time import time

import joblib
import numpy as np
import pandas as pd

from lightgbm           import LGBMClassifier
from sklearn.base            import clone
from sklearn.compose         import ColumnTransformer
from sklearn.ensemble   import RandomForestClassifier
from sklearn.impute          import SimpleImputer
from sklearn.metrics         import accuracy_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline        import Pipeline, make_pipeline
from sklearn.preprocessing   import MinMaxScaler

In [2]:
# Location of the 5-second-window dataset CSV. The default is the path this
# notebook was originally run with; set the TRACKIT_DATA_PATH environment
# variable to run the notebook against a copy stored somewhere else.
DATA_PATH = os.environ.get(
    'TRACKIT_DATA_PATH',
    r'C:\Users\Igor\Documents\GitHub\AI-Engineering\Chapter 2\Trackit Fitness App\1 Data\dataset_5secondWindow.csv',
)
data = pd.read_csv(DATA_PATH)
# Optional class filters, currently disabled:
# data = data[data['target'] != 'Bus']
# data = data[data['target'] != 'Train']

In [9]:
# data.groupby('user').size().sort_values()

In [11]:
def split_data(data, test_users=('U2',), train_users=None):
    """Split the samples by user so that no user contributes to both sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``user`` column.
    test_users : str or iterable of str, default ``('U2',)``
        Users whose rows form the held-out test set.
    train_users : str or iterable of str, optional
        Users whose rows form the training set. When omitted, every user
        present in ``data`` that is not a test user is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``, row subsets of ``data``.

    Raises
    ------
    ValueError
        If a user is listed in both ``train_users`` and ``test_users``.
    """
    users = data['user'].unique().tolist()
    print(users)

    # A bare string would be rejected by ``isin``; treat it as a single user.
    if isinstance(test_users, str):
        test_users = [test_users]
    test_users = list(test_users)

    if train_users is None:
        # Default: train on everybody who is not held out.
        train_users = [user for user in users if user not in test_users]
    elif isinstance(train_users, str):
        train_users = [train_users]
    train_users = list(train_users)

    overlap = set(train_users) & set(test_users)
    if overlap:
        raise ValueError(f'users present in both train and test sets: {sorted(overlap)}')

    train_data = data[data['user'].isin(train_users)]
    test_data = data[data['user'].isin(test_users)]

    return train_data, test_data


# Hold out the test user(s) first, then report the size of each partition.
df_train, df_test = split_data(data)
print(*(frame.shape for frame in (df_train, df_test)))

# Optional export of the raw partitions (disabled):
# df_train.to_csv('dataset_5secondWindow_TRAIN.csv', index=False)
# df_test.to_csv('dataset_5secondWindow_TEST.csv', index=False)


def clean_data(df_train, df_test, null_threshold=0.75):
    """Drop bookkeeping columns, tidy feature names and remove sparse features.

    Both frames get the same treatment:

    1. the ``activityrecognition#0``, ``time``, ``id`` and ``user`` columns
       are dropped;
    2. the literal prefix ``android.sensor.`` is stripped from the column
       names and ``#`` is replaced by ``_``;
    3. every column whose share of missing values is at least
       ``null_threshold`` in *either* frame is dropped from *both* frames,
       so the two results keep identical columns.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the four bookkeeping columns listed above.
    null_threshold : float, default 0.75
        Minimum fraction of missing values for a column to be dropped.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` as new frames; the inputs are not modified.

    Raises
    ------
    KeyError
        If a bookkeeping column is missing from either frame.
    """
    bookkeeping = ['activityrecognition#0', 'time', 'id', 'user']

    def _tidy(df):
        # Remove bookkeeping columns and make the feature names more readable.
        df = df.drop(columns=bookkeeping)
        # regex=False: the '.' characters in the prefix must match literally.
        # On pandas < 2.0 the default is regex=True, where '.' is a wildcard.
        df.columns = (
            df.columns
            .str.replace('android.sensor.', '', regex=False)
            .str.replace('#', '_', regex=False)
        )
        return df

    def _sparse_columns(df):
        # Names of the columns whose missing-value share reaches the threshold.
        null_share = df.isnull().mean()
        return set(null_share.index[null_share >= null_threshold])

    df_train = _tidy(df_train)
    df_test = _tidy(df_test)

    # NOTE(review): the test frame's missing-value rates also decide which
    # features are removed from the training frame. This is kept as in the
    # original; confirm it is acceptable for the evaluation protocol.
    all_null = sorted(_sparse_columns(df_train) | _sparse_columns(df_test))

    df_train = df_train.drop(columns=all_null)
    df_test = df_test.drop(columns=all_null)

    print(df_train.shape, df_test.shape)
    return df_train, df_test


# Clean both partitions, then separate the label column from the features.
df_train, df_test = clean_data(df_train, df_test)
x, y = df_train.drop(columns='target'), df_train['target']
print(x.shape, y.shape)

['U12', 'U1', 'U8', 'U7', 'U10', 'U13', 'U6', 'U11', 'U3', 'U2', 'U4', 'U9', 'U5']
(5622, 70) (271, 70)
(5622, 56) (271, 56)
(5622, 55) (5622,)


In [5]:
# Seed shared by the validation split and the stochastic models so that the
# comparison below gives the same numbers on every run.
RANDOM_STATE = 0

# Candidate models to compare on a stratified validation split.
my_classifiers = {
    "LightGBM": LGBMClassifier(max_depth=5, n_estimators=270, num_leaves=20, min_data_in_leaf=80, learning_rate=0.069),
    # Random forests draw bootstrap samples, so the seed is pinned.
    "Random Forest": RandomForestClassifier(verbose=False, random_state=RANDOM_STATE),
}

# Numeric preprocessing: mean-impute missing values, then scale to [0, 10].
num_4_models = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler(feature_range=(0, 10))),
])

preprocessor = ColumnTransformer(transformers=[('num', num_4_models, x.columns.to_list())], remainder='drop')

# Every pipeline gets its own clone of the preprocessor. Sharing one instance
# would let a later fit of any pipeline overwrite the fitted state of the others.
my_classifiers_copy = {name: make_pipeline(clone(preprocessor), model) for name, model in my_classifiers.items()}


x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=RANDOM_STATE, test_size=0.2, stratify=y)

# Collect one record per model and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
records = []

for model_name, model in my_classifiers_copy.items():

    # The reported time covers fitting plus predicting on the validation split.
    start_time = time()
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    total_time = time() - start_time

    print(f'Finished {model_name} in {round(total_time, 2)} seconds')

    records.append({"Model": model_name,
                    "Accuracy": accuracy_score(y_val, predictions)*100,
                    "Bal Acc.": balanced_accuracy_score(y_val, predictions)*100,
                    "Time": total_time})

results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
print(results_ord)


Finished LightGBM in 1.24 seconds
Finished Random Forest in 1.49 seconds
           Model   Accuracy   Bal Acc.      Time
0       LightGBM  98.755556  98.759345  1.240697
1  Random Forest  97.422222  97.398394  1.490397


In [7]:
# Score the held-out user(s) with the pipelines fitted in the comparison cell.
yy = df_test['target']
xx = df_test.drop('target', axis=1)

for model_name, model in my_classifiers_copy.items():

    # Only prediction is timed here; the pipelines are already fitted.
    start_time = time()
    preds = model.predict(xx)
    total_time = time() - start_time

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # keeping the order right matters once an asymmetric metric is swapped in.
    test_accuracy = accuracy_score(yy, preds)*100
    print(f'Finished {model_name} in {round(total_time, 2)} seconds, with an accuracy of: {test_accuracy}')

# After the loop `preds` holds the predictions of the last model only:
# print(classification_report(yy, preds))

Finished LightGBM in 0.01 seconds, with an accuracy of: 87.4538745387454
Finished Random Forest in 0.04 seconds, with an accuracy of: 84.13284132841329


In [8]:
# Refit the LightGBM configuration from the comparison cell on the full
# training frame. clone() returns an unfitted copy with identical parameters,
# so the hyperparameters are not repeated here and the pipelines fitted for
# validation are left untouched (no shared preprocessor is refit in place).
best_model_pipe = clone(my_classifiers_copy["LightGBM"])
best_model = best_model_pipe.steps[-1][1]  # the pipeline's LGBMClassifier step

best_model_pipe.fit(x, y)
# Persisting the fitted pipeline is disabled; uncomment to write the file:
# joblib.dump(best_model_pipe, 'model_lgb.pkl')



['model_lgb.pkl']