#### Carpineti C., Lomonaco V., Bedogni L., Di Felice M., Bononi L., "Custom Dual Transportation Mode Detection by Smartphone Devices Exploiting Sensor Diversity", in Proceedings of the 14th Workshop on Context and Activity Modeling and Recognition (IEEE COMOREA 2018), Athens, Greece, March 19-23, 2018
#### https://tempesta.cs.unibo.it/projects/us-tm2017/index.html

### 1. All imports in one place

In [2]:
import pandas     as pd
import matplotlib.pyplot as plt

import joblib
import optuna

from sklearn.compose         import ColumnTransformer
from sklearn.impute          import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics         import accuracy_score, balanced_accuracy_score
from sklearn.pipeline        import Pipeline, make_pipeline
from lightgbm                import LGBMClassifier
from sklearn.ensemble        import RandomForestClassifier

from time import time

### 2. Data Analysis and Exploration

In [5]:
# Load the 5-second-window dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved).
data = pd.read_csv('dataset_5secondWindow.csv').drop(columns=['Unnamed: 0'])

In [9]:
# data.describe()
# data.head(15)
# data.tail(15)
# data.isnull().sum()
# data.groupby('target').size()
# data.groupby('user').size().sort_values()
# data.shape

In [None]:
# data.corr()['target'].sort_values().plot.barh(figsize=(45,35))

In [None]:
# data = data[data['target'] != 'Bus']
# data = data[data['target'] != 'Train']

#### First split for training and test sets

In [None]:
def split_data(data, train_users=None, test_users=None):
    """Split the dataset into train and test partitions by user ID.

    Rows are assigned according to the value in the ``user`` column, so all
    rows of one user end up in a single partition.  Rows whose user appears
    in neither list are left out of both partitions.

    Args:
        data: DataFrame containing a ``user`` column.
        train_users: Iterable of user IDs for the training partition.
            Defaults to the fixed split used by this notebook.
        test_users: Iterable of user IDs for the held-out test partition.
            Defaults to the fixed split used by this notebook.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames.

    Raises:
        ValueError: If a user ID is listed in both partitions.
    """
    # Alternative that was tried before: sort data.user.unique() and hold out
    # the last three users.  The explicit lists below are the defaults.
    if train_users is None:
        train_users = ['U1', 'U3', 'U4', 'U5', 'U6', 'U7', 'U8', 'U10', 'U11', 'U13']
    if test_users is None:
        test_users = ['U2', 'U9', 'U12']

    # Materialise once so generators can be both checked and used for filtering.
    train_users = list(train_users)
    test_users = list(test_users)

    # A user in both partitions would leak test rows into training.
    overlap = set(train_users) & set(test_users)
    if overlap:
        raise ValueError(f'users assigned to both train and test: {sorted(overlap)}')

    train_data = data[data['user'].isin(train_users)]
    test_data = data[data['user'].isin(test_users)]

    return train_data, test_data

# Apply the user-based split to the full dataset.
df_train, df_test = split_data(data)
# Bare expression: shown as the notebook cell output, (rows, columns) per partition.
df_train.shape, df_test.shape

# df_train.to_csv('dataset_5secondWindow_TRAIN.csv', index=False)
# df_test.to_csv('dataset_5secondWindow_TEST.csv', index=False)

In [None]:
# df_train.groupby('user').size(), df_test.groupby('user').size()

#### Split test data for individual users

In [None]:
# def split_data(data):

#     user1 = ['U9']
#     user2 = ['U2']
#     user3 = ['U12']

#     test_data1 = data[data['user'].isin(user1)]
#     test_data2 = data[data['user'].isin(user2)]
#     test_data3 = data[data['user'].isin(user3)]

#     return test_data1, test_data2, test_data3

# user1, user2, user3 = split_data(df)
# user1.shape, user2.shape, user3.shape

# user1.to_csv('test_user1.csv', index=False)
# user2.to_csv('test_user2.csv', index=False)
# user3.to_csv('test_user3.csv', index=False)

### 3. Feature Engineering

#### Drop irrelevant features. 
#### Drop features with mostly missing values.

In [None]:
# Columns removed up front as irrelevant for the model.
_IRRELEVANT_COLUMNS = ['activityrecognition#0', 'time', 'id', 'user']
# A column is dropped when at least this fraction of its values is missing.
_MAX_NULL_FRACTION = 0.75


def _clean_columns(frame):
    """Drop the irrelevant columns and make the remaining names more readable."""
    frame = frame.drop(_IRRELEVANT_COLUMNS, axis=1)
    # regex=False: both patterns are literal text.  Without it the result
    # depends on the pandas version (the default flipped from regex to literal
    # in pandas 2.0, and '.' is a wildcard when treated as a regex).
    frame.columns = (frame.columns
                     .str.replace('android.sensor.', '', regex=False)
                     .str.replace('#', '_', regex=False))
    return frame


def _mostly_null_columns(frame):
    """Return the labels of columns whose missing fraction reaches the threshold."""
    null_fraction = (frame.isnull().sum() / len(frame)).sort_values(ascending=False)
    return null_fraction.index[null_fraction >= _MAX_NULL_FRACTION]


df_train = _clean_columns(df_train)
df_train_null = _mostly_null_columns(df_train)

df_test = _clean_columns(df_test)
df_test_null = _mostly_null_columns(df_test)

# Drop the union from both frames so train and test keep identical columns.
# Sorted to make the order of `all_null` deterministic between runs.
# NOTE(review): the test partition's missing-value rates influence which
# features are kept -- confirm this is acceptable for the evaluation protocol.
all_null = sorted(set(df_train_null) | set(df_test_null))

df_train = df_train.drop(all_null, axis=1)
df_test = df_test.drop(all_null, axis=1)

# Bare expression: shown as the notebook cell output.
df_train.shape, df_test.shape

#### Dropped Features

In [None]:
# all_null
# data.head(1)

In [None]:
# data.columns = data.columns.str.replace('android.sensor.', '').str.replace('#', '_')

# droped_indices = [data.columns.get_loc(a) for a in all_null]
# droped_indices.sort()

# droped_indices_new = droped_indices + [0, 2, 68, 69] # 0 id, 2 activityrecognition#0, 68 target, 69 user
# droped_indices_new.sort()

# droped_indices_new

#### Features sets to test

In [None]:
#### 1. Sensors of first class of classification
# cols1 = [c for c in df_train.columns if 'accelerometer' in c or 'sound' in c or 'gyroscope' in c]
# df_train1 = df_train[cols1]
# df1.columns

#### 2. Sensors of second class of classification
# cols2 = [c for c in df_train.columns if 'accelerometer' in c or 'sound' in c or 'gyroscope' in c or 'orientation' in c or 'linear_acceleration' in c or 'rotation_vector' in c]
# df_train2 = df_train[cols2]
# df2.columns

#### 3. Sensors of third class of classification
# cols3 = [c for c in df_train.columns if 'accelerometer' in c or 'sound' in c or 'gyroscope' in c or 'orientation' in c or 'linear_acceleration' in c or 'rotation_vector' in c or 'speed' in c]
# df_train3 = df_train[cols3]
# x3.columns

# x1, x2, x3 = df_train1, df_train2, df_train3
# features_list = {"Feature_Set_1": x1,
#                  "Feature_Set_2": x2,
#                  "Feature_Set_3": x3
# }

In [None]:
# Separate the training frame into the feature matrix and the label column.
x = df_train.drop(columns=['target'])
y = df_train['target']

### 4. Select Machine Learning Algorithms

#### Create a pipeline to include the imputer that handles missing values

In [None]:
# Preprocessing: median-impute every column of `x`; any other column is dropped.
num_4_models = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

preprocessor = ColumnTransformer(
    transformers=[('num', num_4_models, x.columns.to_list())],
    remainder='drop',
)

# Candidate models with their hand-picked hyper-parameters.
my_classifiers = {
    "LightGBM": LGBMClassifier(
        boosting_type='gbdt',
        max_depth=25,
        n_estimators=129,
        learning_rate=0.01,
        min_data_in_leaf=104,
    ),
    "RandomForest": RandomForestClassifier(
        criterion='entropy',
        max_depth=10,
        max_features=10,
        min_samples_split=3,
        n_estimators=300,
    ),
}

# Wrap each model behind the preprocessor.  The very same `preprocessor`
# object is handed to every pipeline (it is not copied).
my_classifiers_copy = {}
for name, model in my_classifiers.items():
    my_classifiers_copy[name] = make_pipeline(preprocessor, model)

#### Train selected models and calculate the accuracy

In [None]:
# for f_names, x in features_list.items():
# Hold out 20% of the training rows for validation, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=0, test_size=0.2, stratify=y)

# Collect one dict per model and build the DataFrame once afterwards:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
result_rows = []

for model_name, model in my_classifiers_copy.items():

    start_time = time()

    model.fit(x_train, y_train)
    predictions = model.predict(x_val)

    # Measured time covers fitting plus predicting on the validation split.
    total_time = time() - start_time

    print(f'Finished {model_name} in {round(total_time, 2)} seconds')

    result_rows.append({"Model": model_name,
                        "Accuracy": accuracy_score(y_val, predictions)*100,
                        "Bal Acc.": balanced_accuracy_score(y_val, predictions)*100,
                        "Time": total_time})

# Explicit columns keep the frame's layout even if no model was evaluated.
results = pd.DataFrame(result_rows, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

# Best accuracy first.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
print(results_ord)

#### Test selected models

In [None]:
# Held-out test partition: feature matrix and ground-truth labels.
xx = df_test.drop(columns=['target'])
yy = df_test['target']

In [None]:
# Score every pipeline on the held-out test partition.  This loop only
# predicts; it expects the pipelines to have been fitted already.
for model_name, model in my_classifiers_copy.items():

    start_time = time()

    preds = model.predict(xx)

    # Prediction time only.
    total_time = time() - start_time

    # Arguments in (y_true, y_pred) order, matching the validation loop.
    # Accuracy is symmetric, so the value is the same as before, but the
    # order matters as soon as an asymmetric metric is swapped in.
    print(f'Finished {model_name} in {round(total_time, 2)} seconds, with an accuracy of: {accuracy_score(yy, preds)*100}')

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError.  Presumably a
# deliberate "stop here" guard so that running all cells halts before the
# hyper-parameter tuning section below -- confirm before removing.
break

### 5. Hyper-Parameter Tuning

In [None]:
# Exhaustive grid over the random-forest hyper-parameters
# (2 * 5 * 5 * 5 * 5 = 1250 combinations, each cross-validated 10 times).
rfc_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [1, 3, 5, 7, 10],
    "max_features": [1, 3, 5, 7, 10],
    "n_estimators": [50, 100, 200, 300, 500],
    # An integer min_samples_split must be >= 2 in scikit-learn; the former
    # value 1 made every fit using it fail, wasting a fifth of the grid.
    "min_samples_split": [2, 3, 5, 7, 10],
}
rfc = RandomForestClassifier()

rfc_grid = GridSearchCV(rfc, rfc_params, cv=10, n_jobs=-1, verbose=2)

# NOTE(review): relies on `preprocessor` having been fitted by an earlier
# cell; the imputer is therefore not re-fitted inside each CV fold -- confirm
# this is acceptable.
rfc_grid.fit(preprocessor.transform(x_train), y_train)
print("Best params: " + str(rfc_grid.best_params_))

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError.  Presumably a
# deliberate "stop here" guard so that running all cells halts before the
# cell below -- confirm before removing.
break

In [None]:
def objective(trial):
    """Optuna objective: mean 2-fold cross-validation score of an LGBMClassifier.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the scores returned by ``cross_val_score`` on ``(x, y)``.
    """
    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "rf"])
    max_depth = trial.suggest_int('max_depth', 1, 32)
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 10, 500)

    extra_params = {}
    if boosting_type == "rf":
        # LightGBM's random-forest mode refuses to train unless bagging is
        # enabled (bagging fraction in (0, 1) and a positive bagging
        # frequency).  Without this every "rf" trial fails.
        extra_params["subsample"] = trial.suggest_float("subsample", 0.5, 0.9)
        extra_params["subsample_freq"] = 1

    lg_lgbm = LGBMClassifier(boosting_type=boosting_type, max_depth=max_depth,
                             n_estimators=n_estimators, learning_rate=learning_rate,
                             min_data_in_leaf=min_data_in_leaf, **extra_params)

    # NOTE(review): the classifier is scored on `x` directly, without the
    # imputing preprocessor that the saved pipeline uses -- confirm intended.
    score = cross_val_score(lg_lgbm, x, y, n_jobs=-1, cv=2)
    accuracy = score.mean()
    return accuracy


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Report the best trial found by the study.
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

### 6. Save Best Performing Model

In [None]:
# Final model: LightGBM with the chosen hyper-parameters.
best_model = LGBMClassifier(
    boosting_type='gbdt',
    max_depth=25,
    n_estimators=129,
    learning_rate=0.01,
    min_data_in_leaf=104,
)

# Put the preprocessor in front of the classifier and fit on all of (x, y).
best_model_pipe = make_pipeline(preprocessor, best_model).fit(x, y)

# Persist the whole pipeline, so preprocessing is stored with the classifier.
joblib.dump(best_model_pipe, 'model_lgb78.pkl')