## The goal is to prepare a submission file

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

# Bookkeeping columns left over from exporting the CSVs; they are row counters,
# not measurements, so they are removed from BOTH frames to keep the two
# schemas consistent.
INDEX_ARTIFACT_COLUMNS = ["Unnamed: 0", "_temporary_index_column"]

# errors="ignore" tolerates a file in which one of the artifact columns is
# absent instead of aborting the cell with a KeyError.
df_train = pd.read_csv("/content/train_set.csv").drop(
    columns=INDEX_ARTIFACT_COLUMNS, errors="ignore"
)
df_test = pd.read_csv("/content/test_set.csv").drop(
    columns=INDEX_ARTIFACT_COLUMNS, errors="ignore"
)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# NOTE: read_csv on this large CSV may return a corrupted frame:
# 1. The last row may or may not contain NaNs! We will have to call read_csv multiple times, remove the NaNs, or write a function that tests the validity of the df...
# 2. The number of patient_ids may or may not reach 25705 patients; in some cases it is much smaller
print(df_train.dtypes)
df_train.tail()

patient_id         int64
timestamp         object
measurement_x    float64
measurement_y    float64
measurement_z    float64
label            float64
dtype: object


Unnamed: 0,patient_id,timestamp,measurement_x,measurement_y,measurement_z,label
697438,22414,2012-08-30 11:56:06.941000,0.604405,0.215392,0.038842,0.0
697439,22414,2012-08-30 11:56:06.991000,0.606304,0.220688,0.032672,0.0
697440,22414,2012-08-30 11:56:07.041000,0.608131,0.22601,0.033954,0.0
697441,22414,2012-08-30 11:56:07.091000,0.609884,0.231357,0.028358,0.0
697442,22414,2012-08-30 11:56:07.141000,0.611564,0.236727,0.024,


In [None]:
# Remove incomplete rows (e.g. a truncated last row), then renumber the index.
# Resetting matters: scale_features builds a new frame with a fresh 0..n-1
# index and attaches patient_id to it, so gaps left by dropna() in the middle
# of the frame would otherwise shift or blank out patient ids.
df_train = df_train.dropna().reset_index(drop=True)
df_train.tail()

Unnamed: 0,patient_id,timestamp,measurement_x,measurement_y,measurement_z,label
697437,22414,2012-08-30 11:56:06.891000,0.602433,0.210122,0.038738,0.0
697438,22414,2012-08-30 11:56:06.941000,0.604405,0.215392,0.038842,0.0
697439,22414,2012-08-30 11:56:06.991000,0.606304,0.220688,0.032672,0.0
697440,22414,2012-08-30 11:56:07.041000,0.608131,0.22601,0.033954,0.0
697441,22414,2012-08-30 11:56:07.091000,0.609884,0.231357,0.028358,0.0


In [None]:
# NOTE: the last row may contain NaNs!
# We will have to remove it or load the CSV again
print(df_test.dtypes)
df_test.tail()

Unnamed: 0                   int64
_temporary_index_column      int64
patient_id                   int64
timestamp                   object
measurement_x              float64
measurement_y              float64
measurement_z              float64
dtype: object


Unnamed: 0.1,Unnamed: 0,_temporary_index_column,patient_id,timestamp,measurement_x,measurement_y,measurement_z
199995,199995,999995,32055,2013-06-15 13:48:29.549790,0.563614,0.043217,0.043949
199996,199996,999996,32055,2013-06-15 13:48:29.599790,0.565715,0.04129,0.045241
199997,199997,999997,32055,2013-06-15 13:48:29.599790,0.567835,0.039383,0.042212
199998,199998,999998,32055,2013-06-15 13:48:29.699790,0.569973,0.037497,0.043228
199999,199999,999999,32055,2013-06-15 13:48:29.749790,0.572129,0.035632,0.044271


# Scaling to 0-1

In [None]:
# Min Max scaling
def scale_features(df, scaler=None, fit=True):
    """Min-max scale the three measurement columns and carry ``patient_id`` along.

    Args:
        df: DataFrame with ``measurement_x``, ``measurement_y``,
            ``measurement_z`` and ``patient_id`` columns.
        scaler: Optional scaler exposing ``fit_transform`` / ``transform``.
            When omitted, a fresh ``MinMaxScaler`` (scaling to 0-1) is created
            and fitted on ``df``, which is the original behaviour.
        fit: When True (default) the scaler is fitted on ``df`` before
            transforming. Pass ``fit=False`` with an already fitted ``scaler``
            to apply previously learned min/max values (for example the
            training-set ones) to another frame.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the scaled
        measurement columns followed by ``patient_id``.

    Raises:
        ValueError: If ``fit=False`` is requested without a ``scaler``.
    """
    features = ["measurement_x", "measurement_y", "measurement_z"]

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = MinMaxScaler()

    if fit:
        scaled_values = scaler.fit_transform(df[features])
    else:
        scaled_values = scaler.transform(df[features])

    df_scaled = pd.DataFrame(scaled_values, columns=features)

    # Assign by position, not by index label: df_scaled has a fresh RangeIndex,
    # so label-based alignment would misplace ids (or insert NaN) whenever
    # df's index is not exactly 0..n-1, e.g. after dropna() removed rows.
    df_scaled["patient_id"] = df["patient_id"].to_numpy()

    return df_scaled


In [None]:
FEATURE_COLUMNS = ["measurement_x", "measurement_y", "measurement_z"]

df_train_scaled = scale_features(df_train)
# Only the last expression of a cell is rendered, so show this preview explicitly.
display(df_train_scaled.head())

# The test measurements must be scaled with the TRAINING min/max. Fitting a
# separate scaler on the test set would map the same raw value to a different
# scaled value than the one the classifier was trained on.
train_scaler = MinMaxScaler().fit(df_train[FEATURE_COLUMNS])
df_test_scaled = pd.DataFrame(
    train_scaler.transform(df_test[FEATURE_COLUMNS]), columns=FEATURE_COLUMNS
)
# Positional assignment: df_test_scaled has a fresh 0..n-1 index.
df_test_scaled["patient_id"] = df_test["patient_id"].to_numpy()
df_test_scaled.head()

Unnamed: 0,measurement_x,measurement_y,measurement_z,patient_id
0,0.405127,0.455079,0.143013,25705
1,0.405057,0.456963,0.145083,25705
2,0.40497,0.458846,0.138726,25705
3,0.404866,0.460727,0.132912,25705
4,0.404744,0.462608,0.125686,25705


# Feature extraction

In [None]:
# Function receives a df and returns features extracted from the time-series measurements

def extract_features(df):
    """Summarise each patient's measurements into one row of statistics.

    For every ``patient_id`` the mean, max and min of ``measurement_x``,
    ``measurement_y`` and ``measurement_z`` are computed.

    Args:
        df: DataFrame with a ``patient_id`` column and the three
            measurement columns.

    Returns:
        DataFrame with one row per patient: ``patient_id`` followed by
        ``<measurement>_<stat>`` columns (e.g. ``measurement_x_mean``).
    """
    measurement_cols = ("measurement_x", "measurement_y", "measurement_z")
    stat_names = ["mean", "max", "min"]

    # Same list of statistics for every measurement column.
    per_patient = df.groupby("patient_id").agg(
        {column: list(stat_names) for column in measurement_cols}
    )

    # agg() yields (column, stat) tuples as column labels; join them into flat names.
    per_patient.columns = ["_".join(pair) for pair in per_patient.columns]

    # Turn the patient_id group key back into an ordinary column.
    return per_patient.reset_index()

In [None]:
# Build the per-patient feature tables: training set first, then test set.
df_train_scaled_features = extract_features(df=df_train_scaled)
df_test_scaled_features = extract_features(df=df_test_scaled)

# Preview of the test features (the only output this cell renders).
df_test_scaled_features.head(5)

Unnamed: 0,patient_id,measurement_x_mean,measurement_x_max,measurement_x_min,measurement_y_mean,measurement_y_max,measurement_y_min,measurement_z_mean,measurement_z_max,measurement_z_min
0,25705,0.40375,0.405127,0.401592,0.471803,0.493432,0.455079,0.090097,0.145083,0.013128
1,25706,0.404278,0.405232,0.401592,0.458305,0.493432,0.428724,0.116238,0.156549,0.013128
2,25707,0.404278,0.405232,0.401592,0.458305,0.493432,0.428724,0.116238,0.156549,0.013128
3,25708,0.404278,0.405232,0.401592,0.458305,0.493432,0.428724,0.116238,0.156549,0.013128
4,25709,0.404278,0.405232,0.401592,0.458305,0.493432,0.428724,0.116238,0.156549,0.013128


In [None]:
# Re-attach the labels to the per-patient training features.
# Aggregate only the label column: averaging the whole frame would also try to
# average the string `timestamp` column, which raises on current pandas.
patient_labels = df_train.groupby("patient_id")["label"].mean()

# Look each label up by patient_id. Direct assignment would align the
# patient_id-indexed labels against the 0..n-1 row index, which is only
# correct when the ids happen to be exactly 0..n-1.
df_train_scaled_features["label"] = df_train_scaled_features["patient_id"].map(patient_labels)

assert df_train_scaled_features["label"].notna().all(), "some patients have no label"
df_train_scaled_features.head()

Unnamed: 0,patient_id,measurement_x_mean,measurement_x_max,measurement_x_min,measurement_y_mean,measurement_y_max,measurement_y_min,measurement_z_mean,measurement_z_max,measurement_z_min,label
0,0,0.562517,0.575057,0.554517,0.370117,0.415782,0.323488,0.136671,0.174698,0.043764,1.0
1,1,0.562961,0.576261,0.554517,0.371688,0.418831,0.323488,0.137122,0.174698,0.043764,1.0
2,2,0.562961,0.576261,0.554517,0.371688,0.418831,0.323488,0.137122,0.174698,0.043764,1.0
3,3,0.562961,0.576261,0.554517,0.371688,0.418831,0.323488,0.137122,0.174698,0.043764,1.0
4,4,0.656316,0.672988,0.639554,0.465884,0.47037,0.449697,0.17035,0.188466,0.136082,1.0


# Train classifier with best parameters

In [None]:
# train RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

seed = 10

# patient_id is an identifier, not a measurement, so the model must not learn
# from it. It is dropped inside the pipeline (by column name), which lets any
# feature frame that still carries a patient_id column be passed to
# fit/predict as-is.
drop_patient_id = ColumnTransformer(
    [("drop_id", "drop", ["patient_id"])],
    remainder="passthrough",
)
clf = make_pipeline(drop_patient_id, RandomForestClassifier(random_state=seed))

# The last column holds the labels; everything else goes through the pipeline.
# X_train stays a DataFrame because the id column is selected by name.
X_train = df_train_scaled_features.drop(columns="label")
y_train = df_train_scaled_features["label"].values

clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)

# NOTE: this report is computed on the training data itself, so it measures
# how well the model fits that data, not how well it generalises.
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     15237
         1.0       1.00      1.00      1.00      5502
         2.0       1.00      1.00      1.00      1676

    accuracy                           1.00     22415
   macro avg       1.00      1.00      1.00     22415
weighted avg       1.00      1.00      1.00     22415



# Preparing test df

In [None]:
# Predict one label per row of the per-patient test feature table.
y_test_pred = clf.predict(df_test_scaled_features)
print(y_test_pred)



[1. 1. 1. ... 2. 0. 1.]




In [None]:
# See the distribution of the predicted labels.
# value_counts keeps the counts as integers; stacking labels and counts into a
# single array would coerce the counts to float and print them in scientific
# notation.
label_counts = pd.Series(y_test_pred, name="label").value_counts().sort_index()
print(label_counts)

df_y_test_pred = pd.DataFrame(y_test_pred, columns=['label'])
df_y_test_pred_label = df_y_test_pred

[[0.000e+00 2.964e+03]
 [1.000e+00 2.239e+03]
 [2.000e+00 1.148e+03]]


In [None]:
# Concatenate the predicted label and the patient_id for the submission file.

# Single-column frame of ids, in the same row order as the predictions.
test_patient_ids = df_test_scaled_features[['patient_id']]

# Side-by-side join on the shared row index: label first, then patient_id.
result_df_test = pd.concat(
    [df_y_test_pred_label, test_patient_ids],
    axis=1,
)
result_df_test.head()

Unnamed: 0,label,patient_id
0,1.0,25705
1,1.0,25706
2,1.0,25707
3,1.0,25708
4,1.0,25709


In [None]:
# Save result_df_test as the submission CSV, leaving out the DataFrame index.
submission_path = 'test_set_submission_file.csv'
result_df_test.to_csv(submission_path, index=False)

