## Requirements

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import ast
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [21]:
ls

 Volume in drive C has no label.
 Volume Serial Number is BA74-BB49

 Directory of C:\Users\gayathri.m\Desktop

16-04-2025  10:51    <DIR>          .
10-04-2025  10:24    <DIR>          ..
14-04-2025  11:45                52 .env
16-04-2025  10:27    <DIR>          .ipynb_checkpoints
13-02-2025  18:45            47,668 3D_VFX_Pipeline_Documentation.docx
25-03-2025  12:50    <DIR>          asset_browser
12-02-2025  17:38    <DIR>          asset_manager_sql_test
16-04-2025  10:51            65,205 Assignment.ipynb
16-04-2025  09:36           931,440 data.csv
16-04-2025  09:35            90,711 evaluation_ext.csv
16-04-2025  09:35            60,475 ML-Task (7).pdf
27-03-2025  10:59             2,368 MongoDBCompass.lnk
05-02-2025  10:52    <DIR>          ProductionPipeline
04-02-2025  10:48             2,109 Spark.lnk
13-02-2025  10:09    <DIR>          sql
10-02-2025  17:55    <DIR>          ss
14-04-2025  14:49             7,906 td.html
04-02-2025  10:15             1,415 Visual Studio C

In [22]:
def parse_array_string(s):
    """Convert one raw cell value into a 1-D-or-higher float NumPy array.

    Parameters
    ----------
    s : str, int, float or anything else
        * ``str`` - first tried as a Python literal (e.g. ``"[1, 2, 3]"``);
          if that fails, brackets are stripped and the remainder is read as
          whitespace-separated numbers (e.g. ``"[ -5. -32. -35.]"``).
        * numeric scalar (Python or NumPy) - wrapped in a one-element array.
        * anything else (``None``, lists, ...) - treated as unparseable.

    Returns
    -------
    numpy.ndarray
        Float array with at least one dimension. Unparseable input yields
        ``array([0.0])`` so callers always receive something they can
        aggregate.
    """
    if isinstance(s, str):
        try:
            literal = ast.literal_eval(s.replace('\n', ''))
            # atleast_1d: a bare scalar literal such as "5" would otherwise
            # become a 0-d array, which has no len().
            return np.atleast_1d(np.asarray(literal, dtype=float))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a numeric Python literal; fall through to the
            # whitespace-separated format.
            pass

        try:
            # Plain str.replace strips the brackets without needing any
            # extra module; split() also absorbs embedded newlines.
            tokens = s.replace('[', '').replace(']', '').split()
            return np.array([float(tok) for tok in tokens], dtype=float)
        except ValueError:
            return np.array([0.0])

    if isinstance(s, (int, float, np.integer, np.floating)):
        return np.array([float(s)])

    return np.array([0.0])

In [23]:
def extract_array_features(arr):
    """Summarise a numeric array with six descriptive statistics.

    Parameters
    ----------
    arr : array-like
        Numeric values; may be a list, a scalar, or an array of any rank.

    Returns
    -------
    list
        ``[mean, std, min, max, median, range]`` computed over all elements,
        where ``range`` is ``max - min`` (``np.ptp``). An input with no
        elements yields six zeros.
    """
    values = np.asarray(arr)

    # .size (not len) so that 0-d scalars are accepted and multi-dimensional
    # arrays with zero elements, e.g. shape (3, 0), are still treated as empty.
    if values.size == 0:
        return [0.0] * 6

    return [
        np.mean(values),
        np.std(values),
        np.min(values),
        np.max(values),
        np.median(values),
        np.ptp(values),
    ]


In [24]:
def process_array_columns(df, columns):
    """Expand array-valued columns into per-row summary statistics.

    For every name in ``columns`` that exists in ``df``, each cell is parsed
    with ``parse_array_string`` and reduced with ``extract_array_features``.
    The six resulting values are stored in columns named ``<col>_mean``,
    ``<col>_std``, ``<col>_min``, ``<col>_max``, ``<col>_median`` and
    ``<col>_range``. Names that are absent from ``df`` are ignored.

    Returns a new DataFrame indexed like ``df`` (empty if nothing matched).
    """
    stat_suffixes = ("mean", "std", "min", "max", "median", "range")
    feature_df = pd.DataFrame()

    for col in columns:
        if col not in df.columns:
            continue

        # One list of six statistics per row of the source column.
        per_row_stats = df[col].apply(parse_array_string).apply(extract_array_features)
        stat_columns = [f"{col}_{suffix}" for suffix in stat_suffixes]
        stats_frame = pd.DataFrame(per_row_stats.tolist(), index=df.index)
        feature_df[stat_columns] = stats_frame

    return feature_df

In [28]:
%%time
print("extracting useful information ...")
def extract_datetime_features(df):
    """Add calendar features derived from the ``cdate`` column.

    ``cdate`` values are converted to text and parsed with the fixed layout
    ``%Y%m%d%H%M`` (e.g. ``202304061110``). The parsed timestamp is stored in
    a new ``datetime`` column, and ``hour``, ``day``, ``month`` and
    ``dayofweek`` are derived from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cdate`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the five new columns.
    """
    # Parse the whole column in a single vectorised call rather than one
    # to_datetime call per row; this also yields a proper datetime column
    # (so .dt works) when the frame has no rows.
    df["datetime"] = pd.to_datetime(df["cdate"].astype(str), format='%Y%m%d%H%M')

    stamps = df["datetime"].dt
    df["hour"] = stamps.hour
    df['day'] = stamps.day
    df['month'] = stamps.month
    df['dayofweek'] = stamps.dayofweek

    return df

extracting useful information ...
CPU times: total: 0 ns
Wall time: 997 μs


## ALL Steps in Main Function

In [30]:
%%time
def main():
    """Train a sit/stand classifier on ``data.csv`` and label ``evaluation_ext.csv``.

    Steps, in order:

    1. Load both CSV files from the working directory (returns early if
       either is missing or the ``sit_stand`` target column is absent).
    2. Derive calendar features from ``cdate`` and summary statistics from
       every array-like text column.
    3. Add the remaining columns as numeric features.
    4. Hold out 20% of the training rows for validation, fit a baseline
       random forest, then grid-search a scaler + random-forest pipeline.
    5. Report validation metrics and the top feature importances.
    6. Predict on the evaluation rows and write
       ``evaluation_with_predictions.csv``.

    Progress and results are printed; nothing is returned.
    """
    print("Loading files...")
    try:
        train_data = pd.read_csv('data.csv')
        eval_data = pd.read_csv('evaluation_ext.csv')
        print("Train shape:", train_data.shape)
        print("Eval shape:", eval_data.shape)
    except FileNotFoundError:
        print("Files not found.")
        return

    print(train_data.head(2))
    print("Missing values in train:", train_data.isnull().sum().sum())

    # Check for the target before doing any feature work.
    if 'sit_stand' not in train_data.columns:
        print("Target missing.")
        return

    train_data = extract_datetime_features(train_data)
    eval_data = extract_datetime_features(eval_data)

    datetime_cols = ['hour', 'day', 'month', 'dayofweek']

    # Identifier, raw timestamp and target columns never become features.
    # "Unnamed: N" columns are the row counter pandas reads back from a CSV
    # that was saved with its index; using them as features would let the
    # model key on row position.
    non_feature_cols = ['deviceid', 'cdate', 'datetime', 'sit_stand']
    non_feature_cols += [c for c in train_data.columns if str(c).startswith('Unnamed:')]

    # A column is treated as array-like when its first value is text that
    # looks numeric ('[', '-' or '.' present).
    array_columns = []
    for col in train_data.columns:
        if col in non_feature_cols or col in datetime_cols:
            continue
        val = train_data[col].iloc[0]
        if isinstance(val, str) and ('[' in val or '-' in val or '.' in val):
            array_columns.append(col)

    print(f"Found {len(array_columns)} array-like columns: {array_columns[:5]}")

    print("Handling array columns...")
    train_features = process_array_columns(train_data, array_columns)
    eval_features = process_array_columns(eval_data, array_columns)

    for col in datetime_cols:
        train_features[col] = train_data[col]
        eval_features[col] = eval_data[col]

    already_handled = set(non_feature_cols) | set(array_columns) | set(datetime_cols)
    for col in train_data.columns:
        if col in already_handled:
            continue
        if col not in eval_data.columns:
            # A feature the evaluation file cannot supply is useless for
            # prediction, so leave it out of both frames.
            print("Skipping:", col)
            continue
        try:
            train_numeric = pd.to_numeric(train_data[col], errors='coerce')
            eval_numeric = pd.to_numeric(eval_data[col], errors='coerce')
        except (TypeError, ValueError):
            print("Skipping:", col)
            continue
        # Assign only after both conversions succeeded so the two frames
        # never end up with different columns.
        train_features[col] = train_numeric
        eval_features[col] = eval_numeric

    train_features = train_features.fillna(0)
    # Give the evaluation frame exactly the training columns, in the same
    # order; anything it lacks is filled with 0 like other missing values.
    eval_features = eval_features.reindex(columns=train_features.columns).fillna(0)

    print("Final train features shape:", train_features.shape)

    X = train_features
    y = train_data['sit_stand']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Train size:", X_train.shape[0])
    print("Val size:", X_val.shape[0])

    print("Fitting basic model...")
    basic_model = RandomForestClassifier(n_estimators=100, random_state=42)
    basic_model.fit(X_train, y_train)
    y_pred_basic = basic_model.predict(X_val)
    acc_basic = accuracy_score(y_val, y_pred_basic)
    print("Basic model accuracy:", round(acc_basic, 4))

    print("Running full pipeline...")
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10]
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

    # Best effort: if the search fails for any reason, fall back to the
    # baseline model that is already fitted.
    try:
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print("Best params:", grid_search.best_params_)
    except Exception as e:
        print("Grid search failed:", e)
        best_model = basic_model

    print("Evaluating...")
    y_pred = best_model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print("Validation accuracy:", round(acc, 4))

    print(classification_report(y_val, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_val, y_pred))

    # A bare forest exposes importances directly; a pipeline exposes them on
    # its 'classifier' step.
    if hasattr(best_model, 'feature_importances_'):
        importance = best_model.feature_importances_
    elif hasattr(best_model, 'named_steps') and hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        importance = best_model.named_steps['classifier'].feature_importances_
    else:
        importance = None

    if importance is not None:
        fi = pd.DataFrame({
            'Feature': X.columns,
            'Importance': importance
        }).sort_values('Importance', ascending=False)
        print("Top features:")
        print(fi.head(10))

    print("Predicting on eval data...")
    eval_preds = best_model.predict(eval_features)
    eval_data['sit_stand_predicted'] = eval_preds
    eval_data.to_csv('evaluation_with_predictions.csv', index=False)
    print("Predictions saved.")

    print("Eval prediction distribution:")
    print(pd.Series(eval_preds).value_counts(normalize=True))

    print("Done.")

# Run the whole workflow when this cell (or an exported script) is executed
# directly rather than imported.
if __name__ == "__main__":
    main()


Loading files...
Train shape: (300, 13)
Eval shape: (30, 12)
   Unnamed: 0   deviceid         cdate  \
0           0  S1I1A1123  202304061110   
1           1  S1I1A1124  202304061110   

                                                  ax  \
0  [ -5. -32. -35. -33. -34. -38. -37. -34. -38. ...   
1  [ -4. -28. -28. -28. -28. -33. -31. -30. -33. ...   

                                                  ay  \
0  [ -9. -54. -59. -61. -60. -59. -57. -61. -58. ...   
1  [ -9. -58. -62. -61. -61. -62. -62. -60. -60. ...   

                                                  az  \
0  [ 3. 17. 20. 19. 19. 20. 21. 19. 20. 19. 19. 2...   
1  [ 4. 25. 26. 28. 26. 20. 25. 27. 23. 17. 17. 1...   

                                                  gx  \
0  [-11620.  22050.  -5110.  -6720.  14000.   105...   
1  [  8610.  -5460.   8540. -12250.  -5250.  1344...   

                                                  gy  \
0  [ -7420.  10640.    350.   3360.   1960.  -568...   
1  [-14140.  -9800.    1