In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sklearn_crfsuite

In [34]:
data = pd.read_csv('features_classification.csv', index_col='Unnamed: 0')

In [35]:
data = data.sort_values(by=['day_id'])

In [36]:
targets = data['target']

In [37]:
data = data.drop(['target', 'day', 'id', 'day_id', 'appCat.entertainment'], axis=1)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=0.2, random_state=42)

In [39]:
def create_lr(features, labels):
    lr_classifier = LogisticRegression(solver='saga')
    lr_classifier.fit(features, labels)

    return lr_classifier

In [40]:
def create_rf(features, labels):
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(features, labels)

    return rf_classifier

In [41]:
def create_svm(features, labels):
    svm_classifier = SVC()
    svm_classifier.fit(features, labels)

    return svm_classifier

In [42]:
clf_lr = create_lr(X_train, y_train)



In [43]:
clf_rf = create_rf(X_train, y_train)

In [44]:
clf_svm = create_svm(X_train, y_train)

In [45]:
y_pred_rf = clf_rf.predict(X_test)

In [46]:
y_pred_lr = clf_lr.predict(X_test)

In [47]:
y_pred_svm = clf_svm.predict(X_test)

In [48]:
def create_crf(full_data, full_targets, split):
    features = []
    targets = []
    for i in range(len(data) - 7):
        features_week = []
        targets_week = []
        for j in range(7):
            features_week.append(dict(full_data.iloc[i + j]))
            targets_week.append(str(full_targets.iloc[i + j]))
        features.append(features_week)
        targets.append(targets_week)
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=split, random_state=42)
    crf_classifier = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf_classifier.fit(features, targets)
    
    return crf_classifier, X_test, y_test

In [49]:
clf_crf, test_features, test_targets = create_crf(data, targets, 0.2)

In [50]:
y_pred_crf = clf_crf.predict(test_features)

In [51]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5
           6       0.40      0.12      0.18        50
           7       0.58      0.88      0.70       128
           8       0.56      0.32      0.41        59
           9       0.00      0.00      0.00         1

    accuracy                           0.56       244
   macro avg       0.26      0.22      0.22       244
weighted avg       0.52      0.56      0.50       244



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5
           6       0.54      0.38      0.45        50
           7       0.62      0.81      0.71       128
           8       0.62      0.44      0.51        59
           9       0.00      0.00      0.00         1

    accuracy                           0.61       244
   macro avg       0.30      0.27      0.28       244
weighted avg       0.59      0.61      0.59       244



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        50
           7       0.52      1.00      0.69       128
           8       0.00      0.00      0.00        59
           9       0.00      0.00      0.00         1

    accuracy                           0.52       244
   macro avg       0.09      0.17      0.11       244
weighted avg       0.28      0.52      0.36       244



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [54]:
y_pred_flat = [label for seq in y_pred_crf for label in seq]
y_test_flat = [label for seq in test_targets for label in seq]

print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.09      0.15        11
           5       1.00      0.04      0.07        27
           6       0.60      0.23      0.34       376
           7       0.59      0.88      0.71       857
           8       0.60      0.38      0.46       406
           9       1.00      0.19      0.32        16

    accuracy                           0.59      1694
   macro avg       0.61      0.26      0.29      1694
weighted avg       0.60      0.59      0.55      1694



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],            
    'max_depth': [None, 10, 20, 30, 50],              
    'min_samples_split': [2, 5, 10],                  
    'min_samples_leaf': [1, 2, 4],                    
    'max_features': [None, 'sqrt', 'log2'],         
    'bootstrap': [True, False],                      
    'criterion': ['gini', 'entropy', 'log_loss']      
}

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50, 
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='f1'
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [65]:
y_pred_rf_hyp = random_search.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred_rf_hyp))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         5
           6       0.45      0.36      0.40        50
           7       0.63      0.79      0.70       128
           8       0.58      0.42      0.49        59
           9       0.00      0.00      0.00         1

    accuracy                           0.59       244
   macro avg       0.28      0.26      0.26       244
weighted avg       0.56      0.59      0.57       244



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

all_features = data.columns.tolist()

new_best = 0.1
best = 0

new_results = {0: all_features, 0.1: all_features}
results = {}

while new_best > best:
    all_features = new_results[new_best]
    results = new_results
    new_results = {}
    best = new_best
    for feature_combo in tqdm(combinations(all_features, len(all_features)-1)):
        selected_features = list(feature_combo)
        X_subset = data[selected_features]
    
        model = RandomForestClassifier(random_state=42)
        scores = cross_val_score(model, X_subset, targets, cv=5, scoring='accuracy')
        mean_score = scores.mean()
    
        new_results[mean_score] = selected_features
        new_best = max(new_results.keys())

23it [00:51,  2.25s/it]
22it [00:51,  2.34s/it]


In [31]:
print(best)
print(results[best]) #only need to drop entertainment

0.5188828172434731
['mood', 'activity', 'circumplex.valence', 'circumplex.arousal', 'appCat.other', 'appCat.finance', 'appCat.social', 'appCat.travel', 'appCat.game', 'appCat.builtin', 'appCat.communication', 'call', 'appCat.unknown', 'appCat.office', 'appCat.weather', 'sms', 'screen', 'appCat.utilities', 'wake_up', 'sleep', 'week_day', 'week_mood']
