In [745]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option("display.max_rows", 999)
import numpy as np
from IPython.display import display

def read(fp):
    """Load the CSV found at ``fp`` and hand back the resulting DataFrame."""
    return pd.read_csv(fp)

# Raw tables: one row per person, and one row per activity (train / test).
people = read(fp="data/people.csv")
activities_train = read(fp="data/act_train.csv")
activities_test = read(fp="data/act_test.csv")
print("Successfully loaded datasets!")

Successfully loaded datasets!


### Explore People

The first nine characteristics of each person (`char_1`–`char_9`) are nominal features that need to be encoded into numerical format for further analysis. The remaining characteristics (`char_10`–`char_37`) are booleans, which can be represented as either 0 or 1. Note that **`char_38`** is the only integer-valued characteristic; it needs to be normalized so that it does not carry more weight than the other columns.

In [2]:
# Shape of the people table
print("People df has {} rows and {} columns".format(*people.shape))

# Show the first five rows. display() renders the frame itself and returns
# None, so it must not be wrapped in print() (that emitted a stray "None").
display(people.head())

People df has 189118 rows and 41 columns


Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,type 2,type 4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,type 2,type 2,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,type 2,type 2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,84


None


#### Groups

Group 17304 is by far the largest, with 77,314 people; the next-largest group has only 1,538.

In [4]:
# Ten most common groups. print() is called as a function with a single
# pre-formatted argument so the cell runs unchanged on Python 2 and Python 3.
print("# of people in each group:\n{}".format(people["group_1"].value_counts()[:10]))

# of people in each group:
group 17304    77314
group 667       1538
group 8386      1046
group 9280       666
group 450        659
group 1482       484
group 15723      461
group 3229       423
group 17899      414
group 3598       365
Name: group_1, dtype: int64


### Explore Activities

There are nine characteristics associated with activities that are type 1.  Other activity categories do not contain these characteristics, but will contain char_10 information; type 1 does not contain any char_10 info.

In [496]:
print("Activities train df has {} rows and {} columns".format(*activities_train.shape))

# A look into type 1 activities. Keep the slice itself in `type_one_activity`:
# display() returns None, so assigning its result stored nothing useful.
type_one_activity = activities_train[activities_train["activity_category"] == "type 1"][:5]
display(type_one_activity)
print("Activities test df has {} rows and {} columns".format(*activities_test.shape))

Activities train df has 2197291 rows and 15 columns


Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
52,ppl_100025,act1_9923,2022-11-25,type 1,type 3,type 5,type 1,type 1,type 6,type 3,type 3,type 6,type 8,,0
105,ppl_100033,act1_198174,2022-07-26,type 1,type 36,type 11,type 5,type 1,type 6,type 1,type 1,type 4,type 1,,0
106,ppl_100033,act1_214090,2023-06-15,type 1,type 24,type 6,type 6,type 3,type 1,type 3,type 4,type 5,type 1,,0
107,ppl_100033,act1_230588,2023-02-28,type 1,type 2,type 2,type 3,type 3,type 5,type 2,type 2,type 4,type 2,,0
108,ppl_100033,act1_271874,2022-07-26,type 1,type 2,type 5,type 3,type 2,type 6,type 1,type 1,type 6,type 8,,0


Activities test df has 498687 rows and 14 columns


### Process dates

In [746]:
def col_to_datetime(df, column):
    """Return a copy of ``df`` with ``column`` parsed by ``pd.to_datetime``.

    The column keeps its name and position; only its values are replaced by
    the parsed datetimes. The input frame is left untouched.
    """
    converted = df.copy()
    parsed = pd.to_datetime(converted[column])
    converted[column] = parsed
    return converted

In [747]:
def process_date(df):
    """Replace the datetime ``date`` column with ``year``/``month``/``day``.

    ``df.date`` must already hold datetimes, because the ``.dt`` accessor is
    used. A new frame is returned; the input is not modified.
    """
    stamps = df.date.dt
    expanded = df.assign(year=stamps.year, month=stamps.month, day=stamps.day)
    return expanded.drop('date', axis=1)

### Normalize columns

In [748]:
def normalize(df, column):
    """Min-max scale ``column`` so its values lie in [0, 1].

    Parameters
    ----------
    df : DataFrame holding the column; it is not modified.
    column : label of a single numeric column.

    Returns
    -------
    A copy of ``df`` in which ``column`` is ``(x - min) / (max - min)``.
    A column whose values are all equal becomes ``0.0`` (missing values stay
    missing) rather than the ``NaN`` a 0/0 division would give.
    """
    # The original called `df.copy()` without keeping the result, so the
    # caller's frame was rewritten in place.
    df = df.copy()
    values = df[column]
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Constant column: avoid dividing by zero.
        df[column] = (values - low).astype(float)
    else:
        df[column] = (values - low) / span
    return df

### Missing values

In [749]:
def drop(df, columns):
    """Return a new frame without the given column label(s).

    ``columns`` may be a single label or a list of labels; the input frame
    is not modified.
    """
    return df.drop(labels=columns, axis=1)

### Categorical Variables

In [750]:
def process_cat_variables(df, columns, substring, prefix):
    """Turn prefixed string categories (e.g. ``"type 5"``) into integers.

    Every string-typed column whose name contains ``substring`` has the
    literal ``prefix`` removed from each value and is cast to ``int``.
    Missing values are first filled with ``prefix + "-999"``, so they come
    out as the sentinel ``-999``.

    Parameters
    ----------
    df : DataFrame to convert; it is not modified.
    columns : unused; kept so existing ``.pipe(...)`` call sites keep working.
    substring : text a column name must contain to be selected.
    prefix : literal leading text to remove from each value.

    Returns
    -------
    A copy of ``df`` with the selected columns converted to integers.

    Raises
    ------
    ValueError
        If a value is not an integer once the prefix has been removed.
    """
    df = df.copy()
    # Walk columns/dtypes directly: DataFrame.iteritems() no longer exists in
    # pandas 2.x. Strings may be stored as object or as a dedicated string dtype.
    categorical_columns = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if (dtype == object or pd.api.types.is_string_dtype(dtype)) and substring in col
    ]

    # Build the sentinel from the prefix so it still parses for prefixes
    # other than 'type ' (a hard-coded 'type -999' broke e.g. 'group ').
    missing_marker = prefix + '-999'

    for col in categorical_columns:
        values = df[col].fillna(missing_marker)
        # str.lstrip(prefix) strips a *set of characters*, so it could also
        # eat leading digits that occur in the prefix. Slice off the exact
        # prefix instead, and only where it is actually present.
        has_prefix = values.str.startswith(prefix)
        df[col] = values.where(~has_prefix, values.str[len(prefix):]).astype(int)
    return df

### Booleans

In [751]:
def process_bool_columns(df, columns, substring):
    """Cast boolean columns whose name contains ``substring`` to 0/1 integers.

    Parameters
    ----------
    df : DataFrame to convert; it is not modified.
    columns : unused; kept so existing ``.pipe(...)`` call sites keep working.
    substring : text a column name must contain to be selected.

    Returns
    -------
    A copy of ``df`` with the selected boolean columns as integers.
    """
    df = df.copy()
    # Walk columns/dtypes directly: DataFrame.iteritems() no longer exists in
    # pandas 2.x.
    bool_columns = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if dtype == bool and substring in col
    ]
    for col in bool_columns:
        df[col] = df[col].astype(int)
    return df

### Combine Levels

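Group the levels of each encoded column into nine equal-width bins with `pd.cut`. Note that the bin edges depend on the range of the encoded values, not on how frequently each level occurs.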

In [752]:
def bin_other_category(df, col):
    """Return a copy of ``df`` in which ``col`` is replaced by bin indices.

    The column is cut into 9 equal-width intervals with ``pd.cut``;
    ``labels=False`` makes each value the integer index of its interval.
    """
    binned = df.copy()
    bin_index = pd.cut(binned[col], bins=9, labels=False)
    binned[col] = bin_index
    return binned

def bin_char_category(df, columns, substring):
    """Bin every integer column whose name contains ``substring``.

    Each selected column is cut into 9 equal-width intervals with ``pd.cut``
    and replaced by the integer index of the interval each value falls in.

    Parameters
    ----------
    df : DataFrame to convert; it is not modified.
    columns : unused; kept so existing ``.pipe(...)`` call sites keep working.
    substring : text a column name must contain to be selected.

    Returns
    -------
    A copy of ``df`` with the selected columns replaced by bin indices.
    Float and boolean columns are left as they are.
    """
    df = df.copy()
    # DataFrame.iteritems() no longer exists in pandas 2.x, and `dtype == int`
    # only matches the platform's default integer width, so test for "any
    # integer dtype" explicitly.
    bin_columns = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if pd.api.types.is_integer_dtype(dtype) and substring in col
    ]
    for col in bin_columns:
        df[col] = pd.cut(df[col], bins=9, labels=False)
    return df

### One-hot Encoding

In [753]:
def one_hot_encode(df, columns, substring):
    """One-hot encode the integer columns whose name contains ``substring``.

    Each selected column is replaced by one indicator column per distinct
    value, named ``<column>_<value>`` by ``pd.get_dummies``.

    Parameters
    ----------
    df : DataFrame to convert; it is not modified.
    columns : unused; kept for signature parity with the other helpers.
    substring : text a column name must contain to be selected.

    Returns
    -------
    A new DataFrame with the selected columns expanded into indicators.
    If no column is selected, an unchanged copy of ``df`` is returned.
    """
    df = df.copy()
    cat_columns = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if pd.api.types.is_integer_dtype(dtype) and substring in col
    ]
    if not cat_columns:
        return df
    # The original computed the dummies and discarded them, returning df
    # unchanged. Integer columns are only encoded when named via `columns=`.
    return pd.get_dummies(df, columns=cat_columns)

### Method Chaining to clean people and activities data

In [754]:
def process_people(df):
    """Clean the raw people table and return the processed copy.

    Steps, in order: parse ``date`` and split it into year/month/day,
    normalize ``char_38``, convert the ``type N`` / ``group N`` strings to
    integers, bin the integer ``char`` columns, and cast the boolean
    ``char`` columns to integers.
    """
    raw = df.copy()
    # Column index of the untouched input; only forwarded to the helpers
    # that accept a `columns` argument.
    raw_columns = raw.columns
    return (
        raw
        .pipe(col_to_datetime, "date")
        .pipe(process_date)
        .pipe(normalize, "char_38")
        .pipe(process_cat_variables, raw_columns, "char", 'type ')
        .pipe(process_cat_variables, raw_columns, "group", "group ")
        .pipe(bin_char_category, raw_columns, "char")
        .pipe(process_bool_columns, raw_columns, "char")
    )

processed_people = process_people(people)
processed_people.head()

Unnamed: 0,people_id,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,day,month,year
0,ppl_100,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
1,ppl_100002,8,8688,8,5,2,4,2,3,1,3,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0.76,6,1,2021
2,ppl_100003,8,33592,8,0,2,4,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,0.99,10,6,2022
3,ppl_100004,8,22593,8,8,8,8,4,5,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0.76,20,7,2022
4,ppl_100006,8,6534,8,8,8,8,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0.84,27,7,2022


In [755]:
def process_activities(df):
    """Clean a raw activities table and return the processed copy.

    Steps, in order: parse ``date`` and split it into year/month/day,
    convert the ``type N`` strings of the ``char`` columns and of
    ``activity_category`` to integers (missing values are filled first),
    bin ``activity_category`` and the integer ``char`` columns, and cast
    any boolean ``char`` columns to integers.
    """
    raw = df.copy()
    # Column index of the untouched input; only forwarded to the helpers
    # that accept a `columns` argument.
    raw_columns = raw.columns
    return (
        raw
        .pipe(col_to_datetime, "date")
        .pipe(process_date)
        .pipe(process_cat_variables, raw_columns, "char", 'type ')
        .pipe(process_cat_variables, raw_columns, "activity_cat", 'type ')
        .pipe(bin_other_category, "activity_category")
        .pipe(bin_char_category, raw_columns, "char")
        .pipe(process_bool_columns, raw_columns, "char")
    )

# Preprocess both activity tables the same way
processed_activities_train = process_activities(activities_train)
processed_activities_test = process_activities(activities_test)
processed_activities_train.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome,day,month,year
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,0,0,0,0,26,8,2023
1,ppl_100,act2_2434093,1,0,0,0,0,0,0,0,0,0,0,0,27,9,2022
2,ppl_100,act2_3404049,1,0,0,0,0,0,0,0,0,0,0,0,27,9,2022
3,ppl_100,act2_3651215,1,0,0,0,0,0,0,0,0,0,0,0,4,8,2023
4,ppl_100,act2_4109017,1,0,0,0,0,0,0,0,0,0,0,0,26,8,2023


### Join dataframes

Add data from the people dataframe to activity dataframes 

In [756]:
# Left-join so every activity row is kept; columns present in both tables
# get the '_activities' / '_people' suffixes.
training_data = processed_activities_train.merge(
    processed_people, on='people_id', how='left', suffixes=('_activities', '_people'))
test_data = processed_activities_test.merge(
    processed_people, on='people_id', how='left', suffixes=('_activities', '_people'))

### Extract class label

In [757]:
# class label
y_train = training_data["outcome"]

cols = ["people_id", "activity_id"]
training_data = training_data.pipe(drop, cols)
test_data = test_data.pipe(drop, cols)
training_data.head()

Unnamed: 0,activity_category,char_1_activities,char_2_activities,char_3_activities,char_4_activities,char_5_activities,char_6_activities,char_7_activities,char_8_activities,char_9_activities,char_10_activities,outcome,day_activities,month_activities,year_activities,char_1_people,group_1,char_2_people,char_3_people,char_4_people,char_5_people,char_6_people,char_7_people,char_8_people,char_9_people,char_10_people,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,day_people,month_people,year_people
0,4,0,0,0,0,0,0,0,0,0,0,0,26,8,2023,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
1,1,0,0,0,0,0,0,0,0,0,0,0,27,9,2022,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
2,1,0,0,0,0,0,0,0,0,0,0,0,27,9,2022,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
3,1,0,0,0,0,0,0,0,0,0,0,0,4,8,2023,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021
4,1,0,0,0,0,0,0,0,0,0,0,0,26,8,2023,8,17304,4,0,1,4,2,3,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,29,6,2021


### Partition Training and Testing Sets

In [758]:
date_cols = ["day_people", "month_people", "year_people",
             "day_activities", "month_activities", "year_activities"]
# Drop unwanted columns: the date parts from both frames, plus the label
# from the training frame only.
X_train = drop(training_data, ["outcome"] + date_cols)
X_test = drop(test_data, date_cols)

In [759]:
# create validation set
def split(X, y, train_fraction=.8):
    """Split features and labels into a leading and a trailing part.

    Rows are not shuffled: the first ``train_fraction`` of the rows (by
    position) form the training part and the remaining rows the validation
    part.

    Parameters
    ----------
    X : DataFrame of features.
    y : labels, one per row of ``X`` (Series, array or list).
    train_fraction : share of rows, between 0 and 1, placed in the training
        part. Defaults to the previously hard-coded 0.8.

    Returns
    -------
    ``(X_train_mini, X_val, y_train_mini, y_val)``

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``train_fraction`` is not
        within [0, 1].
    """
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    mini_train_size = int(X.shape[0] * train_fraction)
    # Slice pandas labels by position, like X; plain sequences are sliced directly.
    labels = y.iloc if hasattr(y, "iloc") else y
    X_train_mini, X_val = X.iloc[:mini_train_size], X.iloc[mini_train_size:]
    y_train_mini, y_val = labels[:mini_train_size], labels[mini_train_size:]
    return X_train_mini, X_val, y_train_mini, y_val

# Hold out part of the labelled rows as a validation set
X_train_mini, X_val, y_train_mini, y_val = split(X=X_train, y=y_train)

### Pipeline transforms with estimator, then get cross val scores

In [761]:
from sklearn.preprocessing import StandardScaler
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.cross_validation import cross_val_score

def cross_val(X, y, estimator=None, cv=5):
    """Print cross-validated scores of an estimator on ``(X, y)``.

    Parameters
    ----------
    X : feature matrix.
    y : labels.
    estimator : estimator or pipeline to evaluate. When omitted, the
        notebook-level ``pipe_lr`` is used, as before.
    cv : number of folds passed to ``cross_val_score`` (default 5).

    Prints the per-fold scores and their mean and standard deviation.
    """
    if estimator is None:
        # Backward-compatible default: the pipeline defined at notebook level.
        estimator = pipe_lr
    scores = cross_val_score(estimator = estimator,
                             X = X,
                             y = y,
                             cv = cv,
                             n_jobs = 1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [762]:
# Scale -> LDA projection -> logistic regression.
# LDA can produce at most n_classes - 1 discriminant components. The f1 and
# AUC calls below treat `outcome` as binary, so only one component exists;
# asking for 2 is ignored or rejected depending on the scikit-learn version.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('lda', LDA(n_components=1)),
                    ('clf', LogisticRegression(random_state=1))])

cross_val(X_train_mini, y_train_mini)

CV accuracy scores: [ 0.83780616  0.8061991   0.81394105  0.84232833  0.85104931]
CV accuracy: 0.830 +/- 0.017


### Metrics

In [763]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def predict(clf, X_train, y_train, X_test):
    """Fit ``clf`` on the training data and predict both data sets.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : data the estimator is fitted on.
    X_test : data to predict in addition to ``X_train``.

    Returns
    -------
    ``(y_pred_train, y_pred_test)`` — predictions for ``X_train`` and
    ``X_test``.
    """
    clf.fit(X_train, y_train)
    # Predict with the estimator that was just fitted; the original used the
    # global `pipe_lr` here regardless of which `clf` was passed in.
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    return y_pred_train, y_pred_test

def calc_metrics(clf, y_pred_train, y_pred_test, y_test, y_train=None):
    """Print F1 scores for both prediction sets and the AUC of the second.

    Parameters
    ----------
    clf : unused; kept so existing call sites keep working.
    y_pred_train : predictions for the training rows.
    y_pred_test : predictions for the evaluation rows.
    y_test : true labels matching ``y_pred_test``.
    y_train : true labels matching ``y_pred_train``. When omitted, the
        notebook-level ``y_train_mini`` is used, as before.
    """
    if y_train is None:
        # Backward-compatible default: the original always read this global.
        y_train = y_train_mini

    # f1 scores. print() is called with one pre-formatted argument so the
    # output is identical on Python 2 and Python 3.
    print("f1 score on training set: {}".format(f1_score(y_train, y_pred_train)))
    print("f1 score on testing set: {}".format(f1_score(y_test, y_pred_test)))

    # area under curve
    # NOTE(review): this is computed from the hard class predictions; AUC is
    # usually computed from scores or probabilities — confirm this is intended.
    auc_score = roc_auc_score(y_test, y_pred_test)
    print("Area under the curve: {}".format(auc_score))

# validation predictions & metrics
y_pred_train, y_pred_test = predict(pipe_lr, X_train_mini, y_train_mini, X_val)
calc_metrics(pipe_lr, y_pred_train, y_pred_test, y_val, y_train=y_train_mini)

f1 score on training set: 0.822185277778
f1 score on testing set: 0.82495647153
('Area under the curve:', 0.82628490354521578)


In [767]:
# predictions for test set
pipe_lr.fit(X_train, y_train)
y_pred_train = pipe_lr.predict(X_train)
y_pred_test = pipe_lr.predict(X_test)

### Output

In [772]:
import os

# Pair each test activity id with its predicted outcome
activity_ids = activities_test["activity_id"]
output = pd.DataFrame({"activity_id": activity_ids, "outcome": y_pred_test})

# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
output_path = "performance/prediction.csv"
output_dir = os.path.dirname(output_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
output.to_csv(output_path, header=True, index=False)