In [101]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
from IPython.display import display

# read datasets
people = pd.read_csv("data/people.csv")
activities_train = pd.read_csv("data/act_train.csv")
activities_test = pd.read_csv("data/act_test.csv")
dataframes = [people, activities_train, activities_test]
print("Successfully loaded datasets!")

Successfully loaded datasets!


### Explore People

In [102]:
# shape 
print("People df has {} rows and {} columns".format(people.shape[0], people.shape[1]))

# display first five people file
print(display(people.head()))

People df has 189118 rows and 41 columns


Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,type 2,type 4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,type 2,type 2,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,type 2,type 2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,84


None


In [67]:
print len(people["people_id"].unique())

189118


#### Data types

The first 10 characteristics of each person are nominal features that need to be encoded into numerical format for further analysis.  The rest of the characteristics are booleans, which can then be represented as either 0 or 1. Note that **`char_38`** is the only integer value, which will need to be normalized to not carry more weight than the other columns.  

#### Groups

There is a large number of people in group 17314.  It would be interesting to find out why.

In [57]:
print "# of people in each group:\n", people["group_1"].value_counts()[:10]                                                     

# of people in each group:
group 17304    77314
group 667       1538
group 8386      1046
group 9280       666
group 450        659
group 1482       484
group 15723      461
group 3229       423
group 17899      414
group 3598       365
Name: group_1, dtype: int64


### Explore Activities

There are nine characteristics associated with activities that are type 1.  Other activity categories do not contain these characteristics, but will contain char_10 information; type 1 does not contain any char_10 info.

In [58]:
print("Activities train df has {} rows and {} columns".format(activities_train.shape[0], activities_train.shape[1]))

# A look into type 1 activities
type_one_activity = display(activities_train[activities_train["activity_category"] == "type 1"][:3])
print type_one_activity

Activities train df has 2197291 rows and 15 columns


Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
52,ppl_100025,act1_9923,2022-11-25,type 1,type 3,type 5,type 1,type 1,type 6,type 3,type 3,type 6,type 8,,0
105,ppl_100033,act1_198174,2022-07-26,type 1,type 36,type 11,type 5,type 1,type 6,type 1,type 1,type 4,type 1,,0
106,ppl_100033,act1_214090,2023-06-15,type 1,type 24,type 6,type 6,type 3,type 1,type 3,type 4,type 5,type 1,,0


None


In [8]:
print("Activities test df has {} rows and {} columns".format(activities_test.shape[0], activities_test.shape[1]))

# display first five rows in activities test file
print(display(activities_test.unique))

Activities test df has 498687 rows and 14 columns
151295


#### Join dataframes

Add data from the people dataframe to activity dataframes 

In [103]:
def change_column_name(df):    
    # get char columns
    columns = [col for col in df.columns if "char" in col]
    
    # copy original df
    df_copy = df.copy()
    
    for col in columns:
        # change columns that begin with 'char'
        new_col = "ppl_" + col
        # rename column
        df_copy.rename(columns = {col:new_col}, inplace = True)
        
    return df_copy

def join_df(activity_df, people_df, split_size):
    # edit people df column names to avoid naming conflicts
    people_df_2 = change_column_name(people_df)
    
    # get new columns
    columns = people_df_2.columns
    
    # create new dataframe with these column names
    new_df = pd.DataFrame(columns = columns)
    
    # add people data to each row of this new dataframe
    activity_df_copy = activity_df.copy()[:split_size] # testing env
    
    for i in range(0, len(activity_df_copy)):
        # get people_id from activities dataframe
        people_id = activity_df_copy.loc[i, "people_id"]
        
        # get person's data for people_id from people df
        people_data = people_df_2[people_df_2["people_id"] == people_id]
        
        # append this data to new df
        new_df = new_df.append(people_data, ignore_index = True)
        
    # drop a few unwanted columns 
    new_df.drop(["people_id", "date"], axis=1, inplace=True)
    
    # join dfs
    result = pd.concat([activity_df_copy, new_df], axis=1, join='inner') 
    
    return result

In [104]:
# combine people and activities for training and test sets
training_data = join_df(activities_train, people, split_size = 5000)
test_data = join_df(activities_test, people, split_size = 1000)
dataframes = [training_data, test_data]

# save progress
training_data.to_csv(path_or_buf = "processed_data/train.csv")
test_data.to_csv(path_or_buf = "processed_data/test.csv")

# a sample of the new training data
training_data.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome,ppl_char_1,group_1,ppl_char_2,ppl_char_3,ppl_char_4,ppl_char_5,ppl_char_6,ppl_char_7,ppl_char_8,ppl_char_9,ppl_char_10,ppl_char_11,ppl_char_12,ppl_char_13,ppl_char_14,ppl_char_15,ppl_char_16,ppl_char_17,ppl_char_18,ppl_char_19,ppl_char_20,ppl_char_21,ppl_char_22,ppl_char_23,ppl_char_24,ppl_char_25,ppl_char_26,ppl_char_27,ppl_char_28,ppl_char_29,ppl_char_30,ppl_char_31,ppl_char_32,ppl_char_33,ppl_char_34,ppl_char_35,ppl_char_36,ppl_char_37,ppl_char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0,type 2,group 17304,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36.0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36.0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36.0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36.0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0,type 2,group 17304,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36.0


In [208]:
def load_processed_data():
    train = pd.read_csv("processed_data/train.csv", index_col = 0)
    test = pd.read_csv("processed_data/test.csv", index_col = 0)
    return train, test
training_data, test_data = load_processed_data()

#### Datetime

In [106]:
# Convert date column to datetime
def date_time(column):
    column = pd.to_datetime(column)
    return column

training_data["date"] = date_time(training_data["date"])
test_data["date"] = date_time(test_data["date"])

#### Handling categorical data and booleans

For the characteristics that take type 1, type 2, type 3 etc, the word 'type' will be dropped and the integers will be one-hot encoded.  For rows that contain missing values, they will be zeroed and then encoded. Group_1 columsn will also be encoded.  For columns containing True or False values, these will be converted to either 0 or 1.  

In [108]:
def dummy_features(df):
    columns = ["activity_category", "char_1", "char_2", 
                     "char_3", "char_4", "char_5", "char_6", 
                     "char_7", "char_8", "char_9", "char_10",
                     "ppl_char_1", "ppl_char_2", "ppl_char_3",
                     "ppl_char_4", "ppl_char_5", "ppl_char_6",
                     "ppl_char_7", "ppl_char_8", "ppl_char_9",
                     "group_1"]
    
    # Create dummy features for selected columns
    encoded_data = pd.get_dummies(df[columns])
    
    # drop columns for old labels
    df.drop(columns, axis=1, inplace=True)
    
    # join dataframe containing encoded data
    df = df.join(encoded_data, how = 'outer')
    return df

def to_boolean(x):
    if x == True:
        return 1
    elif x == False:
        return 0
    else:
        return x

In [109]:
# generate dummy features for selected columns
training_data = dummy_features(training_data)

# applies a function to each element to process boolean type values but leave all elements unchanged
training_data = training_data.applymap(lambda x: to_boolean(x))

test_data = dummy_features(test_data)
test_data = test_data.applymap(lambda x: to_boolean(x))

#### Handle continuous data

In [111]:
# normalize column
training_data["ppl_char_38"] = training_data[["ppl_char_38"]].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
test_data["ppl_char_38"] = test_data[["ppl_char_38"]].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

### Extract class label

In [112]:
y_train = training_data["outcome"]

training_data.drop(["outcome"], axis = 1, inplace = True)
training_data.head()

Unnamed: 0,people_id,activity_id,date,ppl_char_10,ppl_char_11,ppl_char_12,ppl_char_13,ppl_char_14,ppl_char_15,ppl_char_16,ppl_char_17,ppl_char_18,ppl_char_19,ppl_char_20,ppl_char_21,ppl_char_22,ppl_char_23,ppl_char_24,ppl_char_25,ppl_char_26,ppl_char_27,ppl_char_28,ppl_char_29,ppl_char_30,ppl_char_31,ppl_char_32,ppl_char_33,ppl_char_34,ppl_char_35,ppl_char_36,ppl_char_37,ppl_char_38,activity_category_type 1,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_1_type 1,char_1_type 10,char_1_type 11,char_1_type 12,char_1_type 13,char_1_type 15,char_1_type 16,char_1_type 17,char_1_type 19,char_1_type 2,char_1_type 20,char_1_type 23,char_1_type 24,char_1_type 25,char_1_type 26,char_1_type 29,char_1_type 3,char_1_type 30,char_1_type 36,char_1_type 4,char_1_type 41,char_1_type 5,char_1_type 6,char_1_type 7,char_1_type 8,char_1_type 9,char_2_type 1,char_2_type 10,char_2_type 11,char_2_type 12,char_2_type 13,char_2_type 14,char_2_type 16,char_2_type 17,char_2_type 19,char_2_type 2,char_2_type 25,char_2_type 26,char_2_type 29,char_2_type 3,char_2_type 4,char_2_type 5,char_2_type 6,char_2_type 7,char_2_type 8,char_2_type 9,char_3_type 1,char_3_type 2,char_3_type 3,char_3_type 4,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 1,char_4_type 2,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_5_type 1,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_6_type 1,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_7_type 1,char_7_type 2,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_8_type 1,char_8_type 10,char_8_type 11,char_8_type 12,char_8_type 13,char_8_type 14,char_8_type 15,char_8_type 16,char_8_type 18,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_8_type 9,char_9_type 1,char_9_type 10,char_9_type 12,char_9_type 13,char_9_type 14,char_9_type 15,char_9_type 16,char_9_type 17,char_9_type 18,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,char_10_type 1,char_10_type 103,char_10_type 1033,char_10_type 1038,char_10_type 1058,char_10_type 1065,char_10_type 1069,char_10_type 1070,char_10_type 1073,char_10_type 1089,char_10_type 1097,char_10_type 110,char_10_type 1109,char_10_type 111,char_10_type 1111,char_10_type 1116,char_10_type 114,char_10_type 1146,char_10_type 1154,char_10_type 117,char_10_type 1188,char_10_type 1205,char_10_type 1217,char_10_type 123,char_10_type 1231,char_10_type 1251,char_10_type 1259,char_10_type 1261,char_10_type 1264,char_10_type 1271,char_10_type 1302,char_10_type 1305,char_10_type 131,char_10_type 1313,char_10_type 132,char_10_type 133,char_10_type 1341,char_10_type 1352,char_10_type 137,char_10_type 1370,char_10_type 1372,char_10_type 138,char_10_type 1382,char_10_type 1384,char_10_type 1398,char_10_type 141,char_10_type 143,char_10_type 1441,char_10_type 1451,char_10_type 1485,char_10_type 1496,char_10_type 1501,char_10_type 1502,char_10_type 151,char_10_type 1527,char_10_type 155,char_10_type 156,char_10_type 157,char_10_type 1619,char_10_type 166,char_10_type 167,char_10_type 1674,char_10_type 1675,char_10_type 1676,char_10_type 1716,char_10_type 1727,char_10_type 1771,char_10_type 1772,char_10_type 183,char_10_type 184,char_10_type 1876,char_10_type 19,char_10_type 190,char_10_type 1906,char_10_type 194,char_10_type 1953,char_10_type 1959,char_10_type 197,char_10_type 198,char_10_type 199,char_10_type 2,char_10_type 20,char_10_type 201,char_10_type 2020,char_10_type 2023,char_10_type 203,char_10_type 2032,char_10_type 2037,char_10_type 207,char_10_type 2108,char_10_type 214,char_10_type 2225,char_10_type 2260,char_10_type 227,char_10_type 23,char_10_type 230,char_10_type 2316,...,ppl_char_7_type 23,ppl_char_7_type 24,ppl_char_7_type 25,ppl_char_7_type 3,ppl_char_7_type 4,ppl_char_7_type 5,ppl_char_7_type 6,ppl_char_7_type 7,ppl_char_7_type 8,ppl_char_7_type 9,ppl_char_8_type 1,ppl_char_8_type 2,ppl_char_8_type 3,ppl_char_8_type 4,ppl_char_8_type 5,ppl_char_8_type 6,ppl_char_8_type 7,ppl_char_8_type 8,ppl_char_9_type 1,ppl_char_9_type 2,ppl_char_9_type 3,ppl_char_9_type 4,ppl_char_9_type 5,ppl_char_9_type 6,ppl_char_9_type 7,ppl_char_9_type 8,ppl_char_9_type 9,group_1_group 10089,group_1_group 10233,group_1_group 1024,group_1_group 10409,group_1_group 10603,group_1_group 10605,group_1_group 10664,group_1_group 1102,group_1_group 11040,group_1_group 11102,group_1_group 11143,group_1_group 11516,group_1_group 1179,group_1_group 11807,group_1_group 11871,group_1_group 1233,group_1_group 12451,group_1_group 12624,group_1_group 1268,group_1_group 12731,group_1_group 12784,group_1_group 1303,group_1_group 13191,group_1_group 1323,group_1_group 13545,group_1_group 13865,group_1_group 1387,group_1_group 1401,group_1_group 14038,group_1_group 14102,group_1_group 14277,group_1_group 14626,group_1_group 14879,group_1_group 1490,group_1_group 15313,group_1_group 15470,group_1_group 15723,group_1_group 15916,group_1_group 16348,group_1_group 1654,group_1_group 16579,group_1_group 16762,group_1_group 16885,group_1_group 16905,group_1_group 17304,group_1_group 17386,group_1_group 17528,group_1_group 17557,group_1_group 17602,group_1_group 17627,group_1_group 17649,group_1_group 17672,group_1_group 17909,group_1_group 18035,group_1_group 18063,group_1_group 18137,group_1_group 18242,group_1_group 18272,group_1_group 18279,group_1_group 18415,group_1_group 18441,group_1_group 18442,group_1_group 18594,group_1_group 18630,group_1_group 18906,group_1_group 18966,group_1_group 1913,group_1_group 19133,group_1_group 19236,group_1_group 19480,group_1_group 19527,group_1_group 19662,group_1_group 1971,group_1_group 19768,group_1_group 19853,group_1_group 20002,group_1_group 20266,group_1_group 20470,group_1_group 21014,group_1_group 21079,group_1_group 21314,group_1_group 2139,group_1_group 21502,group_1_group 21589,group_1_group 21901,group_1_group 21968,group_1_group 21985,group_1_group 22206,group_1_group 22481,group_1_group 22612,group_1_group 22614,group_1_group 22741,group_1_group 22808,group_1_group 2308,group_1_group 23148,group_1_group 2325,group_1_group 2352,group_1_group 2362,group_1_group 23698,group_1_group 23826,group_1_group 2401,group_1_group 24193,group_1_group 24280,group_1_group 2449,group_1_group 249,group_1_group 25169,group_1_group 2538,group_1_group 25842,group_1_group 26217,group_1_group 27538,group_1_group 27862,group_1_group 28414,group_1_group 28984,group_1_group 29362,group_1_group 29437,group_1_group 30182,group_1_group 30844,group_1_group 31187,group_1_group 31542,group_1_group 32343,group_1_group 32519,group_1_group 32870,group_1_group 33262,group_1_group 33592,group_1_group 33595,group_1_group 33699,group_1_group 33913,group_1_group 34775,group_1_group 3493,group_1_group 35053,group_1_group 3537,group_1_group 3538,group_1_group 35931,group_1_group 3598,group_1_group 36096,group_1_group 36905,group_1_group 37134,group_1_group 37488,group_1_group 37633,group_1_group 38420,group_1_group 38523,group_1_group 3873,group_1_group 38745,group_1_group 38766,group_1_group 3886,group_1_group 38924,group_1_group 39163,group_1_group 39166,group_1_group 39309,group_1_group 39664,group_1_group 40246,group_1_group 4146,group_1_group 4152,group_1_group 41627,group_1_group 42007,group_1_group 4204,group_1_group 4289,group_1_group 432,group_1_group 44217,group_1_group 450,group_1_group 4556,group_1_group 45749,group_1_group 46913,group_1_group 46957,group_1_group 4699,group_1_group 4724,group_1_group 4742,group_1_group 4744,group_1_group 476,group_1_group 48070,group_1_group 48532,group_1_group 48590,group_1_group 486,group_1_group 49,group_1_group 49305,group_1_group 5035,group_1_group 5040,group_1_group 50513,group_1_group 50737,group_1_group 5149,group_1_group 5399,group_1_group 5641,group_1_group 5693,group_1_group 5826,group_1_group 584,group_1_group 6153,group_1_group 627,group_1_group 637,group_1_group 6423,group_1_group 649,group_1_group 6534,group_1_group 6668,group_1_group 667,group_1_group 6705,group_1_group 6757,group_1_group 678,group_1_group 6826,group_1_group 6836,group_1_group 7011,group_1_group 7124,group_1_group 7256,group_1_group 7331,group_1_group 7350,group_1_group 7518,group_1_group 7636,group_1_group 768,group_1_group 7737,group_1_group 7743,group_1_group 7936,group_1_group 8006,group_1_group 8541,group_1_group 858,group_1_group 8590,group_1_group 8688,group_1_group 8921,group_1_group 9104,group_1_group 9107,group_1_group 9280,group_1_group 9405,group_1_group 9439,group_1_group 9603,group_1_group 9726,group_1_group 9728
0,ppl_100,act2_1734928,2023-08-26,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ppl_100,act2_2434093,2022-09-27,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ppl_100,act2_3404049,2022-09-27,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ppl_100,act2_3651215,2023-08-04,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,ppl_100,act2_4109017,2023-08-26,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0.36,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Drop columns

In [113]:
def drop(df, columns):
    df.drop(columns, axis = 1, inplace = True)
    return df

In [114]:
columns = ["people_id", "activity_id", "date"]
X_train = drop(training_data, columns)
X_test = drop(test_data, columns)

### Validation Set

In [156]:
# create validation set
def split(X, y):
    mini_train_size = int(X.shape[0] * .8)
    X_train_mini, X_val = X.iloc[:mini_train_size], X.iloc[mini_train_size:]
    y_train_mini, y_val = y[:mini_train_size], y[mini_train_size:]
    return X_train_mini, X_val, y_train_mini, y_val

X_train_mini, X_val, y_train_mini, y_val = split(X_train, y_train)

### Pipeline transforms with estimator, then get cross val scores

In [157]:
from sklearn.preprocessing import StandardScaler
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.cross_validation import cross_val_score

def cross_val(X, y):
    scores = cross_val_score(estimator = pipe_lr,
                         X = X,
                         y = y,
                         cv = 10,
                         n_jobs = 1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('lda', LDA(n_components=2)),
                    ('clf', LogisticRegression(random_state=1))])





CV accuracy scores: [ 0.72069825  0.98753117  0.89775561  0.57605985  0.715       0.865
  0.83709273  0.76190476  0.77443609  0.64160401]
CV accuracy: 0.778 +/- 0.117


#### Metrics

In [165]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

pipe_lr.fit(X_train_mini, y_train_mini)
y_pred_train = pipe_lr.predict(X_train_mini)
y_pred_test = pipe_lr.predict(X_val)

# f1 scores
print "f1 score on training set:", f1_score(y_train_mini, y_pred_train)
print "f1 score on testing set:", f1_score(y_val, y_pred_test)

# area under curve
auc_score = roc_auc_score(y_val, y_pred_test)
print("Area under the curve:", auc_score)

f1 score on training set: 0.977975528365
f1 score on testing set: 0.711351351351
('Area under the curve:', 0.74699662223997065)


In [39]:

def feature_selection():
    from sklearn.ensemble import RandomForestClassifier

    # get feature columns
    feat_labels = training_data.columns[3:]
    sample_training = training_data.iloc[:, 3:]

    # init forest
    forest = RandomForestClassifier(n_estimators = 150, random_state = 0, n_jobs = -1)
    forest.fit(sample_training, y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # choose features with a .01 score or greater
    n_features = [feature for feature in importances if feature > .004]

    feat_importances = {}
    for i, label in enumerate(feat_labels):
        feat_importances[label] = importances[i]

    relevant_feature_labels = []
    for key, val in feat_importances.items():
        for feat_val in n_features:
            if feat_val == val:
                relevant_feature_labels.append(key)
    return relevant_feature_labels