In [55]:
import pandas as pd
import numpy as np
from IPython.display import display

# Load the three raw CSV files; `dataframes` keeps them in the order
# people, activities_train, activities_test
dataframes = [
    pd.read_csv(csv_path)
    for csv_path in ("data/people.csv", "data/act_train.csv", "data/act_test.csv")
]
people, activities_train, activities_test = dataframes
print("Successfully loaded datasets!")

Successfully loaded datasets!


### Explore People

In [56]:
# shape
n_rows, n_cols = people.shape
print("People df has {} rows and {} columns".format(n_rows, n_cols))

# Show the first five rows of the people frame.
# display() renders the frame itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
display(people.head())

People df has 189118 rows and 41 columns


Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,...,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,...,False,False,True,False,False,False,True,True,False,84


None


#### Data types

The first nine characteristics of each person (`char_1`–`char_9`), together with `group_1`, are nominal features that need to be encoded into numerical format for further analysis. The remaining characteristics (`char_10`–`char_37`) are booleans, which can be represented as either 0 or 1. Note that **`char_38`** is the only integer value; it will need to be normalized so that it does not carry more weight than the other columns.

#### Groups

There is a large number of people in group 17304 (77,314 of the 189,118 people). It would be interesting to find out why.

In [57]:
# Ten largest groups by number of people. Written as a single-argument
# print() call so the cell emits the same text under Python 2 and Python 3
# and matches the print(...) style used by the other cells.
print("# of people in each group:\n{}".format(people["group_1"].value_counts()[:10]))

# of people in each group:
group 17304    77314
group 667       1538
group 8386      1046
group 9280       666
group 450        659
group 1482       484
group 15723      461
group 3229       423
group 17899      414
group 3598       365
Name: group_1, dtype: int64


### Explore Activities

There are nine characteristics associated with activities that are type 1.  Other activity categories do not contain these characteristics, but will contain char_10 information; type 1 does not contain any char_10 info.

In [58]:
print("Activities train df has {} rows and {} columns".format(activities_train.shape[0], activities_train.shape[1]))

# A look into type 1 activities: keep the filtered rows themselves in
# type_one_activity (display() returns None, so its result must not be stored
# or printed) and render them with display().
is_type_one = activities_train["activity_category"] == "type 1"
type_one_activity = activities_train[is_type_one][:3]
display(type_one_activity)

Activities train df has 2197291 rows and 15 columns


Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
52,ppl_100025,act1_9923,2022-11-25,type 1,type 3,type 5,type 1,type 1,type 6,type 3,type 3,type 6,type 8,,0
105,ppl_100033,act1_198174,2022-07-26,type 1,type 36,type 11,type 5,type 1,type 6,type 1,type 1,type 4,type 1,,0
106,ppl_100033,act1_214090,2023-06-15,type 1,type 24,type 6,type 6,type 3,type 1,type 3,type 4,type 5,type 1,,0


None


In [59]:
print("Activities test df has {} rows and {} columns".format(activities_test.shape[0], activities_test.shape[1]))

# display first three rows in activities test file
# (display() returns None, so it is called directly instead of being printed)
display(activities_test.head(3))

Activities test df has 498687 rows and 14 columns


Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4,
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,,type 682
2,ppl_10001,act1_240724,2022-10-14,type 1,type 12,type 1,type 5,type 4,type 6,type 1,type 1,type 13,type 10,


None


#### Join dataframes

Add data from the people dataframe to activity dataframes 

In [60]:
def change_column_name(df):
    """Return a copy of ``df`` whose "char" columns carry a ``ppl_`` prefix.

    Every column whose name contains the substring "char" is renamed to
    "ppl_" + <original name>; all other columns keep their names. The frame
    passed in is not modified.
    """
    # map each matching column name to its prefixed replacement
    prefixed = {name: "ppl_" + name for name in df.columns if "char" in name}

    # rename() without inplace hands back a new frame
    return df.rename(columns=prefixed)

def join_df(activity_df, people_df, max_rows=5000):
    """Attach each activity's person attributes to the activity rows.

    Parameters
    ----------
    activity_df : DataFrame
        Activity rows; must contain a "people_id" column.
    people_df : DataFrame
        One row per person; must contain "people_id" and "date" columns.
        Its "char" columns are renamed with a "ppl_" prefix before joining
        so they cannot clash with the activity "char" columns.
    max_rows : int or None, default 5000
        Only the first ``max_rows`` activity rows are joined (the default
        keeps the previous hard-coded testing limit). Pass ``None`` to join
        every row.

    Returns
    -------
    DataFrame
        The activity columns followed by the people columns (without the
        people "date"), one row per selected activity, indexed 0..n-1.
        Activities whose people_id has no match in ``people_df`` keep their
        row and get missing values in the people columns.
    """
    # edit people df column names to avoid naming conflicts
    people_renamed = change_column_name(people_df)

    # the person's own date is not carried over; the activity keeps its date
    people_renamed = people_renamed.drop(["date"], axis=1)

    # limit the number of activity rows (positional slice, like the old [:5000])
    if max_rows is None:
        activity_subset = activity_df
    else:
        activity_subset = activity_df.iloc[:max_rows]

    # A single left join on people_id replaces the former per-row
    # filter-and-append loop: it is linear instead of quadratic, does not rely
    # on DataFrame.append, does not need a 0..n-1 index on activity_df, and
    # cannot shift rows out of alignment when an id has no match.
    result = activity_subset.merge(people_renamed, on="people_id", how="left")

    return result

In [61]:
# Attach the people attributes to the activity rows of both splits
training_data = join_df(activity_df=activities_train, people_df=people)
test_data = join_df(activity_df=activities_test, people_df=people)
dataframes = [training_data, test_data]

# Checkpoint the joined frames so later cells can reload them from disk
training_data.to_csv("processed_data/train.csv")
test_data.to_csv("processed_data/test.csv")

# Preview the first rows of the joined training data
training_data.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,...,ppl_char_29,ppl_char_30,ppl_char_31,ppl_char_32,ppl_char_33,ppl_char_34,ppl_char_35,ppl_char_36,ppl_char_37,ppl_char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,...,False,True,True,False,False,True,True,True,False,36.0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36.0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36.0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36.0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,...,False,True,True,False,False,True,True,True,False,36.0


In [189]:
def load_processed_data():
    """Read the saved train and test CSVs back from ``processed_data/``.

    The first column of each file is used as the index. Returns the pair
    ``(train, test)``.
    """
    frames = [
        pd.read_csv(csv_path, index_col=0)
        for csv_path in ("processed_data/train.csv", "processed_data/test.csv")
    ]
    return frames[0], frames[1]

training_data, test_data = load_processed_data()

#### Datetime

In [135]:
# Convert date column to datetime
def date_time(column):
    """Parse ``column`` with ``pd.to_datetime`` and return the parsed result."""
    return pd.to_datetime(column)

# Replace the raw "date" strings with parsed datetimes in both frames
for frame in (training_data, test_data):
    frame["date"] = date_time(frame["date"])

#### Handling categorical data

For the characteristics that take values such as type 1, type 2, type 3, etc., the word 'type' will be dropped and the integers will be one-hot encoded. Rows that contain missing values will be zeroed and then encoded. The `group_1` column will also be encoded. Columns containing True or False values will be converted to either 0 or 1.

In [193]:
def dummy_features(df, columns=None):
    """One-hot encode the categorical columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame containing every column named in ``columns``. It is not
        modified, so the calling cell can be re-run safely.
    columns : list of str, optional
        Columns to encode. Defaults to the activity category, the activity
        characteristics char_1..char_10, the people characteristics
        ppl_char_1..ppl_char_9 and group_1.

    Returns
    -------
    DataFrame
        A new frame holding the columns of ``df`` that were not selected,
        followed by the dummy columns produced by ``pd.get_dummies``.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``.
    """
    if columns is None:
        columns = ["activity_category", "char_1", "char_2",
                   "char_3", "char_4", "char_5", "char_6",
                   "char_7", "char_8", "char_9", "char_10",
                   "ppl_char_1", "ppl_char_2", "ppl_char_3",
                   "ppl_char_4", "ppl_char_5", "ppl_char_6",
                   "ppl_char_7", "ppl_char_8", "ppl_char_9",
                   "group_1"]
    else:
        columns = list(columns)

    # Create dummy features for selected columns
    encoded_data = pd.get_dummies(df[columns])

    # drop the old label columns on a new frame instead of in place, so the
    # caller's frame keeps its columns
    remaining = df.drop(columns, axis=1)

    # both frames carry df's index, so a plain index join lines the rows up
    return remaining.join(encoded_data)

def to_boolean(x):
    """Map a value equal to True to 1 and a value equal to False to 0.

    Any value that compares equal to neither is returned unchanged.
    """
    # checked in the same order as before: True first, then False
    for flag, code in ((True, 1), (False, 0)):
        if x == flag:
            return code
    return x

In [194]:
# generate dummy features for selected columns: one-hot encode the categorical
# columns of the training frame and rebind training_data to the encoded result
# (test_data is not encoded in this cell)
training_data = dummy_features(training_data)

In [195]:
# Apply to_boolean to every cell: values equal to True/False become 1/0,
# all other values are returned unchanged
training_data = training_data.applymap(to_boolean)

#### Handle continuous data

In [196]:
# Min-max scale ppl_char_38: subtract the column minimum, divide by its range
training_data["ppl_char_38"] = training_data["ppl_char_38"].pipe(
    lambda values: (values - values.min()) / (values.max() - values.min())
)

In [201]:
# a look at the processed data: preview the first rows of training_data
# after the encoding, boolean conversion and scaling cells above have run
training_data.head()

Unnamed: 0,people_id,activity_id,date,outcome,ppl_char_10,ppl_char_11,ppl_char_12,ppl_char_13,ppl_char_14,ppl_char_15,...,group_1_group 8688,group_1_group 8921,group_1_group 9104,group_1_group 9107,group_1_group 9280,group_1_group 9405,group_1_group 9439,group_1_group 9603,group_1_group 9726,group_1_group 9728
0,ppl_100,act2_1734928,2023-08-26,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,ppl_100,act2_2434093,2022-09-27,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,ppl_100,act2_3404049,2022-09-27,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,ppl_100,act2_3651215,2023-08-04,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,ppl_100,act2_4109017,2023-08-26,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
