Hello World!
This notebook describes the decision tree based Machine Learning model I have created
to segment the users of Habits app.

# Looking around the data set and getting all colums in required format

In [80]:
# Import the required modules
import pandas as pd
import numpy as np
import scipy as sp

In [81]:
# simple function to read in the user data file.
# the argument parse_dates takes in a list of colums, which are to be parsed as date format
user_data_raw = pd.read_csv("janacare_user-engagement_Aug2014-Apr2016.csv", parse_dates = [-3,-2,-1])

In [82]:
# data metrics
user_data_raw.shape # Rows , colums

(372, 19)

In [83]:
# data metrics
user_data_raw.dtypes # data type of colums

user_id                                                        float64
num_modules_consumed                                           float64
num_glucose_tracked                                            float64
num_of_days_steps_tracked                                      float64
num_of_days_food_tracked                                       float64
num_of_days_weight_tracked                                     float64
insulin_a1c_count                                              float64
cholesterol_count                                              float64
hemoglobin_count                                               float64
watching_videos (binary - 1 for yes, blank/0 for no)           float64
weight                                                         float64
height                                                           int64
bmi                                                              int64
age                                                              int64
gender

As is visible from the last column data type, Pandas is not recognising it as date type format. 
This will make things difficult, so I delete this particular column and add a new one.
Since the data in *age_on_platform* can be recreated from *last_activity* & *first_login* colums

In [84]:
# drop last column
user_data_del_last_col = user_data_raw.drop("age_on_platform", 1)

In [85]:
# Check if colums has been deleted. Number of column changed from 19 to 18
user_data_del_last_col.shape

(372, 18)

In [86]:
# Copy data frame 'user_data_del_last_col' into a new one
user_data = user_data_del_last_col

In [87]:
# Create new column 'age_on_platform' which has the corresponding value in date type format
user_data["age_on_platform"] = user_data_del_last_col["last_activity"]-user_data_del_last_col["first_login"]

In [88]:
# Check the result in first few rows
user_data["age_on_platform"].head(5)

0   151 days
1   129 days
2   211 days
3   235 days
4     3 days
Name: age_on_platform, dtype: timedelta64[ns]

#### The column name *watching_videos (binary - 1 for yes, blank/0 for no)* is too long and has special chars, lets change it to *watching_videos*

In [89]:
user_data = user_data.rename(columns = {'watching_videos (binary - 1 for yes, blank/0 for no)':'watching_videos'})

In [None]:
# Some basic statistical information on the data
user_data.describe()

# Data Clean up

In the last section of looking around, I saw that a lot of rows do not have any values or have garbage values(see first row of the table above).
This can cause errors when computing anything using the values in these rows, hence a clean up is required.

We will clean up only those columns, that are being used for features.

* **num_modules_consumed**
* **num_glucose_tracked**
* **num_of_days_food_tracked**
* **watching_videos**

The next two colums will not be cleaned, as they contain time data which in my opinion should not be imputed
* **first_login**
* **last_activity**

In [None]:
# Lets check the health of the data set
user_data.info()

The second column of the above table describes, the number of non-null values in the respective column.
As is visible for the columns of interest for us,
eg. *num_modules_consumed* has ONLY 69 values out of possible 371 total

In [90]:
# Lets remove all columns from the data set that do not have to be imputed - 
user_data_to_impute = user_data.drop(["user_id", "watching_videos", "num_of_days_steps_tracked", "num_of_days_weight_tracked", "insulin_a1c_count", "weight", "height", "bmi", "age", "gender", "has_diabetes", "first_login", "last_activity", "age_on_platform", "hemoglobin_count", "cholesterol_count"], 1 )

In [None]:
user_data_to_impute.info()

### The next 3 cells describes the steps to Impute data using KNN strategy, sadly this is not working well for our data set! One possible reason could be that the column is too sparse to find a neighbourer !

In [None]:
# Import Imputation method KNN
from fancyimpute import KNN

In [None]:
# First lets convert the Pandas Dataframe into a Numpy array. We do this since the data frame needs to be transposed,
# which is only possible if the format is an Numpy array.
user_data_to_impute_np_array = user_data_to_impute.as_matrix()
# Lets Transpose it
user_data_to_impute_np_array_transposed = user_data_to_impute_np_array.T

In [None]:
# usage X_filled_knn = KNN(k=3).complete(X_incomplete)
user_data_imputed_knn_np_array = KNN(k=5).complete(user_data_to_impute_np_array_transposed)

### The above 3 steps are for KNN based Imputation, did not work well. As visible 804 items could not be imputed for and get replaced with zero

In [91]:
# Lets use simpler method that is provided by Scikit Learn itself
# import the function
from sklearn.preprocessing import Imputer

In [92]:
# Create an object of class Imputer, with the relvant parameters
imputer_object = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)

In [93]:
# Impute the data and save the generated Numpy array
user_data_imputed_np_array = imputer_object.fit_transform(user_data_to_impute)

#### the *user_data_imputed_np_array* is a NumPy array, we need to convert it back to Pandas data frame

In [94]:
# create a list of tuples, with the column name and data type for all existing columns in the Numpy array.
# exact order of columns has to be maintained
column_names_of_imputed_np_array = ['num_modules_consumed', 'num_glucose_tracked', 'num_of_days_food_tracked']
# create the Pandas data frame from the Numpy array
user_data_imputed_data_frame = pd.DataFrame(user_data_imputed_np_array, columns=column_names_of_imputed_np_array)
# Check if the data frame created now is proper
user_data_imputed_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 3 columns):
num_modules_consumed        372 non-null float64
num_glucose_tracked         372 non-null float64
num_of_days_food_tracked    372 non-null float64
dtypes: float64(3)
memory usage: 8.8 KB


### Now lets add back the useful colums that we had removed from data set, these are
* *last_activity*
* *first_login*
* *age_on_platform*
* *watching_videos*

In [95]:
# using the Series contructor from Pandas
user_data_imputed_data_frame['first_login'] = pd.Series(user_data['first_login'])
user_data_imputed_data_frame['last_activity'] = pd.Series(user_data['last_activity'])
user_data_imputed_data_frame['age_on_platform'] = pd.Series(user_data['age_on_platform'])
# Check if every thing is Ok
user_data_imputed_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 6 columns):
num_modules_consumed        372 non-null float64
num_glucose_tracked         372 non-null float64
num_of_days_food_tracked    372 non-null float64
first_login                 372 non-null datetime64[ns]
last_activity               302 non-null datetime64[ns]
age_on_platform             302 non-null timedelta64[ns]
dtypes: datetime64[ns](2), float64(3), timedelta64[ns](1)
memory usage: 17.5 KB


#### As mentioned in column description for *watching_videos* a blank or no value, means '0' also know as 'Not watching'
#### Since Scikit Learn models can ONLY deal with numerical values, lets convert all blanks to '0'

In [96]:
user_data_imputed_data_frame['watching_videos'] = pd.Series(user_data['watching_videos'].fillna(0))
user_data_imputed_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 7 columns):
num_modules_consumed        372 non-null float64
num_glucose_tracked         372 non-null float64
num_of_days_food_tracked    372 non-null float64
first_login                 372 non-null datetime64[ns]
last_activity               302 non-null datetime64[ns]
age_on_platform             302 non-null timedelta64[ns]
watching_videos             372 non-null float64
dtypes: datetime64[ns](2), float64(4), timedelta64[ns](1)
memory usage: 20.4 KB


### Finally the columns *first_login*, *last_activity*, *age_on_platform* have missing values, as evident from above table. Since this is time data, that in my opinion should not be imputed, we will drop/delete the columns.

In [97]:
# Since only these three columns are having null values, we can run the function *dropna()* on the whole data frame
user_data_imputed_data_frame.dropna(axis=0, inplace=True)
user_data_imputed_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 370
Data columns (total 7 columns):
num_modules_consumed        302 non-null float64
num_glucose_tracked         302 non-null float64
num_of_days_food_tracked    302 non-null float64
first_login                 302 non-null datetime64[ns]
last_activity               302 non-null datetime64[ns]
age_on_platform             302 non-null timedelta64[ns]
watching_videos             302 non-null float64
dtypes: datetime64[ns](2), float64(4), timedelta64[ns](1)
memory usage: 18.9 KB


# Labelling the Raw data

Now comes the code that will based on the rules mentioned below label the provided data, so it can be used as trainning data for the classifer.

This tables defines the set of rules used to assign labels for Traning data

| label               | age_on_platform      | last_activity             | num_modules_comsumed        | num_of_days_food_tracked | num_glucose_tracked         | watching_videos  |
|---------------------|----------------------|---------------------------|-----------------------------|--------------------------|-----------------------------|------------------|
| Generic (ignore)    | Converted to days    | to be Measured from 16Apr | Good >= 3/week Bad < 3/week | Good >= 30 Bad < 30      | Good >= 4/week Bad < 4/week | Good = 1 Bad = 0 |
| good_new_user       | >= 30 days && < 180  | <= 2 days                 | >= 12                       | >= 20                    | >= 16                       | Good = 1         |
| bad_new_user        | >= 30 days && < 180  | > 2 days                  | < 12                        | < 20                     | < 16                        | Bad = 0          |
| good_mid_term_user  | >= 180 days && < 360 | <= 7 days                 | >= 48                       | >= 30                    | >= 96                       | Good = 1         |
| bad_mid_term_user   | >= 180 days && <360  | > 7 days                  | < 48                        | < 30                     | < 96                        | Bad = 0          |
| good_long_term_user | >= 360 days          | <= 14 days                | >= 48                       | >= 30                    | >= 192                      | Good = 1         |
| bad_long_term_user  | >= 360 days          | > 14 days                 | < 48                        | < 30                     | < 192                       | Bad = 0          |

In [99]:
one_month = 30
#one_month = one_month.astype(int)
six_month = 180

In [105]:
for index, row in user_data_imputed_data_frame.iterrows():
    if (row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int) >= one_month and (row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int) < six_month:
        user_data_imputed_data_frame["label"] = 1
    elif ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int)) >=180  and ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int))< 360:
        user_data_imputed_data_frame["label"] = 3
    elif ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int))>= 360:
        user_data_imputed_data_frame["label"] = 5
    else:
        row["label"] = 0
user_data_imputed_data_frame.info()
user_data_imputed_data_frame.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 370
Data columns (total 8 columns):
num_modules_consumed        302 non-null float64
num_glucose_tracked         302 non-null float64
num_of_days_food_tracked    302 non-null float64
first_login                 302 non-null datetime64[ns]
last_activity               302 non-null datetime64[ns]
age_on_platform             302 non-null timedelta64[ns]
watching_videos             302 non-null float64
label                       302 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(1), timedelta64[ns](1)
memory usage: 21.2 KB


Unnamed: 0,num_modules_consumed,num_glucose_tracked,num_of_days_food_tracked,age_on_platform,watching_videos,label
count,302.0,302.0,302.0,302,302.0,302.0
mean,12.072464,17.824758,29.576923,142 days 21:32:11.125827,0.321192,3.0
std,6.508527,21.23903,23.781469,168 days 20:03:46.790780,0.467709,0.0
min,1.0,1.0,1.0,-300 days +00:00:00,0.0,3.0
25%,12.072464,17.769231,29.576923,28 days 00:00:00,0.0,3.0
50%,12.072464,17.769231,29.576923,108 days 00:00:00,0.0,3.0
75%,12.072464,17.769231,29.576923,234 days 12:00:00,1.0,3.0
max,78.0,260.0,229.0,667 days 00:00:00,1.0,3.0


In [103]:
for index, row in user_data_imputed_data_frame.iterrows():
    if row["age_on_platform"] >= np.timedelta64(30, 'D') and row["age_on_platform"] < np.timedelta64(180, 'D'):
        user_data_imputed_data_frame["label"] = 1
    elif ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int)) >=180  and ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int))< 360:
        user_data_imputed_data_frame["label"] = 3
    elif ((row["age_on_platform"] / np.timedelta64(1, 'D')).astype(int))>= 360:
        user_data_imputed_data_frame["label"] = 5
        
    