In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale

# You can scale your data or normalize it

In [2]:
# "Scaling" often means min-max scaling: mapping each feature onto the range 0 to 1.
# sklearn's `scale`, however, STANDARDIZES the data instead: by default each column
# is centered to its mean and scaled to unit variance.
# NOTE(review): standardizing is usually less distorted by a single extreme value than
# min-max scaling, but the mean and variance are still influenced by outliers -- confirm
# before relying on it for outlier-heavy data.
df = pd.read_csv('daily_activities_and_happiness.csv')

In [3]:
# Peek at the first rows to see which columns were loaded from the CSV.
df.head()

Unnamed: 0,daily_minutes_hobby,daily_minutes_exercise,daily_minutes_grooming,daily_minutes_commuting,daily_minutes_tv,daily_minutes_talking_to_friend,happiness_rating
0,19,16,14,20,95,22,2
1,17,23,8,64,72,14,1
2,20,21,18,53,30,18,2
3,23,21,20,65,22,15,2
4,28,7,19,38,36,14,2


In [4]:
# IPython introspection (interactive use only): `??` prints the signature and source of `scale`.
scale??

Signature: scale(X, *, axis=0, with_mean=True, with_std=True, copy=True)
Source:
@_deprecate_positional_args
def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
    """Standardize a dataset along any axis.

    Center to the mean and component wise scale to unit variance.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

# Think about extracting data from irregular
# and heterogeneous time series

In [5]:
from numpy.random import poisson, randint, choice

In [6]:
# Quick look at the sampler: draw 10 values from a Poisson distribution with rate lam=5.
# No random seed is set in the visible cells, so the numbers differ between runs.
poisson(lam=5, size=10)

array([ 2,  6,  9,  6,  5,  5, 10,  4,  5,  1])

In [7]:
# Simulate irregular per-user time series: each simulated user logs a
# Poisson-distributed value on a random subset of days in a fixed window.
n_users = 10
n_days = 10

# The candidate dates are identical for every user, so build them once
# instead of on every loop iteration. 'D' is the canonical daily alias.
dates = pd.date_range('2017-06-11', periods=n_days, freq='D')
possible_indices = list(range(n_days))

user_dates = []
user_vals = []
for _ in range(n_users):
    # randint's `high` is exclusive, so each user logs on 1..n_days-1 days
    # (never on every day of the window).
    num_indices = randint(low=1, high=n_days)
    # Sample distinct day positions, then sort them so the dates are chronological.
    indices = choice(a=possible_indices, size=num_indices, replace=False)
    use_dates = dates[sorted(indices)]
    # One value per selected date, drawn independently of which dates were picked.
    use_vals = poisson(lam=10, size=num_indices)
    user_dates.append(use_dates)
    user_vals.append(use_vals)

# One row per user; each cell holds that user's whole DatetimeIndex / value array.
# NOTE: this rebinds `df`, replacing the frame loaded from the CSV earlier.
df = pd.DataFrame({'dates': user_dates,
                   'vals': user_vals})

In [8]:
# Show the whole frame (one row per simulated user); each cell holds a full DatetimeIndex / value array.
df

Unnamed: 0,dates,vals
0,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 16, 12, 11, 10, 11, 12, 14]"
1,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[13, 6, 7, 9, 12, 10, 8]"
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[10, 10, 7, 13, 11, 12, 8, 10, 9]"
3,"DatetimeIndex(['2017-06-13', '2017-06-20'], dt...","[7, 5]"
4,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[10]
5,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[7, 15, 9, 9, 13, 10, 11]"
6,"DatetimeIndex(['2017-06-13', '2017-06-14', '20...","[5, 8, 3]"
7,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[15, 7, 13, 8, 8, 8, 9]"
8,"DatetimeIndex(['2017-06-11', '2017-06-16', '20...","[3, 11, 6]"
9,"DatetimeIndex(['2017-06-12', '2017-06-19'], dt...","[16, 8]"


In [9]:
# Look at the full set of logged dates for the user in row 2
# (single .loc lookup by row label and column instead of chained indexing).
df.loc[2, 'dates']

DatetimeIndex(['2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20'],
              dtype='datetime64[ns]', freq=None)

# Think about how you might characterize these individuals

In [10]:
# high-use / low-use individuals: count how many values each user logged
df['usage_val'] = df['vals'].apply(len)

In [11]:
# Check the newly added usage_val column.
df.head()

Unnamed: 0,dates,vals,usage_val
0,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 16, 12, 11, 10, 11, 12, 14]",8
1,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[13, 6, 7, 9, 12, 10, 8]",7
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[10, 10, 7, 13, 11, 12, 8, 10, 9]",9
3,"DatetimeIndex(['2017-06-13', '2017-06-20'], dt...","[7, 5]",2
4,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[10],1


In [12]:
# how long was the 'user lifetime'? (span between a user's first and last logged date)
def _lifetime(stamps):
    """Return the latest minus the earliest timestamp among a user's logged dates."""
    return max(stamps) - min(stamps)

df['user_lifetime'] = df['dates'].apply(_lifetime)

In [13]:
# Check the newly added user_lifetime column.
df.head()

Unnamed: 0,dates,vals,usage_val,user_lifetime
0,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 16, 12, 11, 10, 11, 12, 14]",8,9 days
1,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[13, 6, 7, 9, 12, 10, 8]",7,7 days
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[10, 10, 7, 13, 11, 12, 8, 10, 9]",9,8 days
3,"DatetimeIndex(['2017-06-13', '2017-06-20'], dt...","[7, 5]",2,7 days
4,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[10],1,0 days


In [14]:
# what was the range of values a user input? (largest minus smallest logged value)
# np.ptp ("peak to peak") computes max - min of each user's array.
df['range_values'] = df['vals'].apply(lambda logged: np.ptp(logged))

In [15]:
# Check the newly added range_values column.
df.head()

Unnamed: 0,dates,vals,usage_val,user_lifetime,range_values
0,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 16, 12, 11, 10, 11, 12, 14]",8,9 days,6
1,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[13, 6, 7, 9, 12, 10, 8]",7,7 days,7
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[10, 10, 7, 13, 11, 12, 8, 10, 9]",9,8 days,6
3,"DatetimeIndex(['2017-06-13', '2017-06-20'], dt...","[7, 5]",2,7 days,2
4,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[10],1,0 days,0


In [16]:
# identify users who provided input on a certain day
# (maybe it's revealing that they did log something on Father's Day)
# Compare full calendar dates rather than only the day-of-month: matching on
# `.day` alone would also flag the 18th of any other month or year.
# The simulated window is June 2017, so the target date uses 2017.
fathers_day = pd.Timestamp('2017-06-18')
# normalize() drops any time-of-day so the comparison is by date; bool() keeps
# the per-user result a plain Python bool.
df.dates.apply(lambda x: bool((x.normalize() == fathers_day).any()))

0     True
1     True
2     True
3    False
4     True
5     True
6     True
7     True
8    False
9    False
Name: dates, dtype: bool