In [1]:
import pandas as pd
import numpy as np
import itertools

## Join data and demographics into one csv

In [2]:
df_main = pd.read_csv("cleaned_hm.csv")
df_demographics = pd.read_csv("demographic.csv")

In [3]:
# Join demographics and cleaned_hm into one dataframe
# and drop na (e.g. rows without ground_truth_category)
df = df_main.join(df_demographics.set_index('wid'), on="wid")
# df = df.dropna(axis='index')
df.to_csv("happy_moments_complete_unprocessed.csv")

## Explore, filter, and bin data

### Age

In [4]:
# Convert ages to integers and drop rows without ages
df.age = pd.to_numeric(df.age, errors='coerce')
df = df.dropna(subset=['age'])

# Filter out the under 18 y.o. and over 100 y.o.
df = df[(df.age >= 18) & (df.age < 100)]

# Create age bins
df['age_bins'] = pd.cut(x=df.age, bins=[0, 21, 25, 30, 35, 40, 50, 100])
df.age_bins.value_counts()

(25, 30]     29527
(30, 35]     19510
(21, 25]     18617
(35, 40]     10424
(40, 50]      9582
(50, 100]     7045
(0, 21]       5521
Name: age_bins, dtype: int64

### Gender

In [5]:
df.gender.value_counts(dropna=False)
# There are 51 values that have null gender values -- drop those
df = df.dropna(subset=['gender'])
df.gender.value_counts(dropna=False)

m    57526
f    41955
o      694
Name: gender, dtype: int64

### Marital Status

In [6]:
df.marital.value_counts(dropna=False)
# There are 97 values that have null marital status values -- drop those
df = df.dropna(subset=['marital'])

# Bin into Single/Married/Other
df = df.replace({'divorced': 'other', 'separated': 'other', 'widowed': 'other'})
df.marital.value_counts(dropna=False)

single     53932
married    41232
other       4914
Name: marital, dtype: int64

### Parenthood

In [7]:
df.parenthood.value_counts(dropna=False)
# There are 32 values that have null parenthood values -- drop those
df = df.dropna(subset=['parenthood'])
df.parenthood.value_counts(dropna=False)

n    60710
y    39336
Name: parenthood, dtype: int64

### Reflection period

In [8]:
df.reflection_period.value_counts(dropna=False)

3m     50448
24h    49598
Name: reflection_period, dtype: int64

### Drop unnecessary columns

In [9]:
# Unnecessary columns: original_hm, modified, num_sentence, ground_truth_category, country
df = df.drop(['original_hm', 'modified', 'num_sentence', 'ground_truth_category', 'country', 'wid', 'reflection_period'], axis=1)
df.head()

Unnamed: 0,hmid,cleaned_hm,predicted_category,age,gender,marital,parenthood,age_bins
0,27673,I went on a successful date with someone I fel...,affection,35.0,m,single,n,"(30, 35]"
1,27674,I was happy when my son got 90% marks in his e...,affection,29.0,m,married,y,"(25, 30]"
2,27675,I went to the gym this morning and did yoga.,exercise,30.0,f,married,y,"(25, 30]"
3,27676,We had a serious talk with some friends of our...,bonding,28.0,f,married,n,"(25, 30]"
4,27677,I went with grandchildren to butterfly display...,affection,55.0,f,other,y,"(50, 100]"


In [10]:
df.to_csv("happy_moments_processed_full.csv")

In [12]:
sample = df.sample(20000)
sample.head()
sample.to_csv("happy_moments_small_sample.csv")

## Finding statistics for permutations of categories and filters

In [18]:
# Get permutations for categories
categories = ['achievement', 'affection', 'bonding', 'enjoy_the_moment', 
                'exercise', 'leisure', 'nature']

# Filters
filters = {'gender': {'male', 'female'}, 
           'marital': {'married', 'single', 'other'}, 
           'parenthood': {'y', 'n'}, 
           'reflection_period': {'3m', '24h'}, 
           'age_bins': {'(25, 30]',
                        '(30, 35]',
                        '(21, 25]',
                        '(35, 40]',
                        '(40, 50]',
                        '(50, 100]',
                        '(0, 21]'}}

filters_list = ['male', 'female', 'married', 'single', 'other', 'y', 'n', '3m', '24h',
                '(25, 30]', '(30, 35]', '(21, 25]', '(35, 40]', '(40, 50]', '(50, 100]',
                '(0, 21]']

In [17]:
combs = []

for i in range(1, len(filters_list)+1):
    els = [list(x) for x in itertools.combinations(filters_list, i)]
    combs.extend(els)
    
print(combs)

[['male'], ['female'], ['married'], ['single'], ['other'], ['y'], ['n'], ['3m'], ['24h'], ['(25, 30]'], ['(30, 35]'], ['(21, 25]'], ['(35, 40]'], ['(40, 50]'], ['(50, 100]'], ['(0, 21]'], ['male', 'female'], ['male', 'married'], ['male', 'single'], ['male', 'other'], ['male', 'y'], ['male', 'n'], ['male', '3m'], ['male', '24h'], ['male', '(25, 30]'], ['male', '(30, 35]'], ['male', '(21, 25]'], ['male', '(35, 40]'], ['male', '(40, 50]'], ['male', '(50, 100]'], ['male', '(0, 21]'], ['female', 'married'], ['female', 'single'], ['female', 'other'], ['female', 'y'], ['female', 'n'], ['female', '3m'], ['female', '24h'], ['female', '(25, 30]'], ['female', '(30, 35]'], ['female', '(21, 25]'], ['female', '(35, 40]'], ['female', '(40, 50]'], ['female', '(50, 100]'], ['female', '(0, 21]'], ['married', 'single'], ['married', 'other'], ['married', 'y'], ['married', 'n'], ['married', '3m'], ['married', '24h'], ['married', '(25, 30]'], ['married', '(30, 35]'], ['married', '(21, 25]'], ['married', '(3

In [None]:
filtered_combs = []
for c in filtered_combs:
    for el in c:
        

In [14]:
len(combs)

65535