In [89]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.dates import DateFormatter
from IPython.display import display
from datetime import datetime
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# Load the PANAS data from the csv file

dateparse = lambda x: datetime.strptime(x, "%m/%d/%Y %H:%M:%S")

#load dataframe from csv
df = pd.read_csv("../data/panas_data_new.csv",parse_dates=['Timestamp'], date_parser=dateparse)

variable_dict = {
'Timestamp':'TIMESTAMP',
'Pair':'PAIR',
'Identifier': 'ID',
'Time of Day': 'WHEN_RECORDED',
'How many hours did you sleep last night? (beginning of day only)':	'SLEEP',
'Upset':'UPSET',
'Hostile':'HOSTILE',
'Alert':'ALERT',
'Ashamed':'ASHAMED',
'Inspired': 'INSPIRED',
'Nervous':'NERVOUS',
'Determined':'DETERMINED',
'Attentive':'ATTENTIVE',
'Afraid':'AFRAID',
'Active':'ACTIVE',
'Any comments / thoughts?':'COMMENTS',
'Positive':'POSITIVE_SCORE',
'Negative':'NEGATIVE_SCORE',
'Difference':'SCORE_DIFFERENCE'}

df_daily_emotion = df.rename(columns=variable_dict)
df_daily_emotion.dropna(how='all', axis=1, inplace=True)

# Before getting only the paired data, identify the total number of records
total_num_daily_emotion = len(df_daily_emotion)
df_daily_emotion = df_daily_emotion.loc[(~df_daily_emotion['PAIR'].isnull())]
df_daily_emotion = df_daily_emotion.sort_values(['PAIR','TIMESTAMP'])



# Descriptive Statistics

## Daily Emotion

In [90]:
### Number of Daily emotion records


In [91]:


#print("total daily emotion records: ", unique_pairs)

# want to ignore blank entries for the pairing so get them first

pairs_df = df_daily_emotion.groupby(['PAIR']).agg(num_pairs=pd.NamedAgg(column='PAIR', aggfunc='count'))
unique_pairs = len(pairs_df.loc[(pairs_df.num_pairs == 2)])

print('total daily emotion records', len(df_daily_emotion))
print('total paired start day/end of day daily emotion records',unique_pairs)

#print('total after break daily emotion records',len(df_clean.loc[(df_clean.WHEN_RECORDED == 'End of Day')]))


total daily emotion records 36
total paired start day/end of day daily emotion records 18


### Daily emotion scores

In [92]:

# count number of times day started positive
#df.loc[(df['col1'] == value) & (df['col2'] < value)]
pos_start_day = len(df_daily_emotion.loc[(df_daily_emotion.SCORE_DIFFERENCE > 0) & (df_daily_emotion.WHEN_RECORDED == 'Beginning of Day')])
neg_start_day = unique_pairs - pos_start_day
print("total positive morning: ", pos_start_day)
print("total negative morning: ", neg_start_day)

pos_end_day = len(df_daily_emotion.loc[(df_daily_emotion.SCORE_DIFFERENCE > 0) & (df_daily_emotion.WHEN_RECORDED == 'End of Day')])
neg_end_day = unique_pairs - pos_end_day
print("total positive night: ", pos_end_day)
print("total negative night: ", neg_end_day)

#print(df_clean.sort_values(['PAIR','TIMESTAMP']))
#print(df_clean[['TIMESTAMP','SCORE_DIFFERENCE']].diff())




total positive morning:  14
total negative morning:  4
total positive night:  12
total negative night:  6


### Differences in consecutive PANAS ratings

In [93]:
#now look at differences between start and end of day
df_daily_emotion['DELTA_PREV_DIFFERENCE'] = df_daily_emotion[['TIMESTAMP','SCORE_DIFFERENCE']].diff()['SCORE_DIFFERENCE']
#print(df_clean[['PAIR','WHEN_RECORDED','SCORE_DIFFERENCE','DELTA_PREV_DIFFERENCE']])
print('#Participants whose end of day total emotion had reduced since start of day: ', len(df_daily_emotion.loc[(df_daily_emotion.WHEN_RECORDED =='End of Day') & (df_daily_emotion.DELTA_PREV_DIFFERENCE <0)]))

# and differences from start of day and end of previous night
print('#Participants whose start of day total emotion had increased since end of previous day: ', len(df_daily_emotion.loc[(df_daily_emotion.WHEN_RECORDED =='Beginning of Day') & (df_daily_emotion.DELTA_PREV_DIFFERENCE >0)]))



#Participants whose end of day total emotion had reduced since start of day:  14
#Participants whose start of day total emotion had increased since end of previous day:  12


### Sleep distribution

In [94]:

print("Sleep distribution: avg {}, std {}".format(str(df_daily_emotion.SLEEP.mean()), str(df_daily_emotion.SLEEP.std())))


Sleep distribution: avg 6.870555555555556, std 0.879414726643754


In [95]:
# save the cleaned up data along with the emotion differences
df_daily_emotion.to_csv('../data/clean_daily_emotion.csv', index=False)

## Experience Sample Data

In [96]:
# now analyse the experience sample data
df_exp = pd.read_csv("../data/experience_sample_data_new.csv",parse_dates=['Timestamp'], date_parser=dateparse)

exp_variable_dict = {'Timestamp':'TIMESTAMP','Personal ID':'ID','Before-After break pairings':'PAIR','Before / After a break':'RELATION_BEAK','Duration of break activity (MINUTES)':'BREAK_DURATION', 'Current Perceived Stress Level':"PERCEIVED_STRESS",'Current Actual Stress Level':'GARMIN_STRESS','Activity Prior to Break Activity':'ACTIVITY_PRIOR_BREAK','Details of Break Activity':'BREAK_DETAILS','Was the break physically active?':'BREAK_PHYSICAL','Was the break indoors or outdoors?':'BREAK_LOCATION','Was the break alone or social?':'BREAK_SOCIAL','If it involved an electronic device, provide details':'BREAK_ELECTRONIC_DEVICE','Do you think the break activity helped?':'BREAK_HELPED','Date and Time of activity':'ACTUAL_ACTIVITY_TIME','Arousal/Valence [Aroused 2]':'AROUSAL_2','Arousal/Valence [Aroused 1]':'AROUSAL_1','Arousal/Valence [Aroused 0]':'AROUSAL_0','Arousal/Valence [Aroused -1]':'AROUSAL_-1','Arousal/Valence [Aroused -2]':'AROUSAL_-2','Adjectives / feelings':'FEELINGS','Why are you taking a break or why did you take a break?':'BREAK_REASON'}

# update columns names and remove the empty columns
df_clean_exp = df_exp.rename(columns=exp_variable_dict)
df_clean_exp.dropna(how='all', axis=1, inplace=True)
df_clean_exp['BREAK_ELECTRONIC_DEVICE'] = df_clean_exp['BREAK_ELECTRONIC_DEVICE'].replace(np.nan, '-')
df_clean_exp['BREAK_PHYSICAL'] = df_clean_exp['BREAK_PHYSICAL'].replace(np.nan, '-')
df_clean_exp['BREAK_LOCATION'] = df_clean_exp['BREAK_LOCATION'].replace(np.nan, '-')
df_clean_exp['BREAK_SOCIAL'] = df_clean_exp['BREAK_SOCIAL'].replace(np.nan, '-')



### Unique before/after reports

In [97]:
pairs_exp_df = df_clean_exp.groupby(['PAIR']).agg(num_pairs=pd.NamedAgg(column='PAIR', aggfunc='count'))
unique_exp_pairs = len(pairs_exp_df.loc[(pairs_exp_df.num_pairs == 2)])

print("total number of records: ", len(df_clean_exp))
print("unique pairs of before/after records: ", unique_exp_pairs)
print("total after break records:", len(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')]))


total number of records:  110
unique pairs of before/after records:  45
total after break records: 54


### Break details

In [98]:
# avg break duration
print("Break duration distribution: avg {}, std {}".format(str(df_clean_exp.BREAK_DURATION.mean()), str(df_clean_exp.BREAK_DURATION.std())))

Break duration distribution: avg 43.15942028985507, std 30.100626917648487


In [99]:

print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['BREAK_PHYSICAL'])['BREAK_PHYSICAL'].count())

BREAK_PHYSICAL
-           8
No         29
Yes        16
Yes, No     1
Name: BREAK_PHYSICAL, dtype: int64


In [100]:

print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['BREAK_SOCIAL'])['BREAK_SOCIAL'].count())

BREAK_SOCIAL
-                      6
Alone                 36
Alone, With others     5
With others            7
Name: BREAK_SOCIAL, dtype: int64


In [101]:

print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['BREAK_ELECTRONIC_DEVICE'])['BREAK_ELECTRONIC_DEVICE'].count())

BREAK_ELECTRONIC_DEVICE
-           17
Phone       26
Tablet       2
computer     9
Name: BREAK_ELECTRONIC_DEVICE, dtype: int64


In [102]:
print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['BREAK_LOCATION'])['BREAK_LOCATION'].count())

BREAK_LOCATION
-                                 6
Indoors                          29
Mixture of indoor and outdoor     7
Outdoors                         12
Name: BREAK_LOCATION, dtype: int64


In [103]:
df_clean_exp['BREAK_ELECTRONIC_DEVICE'] = df_clean_exp['BREAK_ELECTRONIC_DEVICE'].replace(np.nan, '-')
df_clean_exp['BREAK_PHYSICAL'] = df_clean_exp['BREAK_PHYSICAL'].replace(np.nan, '-')
df_clean_exp['BREAK_LOCATION'] = df_clean_exp['BREAK_LOCATION'].replace(np.nan, '-')
df_clean_exp['BREAK_SOCIAL'] = df_clean_exp['BREAK_SOCIAL'].replace(np.nan, '-')

#print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['BREAK_PHYSICAL','BREAK_LOCATION','BREAK_SOCIAL','BREAK_ELECTRONIC_DEVICE'], dropna=False)['BREAK_ELECTRONIC_DEVICE'].count())

table = pd.pivot_table(data=df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')],index=['BREAK_PHYSICAL','BREAK_LOCATION','BREAK_SOCIAL','BREAK_ELECTRONIC_DEVICE'], values=['ID'],aggfunc='count', dropna=True)

table['% Breaks'] = (table['ID']/table['ID'].sum())*100
print(table)

                                                                                         ID  \
BREAK_PHYSICAL BREAK_LOCATION                BREAK_SOCIAL       BREAK_ELECTRONIC_DEVICE       
-              -                             -                  -                         6   
               Indoors                       Alone              -                         1   
                                                                Phone                     1   
No             Indoors                       Alone              Phone                    11   
                                                                computer                  5   
                                             Alone, With others Phone                     3   
                                             With others        Phone                     1   
                                                                computer                  1   
               Mixture of indoor and outdoor Alone

### Reported and Perceived Stress Levels

In [104]:

print(df_clean_exp.groupby(['PERCEIVED_STRESS','GARMIN_STRESS'])['PERCEIVED_STRESS'].count())


PERCEIVED_STRESS  GARMIN_STRESS   
High              High (76 - 100)      1
                  Medium (51 - 75)     1
                  Rest (0 - 25)        2
Low               High (76 - 100)      9
                  Low (26 - 50)       26
                  Medium (51 - 75)     8
                  Rest (0 - 25)       17
Medium            High (76 - 100)      5
                  Low (26 - 50)       15
                  Medium (51 - 75)     7
                  Rest (0 - 25)        7
Rest              Low (26 - 50)        1
                  Medium (51 - 75)     1
                  Rest (0 - 25)       10
Name: PERCEIVED_STRESS, dtype: int64


### Number breaks reports per day

In [105]:
# number breaks reported per day
print(df_clean_exp.groupby(pd.Grouper(key="TIMESTAMP", freq="D"))['TIMESTAMP'].count())

TIMESTAMP
2022-05-13     4
2022-05-14     2
2022-05-15     4
2022-05-16    20
2022-05-17    15
2022-05-18    18
2022-05-19    11
2022-05-20     9
2022-05-21     4
2022-05-22     0
2022-05-23    19
2022-05-24     4
Freq: D, Name: TIMESTAMP, dtype: int64


### Number after breaks reports per person

In [106]:


print(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].groupby(['ID'])['ID'].count())




ID
Cameron     9
Jason       9
Katie      19
VJ          8
nadia       8
nadia       1
Name: ID, dtype: int64


In [108]:
# Sentiment analysis
with open('../data/feelings_training.csv', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="csv")

def get_sentiment(x):
    if type(x) is str:
        #print('value: ', x)
        #print('sentiment: ', TextBlob(x).sentiment.subjectivity)
        return TextBlob(x).sentiment.polarity

def get_custom_sentiment(x):
    if type(x) is str:
        result = cl.classify(x)
        if result=='pos':
            return 1
        elif result == 'neg':
            return -1
        return 0

df_sentiment = df_clean_exp['BREAK_HELPED'].apply(get_sentiment)

df_clean_exp['SENTIMENT_BREAK_HELPED'] = df_clean_exp['BREAK_HELPED'].apply(get_sentiment)

# now calc the average sentiment, and the std deviation, ignore NaN
print("Sentiment of break helped distribution: avg {}, std {}".format(str(df_clean_exp.SENTIMENT_BREAK_HELPED.mean()), str(df_clean_exp.SENTIMENT_BREAK_HELPED.std())))


df_clean_exp['SENTIMENT_FEELINGS'] = df_clean_exp['FEELINGS'].apply(get_custom_sentiment)

# now calc the average sentiment, and the std deviation, ignore NaN
print("Sentiment distribution before break: avg {}, std {}".format(str(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='Before break')].SENTIMENT_FEELINGS.mean()), str(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='Before break')].SENTIMENT_FEELINGS.std())))
print("Sentiment distribution after break: avg {}, std {}".format(str(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].SENTIMENT_FEELINGS.mean()), str(df_clean_exp.loc[(df_clean_exp.RELATION_BEAK =='After break')].SENTIMENT_FEELINGS.std())))

df_clean_exp.to_csv('../data/clean_experience_sample.csv', index=False)


Sentiment of break helped distribution: avg 0.1509803921568627, std 0.28480449951006126
Sentiment distribution before break: avg -0.09090909090909091, std 1.005037815259212
Sentiment distribution after break: avg 0.5925925925925926, std 0.8130659078513455
