In [2]:
import pandas as pd
import numpy as np
import time
import json
import pprint
import os
import pickle
import datetime
from collections import defaultdict
from monkeylearn import MonkeyLearn
from collections import Counter
from operator import itemgetter

In [2]:
# app_usage2 = pd.DataFrame()
# app_usage2 = pd.read_csv("dataset/dataset/app_usage/running_app_u04.csv")
# call_logs = pd.DataFrame()
# call_logs = pd.read_csv("dataset/dataset/call_log/call_log_u03.csv")
# call_logs['timestamp'] = call_logs['timestamp'].map(lambda timestamp: time.ctime(timestamp))
# app_usage2['timestamp'] = app_usage2['timestamp'].map(lambda timestamp: time.ctime(timestamp))
# print(app_usage2.RUNNING_TASKS_topActivity_mPackage.unique())


In [7]:
'''
Initializing the final dataframe with user IDs
'''

n = 60  # total number of users
start_date = datetime.datetime.strptime('Sun Mar 24 00:00:00 2013', "%a %b %d %H:%M:%S %Y")
end_date = start_date + datetime.timedelta(days=7)

final_df = pd.DataFrame()
final_df['User'] = ['u'+str(uid).zfill(len(str(n))) for uid in range(n)]  # zfill adds preceding zeros, e.g. u01 instead of u1


'''
Get the restaurants and cafes that the users visit and calculate frequency of each dining place
'''

visits = {}

for uid in range(n):
    dining = []
    filename = '../dataset/dinning/u'+str(uid).zfill(len(str(n)))+'.txt'
    
    # check if file exists in the directory
    if os.path.isfile(filename):
        file = open(filename)
        
        for line in file:
            row = line.split(',')
            date = datetime.datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S")
            # get first week's data
            if start_date < date < end_date:
                dining.append(row[1])
        
        visits['u'+str(uid).zfill(len(str(n)))] = dict(Counter(dining))
        
        # set 0 as the default number of visits at a particular cafe
        for cafe in dict(Counter(dining)).keys():
            final_df[cafe] = 0

for uid, dining in visits.items():
    for cafe, frequency in dining.items():
        # store the total number of visits at each restaurant by a particular user
        final_df.loc[final_df['User']==uid,cafe] = int(frequency)


'''
Get user activity details from piazza
'''

piazza_activity = pd.DataFrame()
piazza_activity = pd.read_csv('../dataset/education/piazza.csv')

final_df['Days online'] = piazza_activity['days online']
final_df['Views'] = piazza_activity['views']
final_df['Contributions'] = piazza_activity['contributions']
final_df['Questions'] = piazza_activity['questions']
final_df['Notes'] = piazza_activity['notes']
final_df['Answers'] = piazza_activity['answers']


'''
Get all the comments posted by the user and get their total count
'''

# ml = MonkeyLearn('6518261174d614bb7fd37b0fa83f93926a8648d1')
# module_id = 'cl_Jx8qzYJh'

user_comments = {}  # all the comments written by each user
total_comments = {}  # total number of comments written by each user

for uid in range(n):
    filename = '../dataset/EMA/response/Comment/Comment_u'+str(uid).zfill(len(str(n)))+'.json'
    
    if os.path.isfile(filename):
        with open(filename) as file:
            comments_dict = json.load(file)
            
            count = 0
            comments = []
            for item in comments_dict:
                if 'comment' in item:
                    date = datetime.datetime.strptime(time.ctime(item['resp_time']), "%a %b %d %H:%M:%S %Y")
                    if start_date < date < end_date:
                        comments.append(item['comment'])
                        count += 1
            
            total_comments['u'+str(uid).zfill(len(str(n)))] = count   
            
            comments_merged = ''
            for comment in comments:
                comments_merged += comment + ' '
            
            user_comments['u'+str(uid).zfill(len(str(n)))] = comments_merged


'''
Determine the sentiment of each user using their comments
'''

with open('sentiments.pkl', 'rb') as file:
    sentiments = pickle.load(file)

for uid, count in total_comments.items():
    final_df.loc[final_df['User']==uid, 'Total comments'] = int(count)
    final_df.loc[final_df['User']==uid, 'Comments positivity'] = sentiments[uid]


'''
Get the average stress level of each user in the interval of 7 days
'''

for uid in range(n):
    
    stress = []
    uid = 'u'+str(uid).zfill(len(str(n)))
    filename = '../dataset/EMA/response/Stress/Stress_'+uid+'.json'
    
    # check if file exists in the directory
    if os.path.isfile(filename):
        with open(filename) as file:
            stress_dict = json.load(file)
            
            count = 0
            sum_stress_level = 0
            for item in stress_dict:
                if 'level' in item:
                    sum_stress_level += int(item['level'])
                    count += 1
            
            if count == 0:
                avg_stress_level = 'No level found'
            else:
                avg_stress_level = float("{0:.2f}".format(sum_stress_level/count))
            
            final_df.loc[final_df['User']==uid, 'Stress level'] = avg_stress_level
            
    else:
        final_df.loc[final_df['User']==uid, 'Stress level'] = 'No input'

final_df


Unnamed: 0,User,King Arthur Flour Coffee Bar,53 Commons,Collis Cafe,Courtyard Cafe,Collis Market,Novack Cafe,Days online,Views,Contributions,Questions,Notes,Answers,Total comments,Comments positivity,Stress level
0,u00,0,0,0,0,0,0,49.0,162.0,144.0,0.0,67.0,22.0,8.0,0.023,2.3
1,u01,2,4,3,4,0,0,29.0,299.0,5.0,1.0,1.0,0.0,2.0,0.023,2.25
2,u02,4,3,7,3,0,0,57.0,299.0,0.0,0.0,0.0,0.0,1.0,0.023,2.07
3,u03,0,0,0,0,0,0,47.0,262.0,30.0,15.0,2.0,6.0,2.0,0.023,2.93
4,u04,5,4,17,3,0,0,27.0,101.0,1.0,1.0,0.0,0.0,7.0,0.023,1.86
5,u05,0,6,10,2,2,1,67.0,301.0,1.0,0.0,0.0,0.0,2.0,0.023,3.38
6,u06,0,0,0,0,0,0,43.0,201.0,12.0,1.0,0.0,0.0,,,No input
7,u07,0,5,0,9,0,0,67.0,308.0,45.0,22.0,0.0,5.0,7.0,0.025,3.25
8,u08,5,2,7,7,0,2,36.0,171.0,20.0,4.0,3.0,4.0,10.0,0.023,1.8
9,u09,2,4,4,1,0,4,82.0,300.0,26.0,11.0,0.0,3.0,0.0,0.023,3


In [38]:
for uid in range(n):
    filename = 'dataset/dinning/u'+str(uid).zfill(len(str(n)))+'.txt'
    if not os.path.isfile(filename):
        print(uid)

0
3
6
11
13
17
21
23
26
28
29
31
34
35
37
38
39
40
41
44
45
48
50
51
52
53
55
56
58


In [71]:
# for k, v in user_comments.items():
#     res = ml.classifiers.classify(module_id, [v], sandbox=False)
#     if res.result[0][0]['label']=='Positive' or res.result[0][0]['label']=='Neutral':
#         print("Positive")
#         comment_sentiments[k] = res.result[0][0]['probability']
#     if res.result[0][0]['label']=='Negative':
#         print("Negative")
#         comment_sentiments[k] = float(1 - float(res.result[0][0]['probability']))
# comment_sentiments
    

In [74]:
'''
list of unique apps across all users
'''

app_list = []

for uid in range(n):
    filename = 'dataset/app_usage/running_app_u'+str(uid).zfill(2)+'.csv'
    # check if file exists in the directory
    if os.path.isfile(filename):
        df = pd.read_csv(filename)
        app_list += list(df['RUNNING_TASKS_topActivity_mPackage'].unique())

# set(app_list)

In [None]:
app_usage = pd.DataFrame()
app_usage = pd.read_csv("dataset/dataset/app_usage/running_app_u02.csv")

call_logs = pd.DataFrame()
call_logs = pd.read_csv("dataset/dataset/call_log/call_log_u03.csv")
call_logs['timestamp'] = call_logs['timestamp'].map(lambda timestamp: time.ctime(timestamp))
app_usage['timestamp'] = app_usage['timestamp'].map(lambda timestamp: time.ctime(timestamp))
print(list(app_usage.RUNNING_TASKS_topActivity_mPackage.unique()))
json_stress = []
with open('dataset/dataset/EMA/response/Stress/Stress_u03.json') as f:
    json_stress = json.load(f)
#print(json_stress)
for i in json_stress:
    if 'level' not in i:
        #print(i)
        json_stress.remove(i)
pprint.pprint(json_stress)