<a href="https://colab.research.google.com/github/chechelan/0-chechelan/blob/main/RayMinder_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
This notebook covers the data collection for my graduation project, RayMinder. It has three parts: a) a user sunscreen-usage behavior dataset, generated by an exploratory study in which 20 participants' reactions to manually sent sunscreen-reapplication reminders were recorded over 7 days; b) hourly UV index and other weather data for the participants' locations during the test days, extracted from the Visual Crossing weather API; c) a merge of the two datasets to produce a dataset for machine learning.


In [None]:
# import packages for data extracting via API
import json
import os

import pandas as pd
import requests

In [None]:
# set locations and date range
locations = ["beijing", "shanghai", "guangzhou","amsterdam","amersfoort","berlin","tilburg","utrecht"]
date_range = "2023-07-05/2023-07-11"

# Read the Visual Crossing API key from the environment instead of committing it
# to the notebook: anything stored in a shared notebook is visible to every reader.
# Set it before running, e.g. `%env VISUALCROSSING_API_KEY=<your key>` in a cell
# that is not saved with the notebook.
# NOTE(review): a key that was previously committed to a public repository should
# be treated as leaked and rotated.
api_key = os.environ.get("VISUALCROSSING_API_KEY", "")
if not api_key:
    print("VISUALCROSSING_API_KEY is not set; weather API requests are expected to fail until it is provided.")

In [None]:
# extract data from API (hourly data) and save it in a dataframe
# Without a timeout, requests.get can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

all_data = []
for location in locations:
    url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{location}/{date_range}?unitGroup=metric&include=hours&key={api_key}&contentType=json"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Treat a network-level failure like a bad status code: report and skip.
        # Only the exception type is printed because the exception text can
        # include the request URL, which contains the API key.
        print(f'Request failed for {location}:', type(exc).__name__)
        continue  # Skip to next iteration

    if response.status_code != 200:
        print(f'Unexpected status code for {location}:', response.status_code)
        continue  # Skip to next iteration

    # Parse the results as JSON
    jsonData = response.json()

    # The 'address' field of the response is used as the city label for every row
    city = jsonData['address']

    # Flatten the nested days -> hours structure into one record per hour
    for day in jsonData['days']:
        date = day['datetime']  # the day-level 'datetime' is stored as the row's date
        for hour in day['hours']:
            all_data.append({
                'city': city,
                'date': date,
                'datetime': hour['datetime'],  # the hour-level 'datetime' is stored as the row's time
                'uvindex': hour['uvindex'],
                'temp': hour['temp'],
                'conditions': hour['conditions'],
                'icon': hour['icon'],
                'cloudcover': hour['cloudcover']
            })

# Convert the list of dictionaries into a DataFrame
df1 = pd.DataFrame(all_data)

# Fail here with a clear message rather than later with a KeyError in the merge:
# a DataFrame built from an empty list has no 'city'/'date'/'datetime' columns.
if df1.empty:
    raise RuntimeError(
        "No weather data was downloaded for any location; "
        "check the API key, the network connection and the messages printed above."
    )


In [None]:
# Print the hourly weather DataFrame built from the API responses
# (pandas abbreviates a long frame to its first and last rows)
print(df1)

         city        date  datetime  uvindex  temp        conditions  \
0     beijing  2023-07-05  00:00:00      0.0  22.0             Clear   
1     beijing  2023-07-05  01:00:00      0.0  22.0             Clear   
2     beijing  2023-07-05  02:00:00      0.0  22.1             Clear   
3     beijing  2023-07-05  03:00:00      0.0  20.0             Clear   
4     beijing  2023-07-05  04:00:00      0.0  21.0             Clear   
...       ...         ...       ...      ...   ...               ...   
1339  utrecht  2023-07-11  19:00:00      4.0  25.1          Overcast   
1340  utrecht  2023-07-11  20:00:00      1.0  24.5          Overcast   
1341  utrecht  2023-07-11  21:00:00      0.0  22.4  Partially cloudy   
1342  utrecht  2023-07-11  22:00:00      0.0  21.4          Overcast   
1343  utrecht  2023-07-11  23:00:00      0.0  18.5    Rain, Overcast   

                   icon  cloudcover  
0           clear-night         0.0  
1           clear-night         0.0  
2           clear-nig

In [None]:
# Mount Google Drive at /content/gdrive (Google Colab only) so that files
# stored in Drive can be read from this notebook
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the collected sunscreen-usage survey data from the mounted Google Drive.
# The absolute path under the mount point ('/content/gdrive') is used so the read
# does not depend on the notebook's current working directory.
df2 = pd.read_csv('/content/gdrive/My Drive/collected.csv')

In [None]:
# Preview the first rows of the survey data as loaded from the CSV
df2.head()

Unnamed: 0,userid,city,date,datetime,age,gender,skintype,skinconcerns,makeup,where,acitivity,SPF,forwhom,reminder_order,reminder,amount
0,7,amsterdam,2023/7/5,7:00:00,26,female,3,no,yes,indoors,breakfast,50,myself,1,yes,c
1,7,amsterdam,2023/7/5,9:00:00,26,female,3,no,yes,indoors,meeting,50,myself,2,no,e
2,7,amsterdam,2023/7/5,11:00:00,26,female,3,no,yes,indoors,,50,myself,3,no,e
3,7,amsterdam,2023/7/5,13:00:00,26,female,3,no,yes,indoors,,50,myself,4,no,e
4,7,amsterdam,2023/7/5,15:00:00,26,female,3,no,yes,indoors,,50,myself,5,no,e


In [None]:
# The survey stores dates like '2023/7/5', while the weather data uses 'YYYY-MM-DD'.
# Rewrite the survey dates in the weather format so the column can be used as a merge key.
from datetime import datetime
survey_dates = pd.to_datetime(df2['date'])
df2['date'] = survey_dates.dt.strftime('%Y-%m-%d')
df2.head()

Unnamed: 0,userid,city,date,datetime,age,gender,skintype,skinconcerns,makeup,where,acitivity,SPF,forwhom,reminder_order,reminder,amount
0,7,amsterdam,2023-07-05,7:00:00,26,female,3,no,yes,indoors,breakfast,50,myself,1,yes,c
1,7,amsterdam,2023-07-05,9:00:00,26,female,3,no,yes,indoors,meeting,50,myself,2,no,e
2,7,amsterdam,2023-07-05,11:00:00,26,female,3,no,yes,indoors,,50,myself,3,no,e
3,7,amsterdam,2023-07-05,13:00:00,26,female,3,no,yes,indoors,,50,myself,4,no,e
4,7,amsterdam,2023-07-05,15:00:00,26,female,3,no,yes,indoors,,50,myself,5,no,e


In [None]:
# Zero-pad survey times such as '7:00:00' to 'HH:MM:SS' so they match the
# time strings in the weather data and can be used as a merge key.
survey_times = pd.to_datetime(df2['datetime'], format='%H:%M:%S')
df2['datetime'] = survey_times.dt.strftime('%H:%M:%S')


In [None]:
# Preview the survey data again to confirm the reformatted date and time columns
df2.head()

Unnamed: 0,userid,city,date,datetime,age,gender,skintype,skinconcerns,makeup,where,acitivity,SPF,forwhom,reminder_order,reminder,amount
0,7,amsterdam,2023-07-05,07:00:00,26,female,3,no,yes,indoors,breakfast,50,myself,1,yes,c
1,7,amsterdam,2023-07-05,09:00:00,26,female,3,no,yes,indoors,meeting,50,myself,2,no,e
2,7,amsterdam,2023-07-05,11:00:00,26,female,3,no,yes,indoors,,50,myself,3,no,e
3,7,amsterdam,2023-07-05,13:00:00,26,female,3,no,yes,indoors,,50,myself,4,no,e
4,7,amsterdam,2023-07-05,15:00:00,26,female,3,no,yes,indoors,,50,myself,5,no,e


In [None]:
# (rows, columns) of the survey data, for comparison with the merged result
df2.shape

(840, 16)

In [None]:
# Inner-join the hourly weather data (df1) with the survey rows (df2) on the
# shared city / date / time columns.
# validate="one_to_many" makes pandas raise if a weather key appears more than
# once in df1, which would otherwise silently duplicate survey rows.
merge_keys = ["city", "date", "datetime"]
df = pd.merge(df1, df2, on=merge_keys, validate="one_to_many")

# An inner join silently drops survey rows that have no matching weather record
# (e.g. a city that failed to download, or a time not in the weather data), so
# report any loss instead of letting the dataset shrink unnoticed.
dropped_rows = len(df2) - len(df)
if dropped_rows > 0:
    print(f"Warning: {dropped_rows} of {len(df2)} survey rows had no matching weather record and were dropped.")

df.head()

Unnamed: 0,city,date,datetime,uvindex,temp,conditions,icon,cloudcover,userid,age,...,skintype,skinconcerns,makeup,where,acitivity,SPF,forwhom,reminder_order,reminder,amount
0,beijing,2023-07-05,08:00:00,4.0,27.3,Clear,clear-day,0.0,25,31,...,4,no,yes,indoors,,30,myself,1,yes,c
1,beijing,2023-07-05,10:00:00,8.0,31.0,Clear,clear-day,0.0,25,31,...,4,no,yes,indoors,,30,myself,2,yes,a
2,beijing,2023-07-05,12:00:00,10.0,38.0,Clear,clear-day,0.0,25,31,...,4,no,yes,indoors,,30,myself,3,yes,a
3,beijing,2023-07-05,14:00:00,9.0,38.8,Clear,clear-day,0.0,25,31,...,4,no,yes,indoors,,30,myself,4,yes,a
4,beijing,2023-07-05,16:00:00,7.0,40.0,Clear,clear-day,0.0,25,31,...,4,no,yes,indoors,,30,myself,5,no,e


In [None]:
# (rows, columns) of the merged dataset; compare the row count with df2.shape
df.shape

(840, 21)

In [None]:
# List the dtype pandas assigned to each column of the merged dataset
print(df.dtypes)

city               object
date               object
datetime           object
uvindex           float64
temp              float64
conditions         object
icon               object
cloudcover        float64
userid              int64
age                 int64
gender             object
skintype            int64
skinconcerns       object
makeup             object
where              object
acitivity          object
SPF                 int64
forwhom            object
reminder_order      int64
reminder           object
amount             object
dtype: object


In [None]:
# Save the merged weather + survey dataset for the machine learning model.
# index=False keeps the DataFrame's row index out of the CSV; the relative
# path writes the file into the current working directory.
df.to_csv('rmdataset.csv', index=False)