# <div align="center"> Carbon Impact - Data Generation </div>

<div align="center"> ROULET Maria Paula | ROUX Dorian - Bachelor 4</div>

---

### Libraries

In [2]:
import numpy as np
import random
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

### Read Data Files

In [4]:
sample_apps = pd.read_csv("Data/apps.csv", delimiter=';')
s = sample_apps.sample(10).reset_index()
s[['Application', 'Actions', 'Carbon_Impact']]

Unnamed: 0,Application,Actions,Carbon_Impact
0,Google,Basic Research,0.14
1,Skype,Video Conferencing,0.2
2,DuckDuckGo,Basic Research,0.33
3,Discord,Video Conferencing,0.43
4,Youtube,Newsfeed,0.46
5,Snapchat,Newsfeed,0.87
6,Tixeo,Video Conferencing,0.153
7,Facebook,Newsfeed,0.79
8,Pinterest,Newsfeed,1.3
9,Cisco Webex Meetings,Video Conferencing,0.22


### Functions

In [4]:
#Function to append the value to the corresponding dictionnary key
def append_dict(data_dict, lst_key, lst_value):
    for key, val in zip(lst_key, lst_value):
        data_dict[key].append(val)
    return data_dict

In [5]:
#Function that generate values corresponding to some conditions
def generate_time_data(current_daytime, weekdays):
    year = current_daytime.year
    month = current_daytime.month
    day = current_daytime.day
    weekday = weekdays[current_daytime.weekday()]
    time_hm = current_daytime.time()
    date = datetime.datetime(year, month, day)

    return date, year, month, day, weekday, time_hm

In [6]:
#Function that generate values corresponding to some conditions
def generate_values(sample_apps, user_browser, user_vconf, daily_cons, moment):
    
    #If Weekend (S&S)
    if moment == "Day Off":
        w_state = "Late"
        act = None
        app = None
        carbon = 0
        back_cons = random.uniform(0, 0.05)
        daily_cons += + carbon + back_cons
        return w_state, app, act, carbon, back_cons, daily_cons

    elif moment == "Late":
        w_state = "Late"
        act = None
        app = None
        carbon = 0
        back_cons = random.uniform(0, 0.1)
        daily_cons += + carbon + back_cons
        return w_state, app, act, carbon, back_cons, daily_cons
    
    
    elif moment == "High Consumption":
        w_state = "Working"
        rand_app = random.random()
        conditions_act = [rand_app <= 0.35, 0.25 < rand_app < 0.70, 0.7 <= rand_app]
        choice_act = ['Basic Research', 'Video Conferencing', 'Newsfeed']
        act = np.select(conditions_act, choice_act)
        conditions_app = [act == 'Basic Research', act == 'Video Conferencing', act == 'Newsfeed']
        choice_app = [user_browser, random.choice(user_vconf), random.choice(np.unique(sample_apps.Application[sample_apps.Actions == act]))]
        app = np.select(conditions_app, choice_app)
        carbon = float(sample_apps.Carbon_Impact[sample_apps.Application == app])
        back_cons = random.uniform(0, 0.1)
        if act == 'Newsfeed':
            duration = random.randint(1,10)
        elif act == 'Video Conferencing':
            duration = random.randint(20, 90)      
        else:
            duration = random.randint(1, 5)
        return w_state, app, act, carbon, back_cons, duration


    elif moment == "Normal Consumption":
        w_state = "Working"
        rand_app = random.random()
        conditions_act = [rand_app <= 0.4, 0.4 < rand_app < 0.6, 0.6 <= rand_app]
        choice_act = ['Basic Research', 'Video Conferencing', 'Newsfeed']
        act = np.select(conditions_act, choice_act)
        conditions_app = [act == 'Basic Research', act == 'Video Conferencing', act == 'Newsfeed']
        choice_app = [user_browser, random.choice(user_vconf), random.choice(np.unique(sample_apps.Application[sample_apps.Actions == act]))]
        app = np.select(conditions_app, choice_app)
        carbon = float(sample_apps.Carbon_Impact[sample_apps.Application == app])
        back_cons = random.uniform(0, 0.1)
        if act == 'Newsfeed':
            duration = random.randint(1,20)
        elif act == 'Video Conferencing':
            duration = random.randint(20, 90)      
        else:
            duration = random.randint(1, 5)
        return w_state, app, act, carbon, back_cons, duration
                
    elif moment == "No Consumption":
        w_state = "Working"
        act = None
        app = None
        carbon = 0
        back_cons = random.uniform(0.1, 0.25)
        daily_cons += + carbon + back_cons
        return w_state, app, act, carbon, back_cons, daily_cons
    
    elif moment == "Lunch Break":
        w_state = "Break"
        act = None
        app = None
        carbon = 0
        back_cons = random.uniform(0, 0.2)
        daily_cons += + carbon + back_cons
        return w_state, app, act, carbon, back_cons, daily_cons


In [7]:
def Weekly_Pattern():
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]  #Weekdays without Week-end
    ran = random.uniform(0, 1)
    day_lst = random.sample(days, 1) if ran < 0.25 else random.sample(days, 2) if 0.25 <= ran <= 0.75 else random.sample(days, 3)
    return day_lst

### Data Generation

In [23]:
#Create a dictionnary of all the information
app_dict = {"Employee ID":[], 
            "Full Date":[], "Date":[], "Year":[], "Month":[], "Day":[], "Weekday":[], "Week Number":[], "Time":[], 
            "Working State":[], "Action Type":[], "App":[], "Time Spent":[],
            "App Carbon Impact":[], "Background Consumption":[], "Daily Consumption":[]
}

weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]  #Weekdays
profiles = ["High Carbon Consumption", "Medium-High Carbon Consumption", "Medium-Low Carbon Consumption", "Low Carbon Consumption"] #Type of Consumption

#Begin and End Dates
start = datetime.datetime(2021, 1, 4, 9, 0) #2021-1-1 9AM
end = datetime.datetime(2021, 7, 1, 18, 0) #2021-3-1 6PM

#Initial information
num_employees = random.randint(5,10) #Number of Employees

for id in range(num_employees): #Loop for each Employee
    
    #Initial Employee and Time Information 
    current_day = start #Set the Initial Date
    week_num = int(current_day.strftime("%V")) #Set the Initial Week Calendar Number
    day_lst = Weekly_Pattern() #Set the Initial Weekly Pattern
    user_browser = random.choice(np.unique(sample_apps.Application[sample_apps.Actions == "Basic Research"])) #Select a single Browser
    user_vconf = [random.choice(np.unique(sample_apps.Application[sample_apps.Actions == "Video Conferencing"])) for _ in range(2)] #Select up to two different Video Conferencing App
    
    
    while current_day <= end: #Loop for each Day
        
        end_day = datetime.datetime(current_day.year, current_day.month, current_day.day, 18, 0) #Set the Daily End Work Hours
        daily_cons = 0 #Set the Daily Carbon Consumption as 0
        true_start = current_day + datetime.timedelta(minutes=random.randint(-20, 30)) #Define the True Time (when the employee arrives at the Office)
        lunch_break = False #Set the "Lunch Break" Boolean at False
        timestamp = current_day if true_start > current_day else true_start #Define the Initial Time depending on a Condition

        if week_num == int(current_day.strftime("%V")):
            pass
        else:
            week_num = int(current_day.strftime("%V"))
            day_lst = Weekly_Pattern()
                            
        #Check if the employee is Late
        if timestamp < true_start: #Check that the Initial Time is lower than the True Time
            for d in range(1, int((true_start - current_day).seconds/60)+1): #Loop for the Time the Employee is Late
                w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, "Late")  
                date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                append_dict(app_dict, list(app_dict.keys()), lst_values)
                timestamp += datetime.timedelta(minutes=1) #Incremente the Current Time by a minute
        else:
            pass    
        
        
        while timestamp <= end_day: #Loop for the Day
            if weekdays[current_day.weekday()] == "Sunday" or weekdays[current_day.weekday()] == "Saturday" and random.random() < 0.8:
                day_off = True
                high_use_app = False
                use_app = False
            else:
                day_off = False
                high_use_app = True if weekdays[current_day.weekday()] in day_lst else False
                use_app = True if high_use_app == True or random.random() <= 0.35 else False

            if high_use_app == True:
                moment = "High Consumption"
                w_state, act, app, carbon, back_cons, duration = generate_values(sample_apps, user_browser, user_vconf, daily_cons, moment)
                for d in range(1, duration+1):
                    daily_cons += carbon + back_cons
                    date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                    lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                    append_dict(app_dict, list(app_dict.keys()), lst_values)
                        
                    #If condition to set break and end of day
                    if lunch_break == False and timestamp > datetime.datetime(timestamp.year, timestamp.month, timestamp.day, 11, 30) and d/duration > 0.75:
                        lunch_break = True
                        duration = random.randint(30, 90)
                        for d in range(1, duration+1):
                            w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, "Lunch Break")  
                            date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                            lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                            append_dict(app_dict, list(app_dict.keys()), lst_values)
                            timestamp += datetime.timedelta(minutes=1)                    
                        break
                
                    if timestamp > end_day + datetime.timedelta(minutes = 59):
                        break
                    else:      
                        timestamp += datetime.timedelta(minutes=1)
            
            elif high_use_app == False and use_app == True:
                moment = "Normal Consumption"
                w_state, act, app, carbon, back_cons, duration = generate_values(sample_apps, user_browser, user_vconf, daily_cons, moment) 
                for d in range(1, duration+1):
                    daily_cons += carbon + back_cons
                    date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                    lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                    append_dict(app_dict, list(app_dict.keys()), lst_values)
                        
                    #If condition to set break and end of day
                    if lunch_break == False and timestamp > datetime.datetime(timestamp.year, timestamp.month, timestamp.day, 11, 30) and d/duration > 0.75:
                        lunch_break = True
                        duration = random.randint(30, 90)
                        for d in range(1, duration+1):
                            w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, "Lunch Break")  
                            date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                            lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                            append_dict(app_dict, list(app_dict.keys()), lst_values)
                            timestamp += datetime.timedelta(minutes=1)                    
                        break
                
                    if timestamp > end_day + datetime.timedelta(minutes = 59):
                        break
                    else:      
                        timestamp += datetime.timedelta(minutes=1)
                
            elif day_off == True and use_app == False: #If the Employee do not use an App with a Carbon Impact
                moment = "Day Off"
                duration = int((end - start).seconds/60)
                for d in range(1, duration):
                    w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, moment)  
                    date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                    lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                    append_dict(app_dict, list(app_dict.keys()), lst_values)
                    timestamp += datetime.timedelta(minutes=1) #Incremente the Current Time by a minute
                    
                    if timestamp > end_day + datetime.timedelta(minutes = 59):
                        break
                    else:      
                        timestamp += datetime.timedelta(minutes=1)
                                      
            else: #If the Employee do not use an App with a Carbon Impact
                moment = "No Consumption"
                duration = random.randint(0, 15)
                for d in range(1, duration+1):
                    w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, moment)  
                    date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                    lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                    append_dict(app_dict, list(app_dict.keys()), lst_values)
                    timestamp += datetime.timedelta(minutes=1)  
                    
            
                        #If condition to set break and end of day
                    if lunch_break == False and timestamp > datetime.datetime(timestamp.year, timestamp.month, timestamp.day, 11, 30) and d/duration > 0.75:
                        lunch_break = True
                        duration = random.randint(30, 90)
                        for d in range(1, duration+1):
                            w_state, act, app, carbon, back_cons, daily_cons = generate_values(sample_apps, user_browser, user_vconf, daily_cons, "Lunch Break")  
                            date, year, month, day, weekday, time = generate_time_data(timestamp, weekdays)
                            lst_values = [id, timestamp, date, year, month, day, weekday, week_num, time, w_state, act, app, d, carbon, back_cons, daily_cons]
                            append_dict(app_dict, list(app_dict.keys()), lst_values)
                            timestamp += datetime.timedelta(minutes=1)                    
                        break
                
                    if timestamp > end_day + datetime.timedelta(minutes = 59):
                        break
                    else:      
                        timestamp += datetime.timedelta(minutes=1)        
        current_day += datetime.timedelta(days=1)


Use_App = pd.DataFrame.from_dict(app_dict)

Use_App.to_csv("Data/time_apps.csv")

Use_App

Unnamed: 0,Employee ID,Full Date,Date,Year,Month,Day,Weekday,Week Number,Time,Working State,Action Type,App,Time Spent,App Carbon Impact,Background Consumption,Daily Consumption
0,0,2021-01-04 08:51:00,2021-01-04,2021,1,4,Monday,1,08:51:00,Working,Twitter,Newsfeed,1,0.6,0.039161,0.639161
1,0,2021-01-04 08:52:00,2021-01-04,2021,1,4,Monday,1,08:52:00,Working,Twitter,Newsfeed,2,0.6,0.039161,1.278321
2,0,2021-01-04 08:53:00,2021-01-04,2021,1,4,Monday,1,08:53:00,Working,Twitter,Newsfeed,3,0.6,0.039161,1.917482
3,0,2021-01-04 08:54:00,2021-01-04,2021,1,4,Monday,1,08:54:00,Working,Twitter,Newsfeed,4,0.6,0.039161,2.556643
4,0,2021-01-04 08:55:00,2021-01-04,2021,1,4,Monday,1,08:55:00,Working,Twitter,Newsfeed,5,0.6,0.039161,3.195803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765096,9,2021-07-01 18:08:00,2021-07-01,2021,7,1,Thursday,26,18:08:00,Working,,,11,0.0,0.146514,112.054906
765097,9,2021-07-01 18:10:00,2021-07-01,2021,7,1,Thursday,26,18:10:00,Working,,,12,0.0,0.182592,112.237497
765098,9,2021-07-01 18:12:00,2021-07-01,2021,7,1,Thursday,26,18:12:00,Working,,,13,0.0,0.112548,112.350046
765099,9,2021-07-01 18:14:00,2021-07-01,2021,7,1,Thursday,26,18:14:00,Working,,,14,0.0,0.208608,112.558654


In [24]:
Daily_cons = Use_App.groupby(['Employee ID', 'Week Number', 'Day', 'Month', 'Year'])['Daily Consumption'].max().reset_index().rename(columns = {'Daily Consumption':'Total Daily Consumption'})
Daily_cons['Daily Consumption Level'] = pd.cut(Daily_cons["Total Daily Consumption"], 4, labels=["Low", "Medium-Low", "Medium-High", "High"])
Daily_cons.head(5)

Unnamed: 0,Employee ID,Week Number,Day,Month,Year,Total Daily Consumption,Daily Consumption Level
0,0,1,4,1,2021,181.39022,Medium-High
1,0,1,5,1,2021,182.986337,Medium-High
2,0,1,6,1,2021,165.632762,Medium-High
3,0,1,7,1,2021,75.664756,Low
4,0,1,8,1,2021,167.149389,Medium-High


In [25]:
Weekly_cons = Daily_cons.groupby(['Employee ID', 'Week Number', 'Year'])['Total Daily Consumption'].sum().reset_index().rename(columns = {'Total Daily Consumption':'Total Weekly Consumption'})
Weekly_cons['Weekly Consumption Level'] = pd.cut(Weekly_cons["Total Weekly Consumption"], 4, labels=["Low", "Medium-Low", "Medium-High", "High"])
Weekly_cons.head(5)

Unnamed: 0,Employee ID,Week Number,Year,Total Weekly Consumption,Weekly Consumption Level
0,0,1,2021,787.106646,Medium-High
1,0,2,2021,689.018918,Medium-Low
2,0,3,2021,714.048331,Medium-Low
3,0,4,2021,699.37591,Medium-Low
4,0,5,2021,661.673112,Medium-Low


In [26]:
Monthly_cons = Daily_cons.groupby(['Employee ID', 'Month', 'Year'])['Total Daily Consumption'].sum().reset_index().rename(columns = {'Total Daily Consumption':'Total Monthly Consumption'})
Monthly_cons['Monthly Consumption Level'] = pd.cut(Monthly_cons["Total Monthly Consumption"], 4, labels=["Low", "Medium-Low", "Medium-High", "High"])
Monthly_cons.head(5)

Unnamed: 0,Employee ID,Month,Year,Total Monthly Consumption,Monthly Consumption Level
0,0,1,2021,2889.549805,Medium-High
1,0,2,2021,2768.152813,Medium-High
2,0,3,2021,3194.079026,Medium-High
3,0,4,2021,2934.06356,Medium-High
4,0,5,2021,3104.79544,Medium-High


In [27]:
fnl_UseApp = pd.merge(Use_App, Daily_cons.loc[ : , Daily_cons.columns != 'Week Number'], on = ["Employee ID", "Day", "Month", "Year"])
fnl_UseApp = pd.merge(fnl_UseApp, Weekly_cons, on = ["Employee ID", "Week Number", "Year"])
fnl_UseApp = pd.merge(fnl_UseApp, Monthly_cons, on = ["Employee ID", "Month", "Year"])

fnl_UseApp.to_csv("Data/fnl_UsaApp6m.csv")

fnl_UseApp.head(5)

Unnamed: 0,Employee ID,Full Date,Date,Year,Month,Day,Weekday,Week Number,Time,Working State,...,Time Spent,App Carbon Impact,Background Consumption,Daily Consumption,Total Daily Consumption,Daily Consumption Level,Total Weekly Consumption,Weekly Consumption Level,Total Monthly Consumption,Monthly Consumption Level
0,0,2021-01-04 08:51:00,2021-01-04,2021,1,4,Monday,1,08:51:00,Working,...,1,0.6,0.039161,0.639161,181.39022,Medium-High,787.106646,Medium-High,2889.549805,Medium-High
1,0,2021-01-04 08:52:00,2021-01-04,2021,1,4,Monday,1,08:52:00,Working,...,2,0.6,0.039161,1.278321,181.39022,Medium-High,787.106646,Medium-High,2889.549805,Medium-High
2,0,2021-01-04 08:53:00,2021-01-04,2021,1,4,Monday,1,08:53:00,Working,...,3,0.6,0.039161,1.917482,181.39022,Medium-High,787.106646,Medium-High,2889.549805,Medium-High
3,0,2021-01-04 08:54:00,2021-01-04,2021,1,4,Monday,1,08:54:00,Working,...,4,0.6,0.039161,2.556643,181.39022,Medium-High,787.106646,Medium-High,2889.549805,Medium-High
4,0,2021-01-04 08:55:00,2021-01-04,2021,1,4,Monday,1,08:55:00,Working,...,5,0.6,0.039161,3.195803,181.39022,Medium-High,787.106646,Medium-High,2889.549805,Medium-High


In [242]:
fnl_UseApp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 515770 entries, 0 to 515769
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   Employee ID                  515770 non-null  int64         
 1   Full Date                    515770 non-null  datetime64[ns]
 2   Date                         515770 non-null  datetime64[ns]
 3   Year                         515770 non-null  int64         
 4   Month                        515770 non-null  int64         
 5   Day                          515770 non-null  int64         
 6   Weekday                      515770 non-null  object        
 7   Week Number                  515770 non-null  int64         
 8   Time                         515770 non-null  object        
 9   Working State                515770 non-null  object        
 10  Action Type                  266157 non-null  object        
 11  App                       