In [1]:
%load_ext autoreload
%autoreload 2

import warnings
from datetime import datetime
import numpy as np
import pandas as pd
import country_converter as coco
import pytz
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation notices from pandas/pytz are hidden too.
warnings.filterwarnings("ignore")
# Shared converter instance; working_hour() uses it to translate ISO3 country codes to ISO2.
cc = coco.CountryConverter()
# Register tqdm with pandas so that DataFrame.progress_apply (used below) is available.
tqdm.pandas()

## Decide whether a question or answer was posted during working hours

Another way to view the activity is to look within a single day. We calculate the percentage of questions and answers that are posted during working hours (09:00–16:59 local time) versus free time. We compare this percentage between users from different countries, since it gives us an idea of which countries’ users prefer to use the site during work.

In [2]:
# Read the preprocessed questions and expose their timestamp under the generic
# column name `creation_date`, which is the attribute working_hour() reads.
questions = (
    pd.read_pickle('./question_with_location.pkl')
      .rename(columns={'question_creation_date': 'creation_date'})
)

In [3]:
# load preprocessed answer
# NOTE(review): no rename is applied here, so this frame presumably already carries a
# `creation_date` column (working_hour() reads row.creation_date) — confirm against the pickle.
answers = pd.read_pickle('./answer_with_location.pkl')

In [4]:
utc = pytz.utc
fmt = '%Y-%m-%d %H:%M:%S'

# A post counts as "working hour" when its local hour h satisfies
# WORK_START_HOUR <= h < WORK_END_HOUR, i.e. 09:00:00 to 16:59:59 local time.
WORK_START_HOUR = 9
WORK_END_HOUR = 17

# country_iso3 -> pytz timezone. The lookup depends only on the country code, so it is
# resolved once per country instead of once per row (the frames have millions of rows
# but only a few hundred distinct countries).
_timezone_cache = {}


def _country_timezone(iso3):
    """Return the representative pytz timezone for an ISO3 country code (cached)."""
    timezone = _timezone_cache.get(iso3)
    if timezone is None:
        # cc.convert will not work for XKS, but luckily it is the only country code it
        # doesn't recognize. Kosovo (XKS) has the same timezone as Switzerland (CH),
        # so it is mapped to 'CH' by hand.
        if iso3 == 'XKS':
            iso2 = 'CH'
        else:
            iso2 = cc.convert(names=iso3, to='ISO2', not_found=None)
        # if the country has multiple timezones, we choose the middle one in the list
        zones = pytz.country_timezones[iso2]
        timezone = pytz.timezone(zones[len(zones) // 2])
        _timezone_cache[iso3] = timezone
    return timezone


def working_hour(row):
    """Annotate one post with its local creation time and a working-hour flag.

    Parameters
    ----------
    row : pandas.Series
        A row exposing ``country_iso3`` (ISO3 country code) and ``creation_date``
        (a string starting with ``YYYY-MM-DD HH:MM:SS``, interpreted as UTC).

    Returns
    -------
    pandas.Series
        The same row with two extra entries: ``local_date`` (the timestamp converted
        to the country's representative timezone, formatted with ``fmt``) and
        ``working_hour`` (True when the local hour is within 09:00-16:59).

    Raises
    ------
    KeyError
        If pytz has no timezone list for the resolved ISO2 code.
    ValueError
        If ``creation_date`` does not match ``fmt`` after trimming.

    Notes
    -----
    Countries spanning several timezones are approximated by a single one, and
    weekends are not distinguished from weekdays.
    """
    timezone = _country_timezone(row.country_iso3)

    raw_date = row.creation_date
    dot_idx = raw_date.find('.')
    if dot_idx == -1:
        # no fractional seconds: drop the trailing 4 characters
        # (presumably a ' UTC' suffix — confirm against the data)
        date = raw_date[:-4]
    else:
        # fractional seconds present: keep everything before the dot
        date = raw_date[:dot_idx]

    # localize as utc first, then convert to local time
    loc_dt = utc.localize(datetime.strptime(date, fmt)).astimezone(timezone)

    row['local_date'] = loc_dt.strftime(fmt)
    row['working_hour'] = WORK_START_HOUR <= loc_dt.hour < WORK_END_HOUR
    return row

In [5]:
# get working hour for each question and answer
# Row-wise apply over every post; in the recorded run below this took roughly
# 3h20m for the questions and 6h40m for the answers, hence the pickle cache afterwards.
local_date_questions = questions.progress_apply(working_hour, axis=1)
local_date_answers = answers.progress_apply(working_hour, axis=1)

100%|██████████| 3285128/3285128 [3:19:38<00:00, 274.25it/s]  
100%|██████████| 6899309/6899309 [6:40:13<00:00, 287.31it/s]   


In [6]:
# save them, because it takes a while to process
# These two files are the cache that the following cells load back instead of
# recomputing working_hour for every row.
local_date_questions.to_pickle('./questions_with_working_hour.pkl')
local_date_answers.to_pickle('./answers_with_working_hour.pkl')

In [2]:
# load them back
# NOTE(review): the execution counter restarts at this cell, so everything below was
# apparently run in a fresh kernel straight from the cached pickles — the import cell
# still has to be run first for `pd` to exist.
q_wh = pd.read_pickle('./questions_with_working_hour.pkl')
a_wh = pd.read_pickle('./answers_with_working_hour.pkl')

In [5]:
# Per-country question statistics: share of working-hour questions ('percentage'),
# number of working-hour questions ('wh_count') and total number of questions ('count').
q_flag_by_country = q_wh.groupby('country_iso3').working_hour
q_wh_total = q_flag_by_country.sum()
q_total = q_flag_by_country.count()
questions_countries = (
    (q_wh_total / q_total)
    .to_frame(name='percentage')
    .assign(wh_count=q_wh_total, count=q_total)
)
questions_countries.head()

Unnamed: 0_level_0,percentage,wh_count,count
country_iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,0.357143,5.0,14
AFG,0.634286,888.0,1400
AGO,0.588235,110.0,187
ALA,0.8,28.0,35
ALB,0.605153,869.0,1436


In [6]:
# Per-country answer statistics: share of working-hour answers ('percentage'),
# number of working-hour answers ('wh_count') and total number of answers ('count').
a_flag_by_country = a_wh.groupby('country_iso3').working_hour
a_wh_total = a_flag_by_country.sum()
a_total = a_flag_by_country.count()
answers_conuntries = (
    (a_wh_total / a_total)
    .to_frame(name='percentage')
    .assign(wh_count=a_wh_total, count=a_total)
)
answers_conuntries.head()

Unnamed: 0_level_0,percentage,wh_count,count
country_iso3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,0.387097,12.0,31
AFG,0.624317,914.0,1464
AGO,0.424419,73.0,172
ALA,0.44,11.0,25
ALB,0.646473,1558.0,2410


In [7]:
# get countries' full name
country_names = pd.read_csv('../Drive/IP2LOCATION-COUNTRY-MULTILINGUAL.CSV')
# Keep only the English rows and only the two columns needed for the lookup.
# (Filter and column selection are done together; selecting the columns first and then
# re-filtering `country_names` would silently discard the column selection.)
country_name_to_code = country_names.loc[
    country_names.LANG == 'EN',
    ['COUNTRY_ALPHA3_CODE', 'COUNTRY_NAME'],
]

# Attach the full country name to the per-country question statistics and rank the
# countries by their working-hour share, highest first.
q_df = (
    questions_countries
    .merge(country_name_to_code, left_index=True, right_on="COUNTRY_ALPHA3_CODE", how="left")
    .sort_values('percentage', ascending=False)
    .rename(columns={'COUNTRY_ALPHA3_CODE': 'country_iso3', 'COUNTRY_NAME': 'country'})
    .reset_index(drop=True)
)
q_df = q_df[['country', 'country_iso3', 'wh_count', 'count', 'percentage']]

In [9]:
# Ten countries with the highest working-hour share of questions. In the recorded
# output these are mostly countries with very few questions, so the ratios are noisy.
q_df.head(10)

Unnamed: 0,country,country_iso3,wh_count,count,percentage
0,Saint Barthelemy,BLM,1.0,1,1.0
1,Equatorial Guinea,GNQ,1.0,1,1.0
2,Svalbard and Jan Mayen,SJM,4.0,5,0.8
3,Åland Islands,ALA,28.0,35,0.8
4,Marshall Islands,MHL,3.0,4,0.75
5,Northern Mariana Islands,MNP,3.0,4,0.75
6,South Sudan,SSD,31.0,42,0.738095
7,Isle of Man,IMN,121.0,165,0.733333
8,Kiribati,KIR,16.0,22,0.727273
9,Cayman Islands,CYM,156.0,216,0.722222


In [10]:
# keep only the countries that have more than 50000 questions
q_df[q_df['count'] > 50000]

Unnamed: 0,country,country_iso3,wh_count,count,percentage
23,Netherlands,NLD,38404.0,58788,0.653263
39,France,FRA,58735.0,93187,0.630292
46,Germany,DEU,102859.0,165284,0.622317
49,United Kingdom,GBR,129028.0,208848,0.617808
52,Italy,ITA,35441.0,57792,0.613251
63,Spain,ESP,30583.0,50644,0.603882
69,Brazil,BRA,34671.0,58393,0.593753
82,Poland,POL,31670.0,54190,0.584425
85,India,IND,347194.0,598475,0.580131
94,Australia,AUS,39206.0,68434,0.572902


In [11]:
# Attach the full country name to the per-country answer statistics and rank the
# countries by their working-hour share, highest first.
a_df = (
    answers_conuntries
    .merge(country_name_to_code, left_index=True, right_on="COUNTRY_ALPHA3_CODE", how="left")
    .sort_values('percentage', ascending=False)
    .rename(columns={'COUNTRY_ALPHA3_CODE': 'country_iso3', 'COUNTRY_NAME': 'country'})
    .reset_index(drop=True)
)
a_df = a_df[['country', 'country_iso3', 'wh_count', 'count', 'percentage']]

In [12]:
# Ten countries with the highest working-hour share of answers. In the recorded
# output these are mostly countries with very few answers, so the ratios are noisy.
a_df.head(10)

Unnamed: 0,country,country_iso3,wh_count,count,percentage
0,Western Sahara,ESH,1.0,1,1.0
1,Northern Mariana Islands,MNP,1.0,1,1.0
2,Svalbard and Jan Mayen,SJM,1.0,1,1.0
3,Holy See (Vatican City State),VAT,20.0,22,0.909091
4,French Polynesia,PYF,451.0,517,0.87234
5,Congo,COG,164.0,222,0.738739
6,Belize,BLZ,50.0,69,0.724638
7,Vanuatu,VUT,21.0,29,0.724138
8,Seychelles,SYC,20.0,28,0.714286
9,Cayman Islands,CYM,311.0,445,0.698876


In [13]:
# keep only the countries that have more than 100000 answers
# (the threshold is 100000, matching the recorded output, e.g. Italy with 103259 answers)
a_df[a_df['count'] > 100000]

Unnamed: 0,country,country_iso3,wh_count,count,percentage
27,Netherlands,NLD,94172.0,151807,0.62034
30,United Kingdom,GBR,298708.0,483709,0.617537
47,France,FRA,138264.0,231429,0.597436
67,India,IND,649758.0,1107345,0.586771
74,Italy,ITA,59951.0,103259,0.580589
79,Germany,DEU,262386.0,455678,0.575815
91,Australia,AUS,101128.0,177981,0.568195
98,Poland,POL,76439.0,135445,0.564355
103,Canada,CAN,114224.0,203966,0.560015
107,United States,USA,819316.0,1471210,0.556899


People from the Netherlands, France, and the United Kingdom seem to be the most active during working hours. The percentage for the Russian Federation is surprisingly low. This is likely an artifact of how we decide whether a post falls within working hours: we pick a single time zone per country (the middle entry of its time-zone list) and convert the UTC timestamp in the dataset to that local time. Countries like Russia and the United States span multiple time zones, which makes our estimate inaccurate for them.