In [82]:
import pandas as pd

I'd like to write a function which will take in a dataframe of applie mobility data and convert it to each date on a single row and individual columns for each of the 3 categories (walking, transit, driving). I'll use the German data to create it and then generalize to the other country information that we received in a similar format.

In [83]:
ger = pd.read_excel('./data/Transportation/apple_mobility_Germany.xlsx')

In [84]:
ger.head() #preview

Unnamed: 0,Value,Date,Super Region,Sub Region,Unit,Geography Type,Source,Sub-Sector,Frequency,Region,Country,Transportation Type
0,81.14,2021-01-24,GLOBAL DATA,NORTH RHINE-WESTPHALIA,INDEX,CITY,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,AACHEN,GERMANY,WALKING
1,127.55,2020-08-07,GLOBAL DATA,NORTH RHINE-WESTPHALIA,INDEX,CITY,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,AACHEN,GERMANY,WALKING
2,88.19,2021-01-25,GLOBAL DATA,NORTH RHINE-WESTPHALIA,INDEX,CITY,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,AACHEN,GERMANY,WALKING
3,135.66,2020-08-06,GLOBAL DATA,NORTH RHINE-WESTPHALIA,INDEX,CITY,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,AACHEN,GERMANY,WALKING
4,83.67,2021-01-22,GLOBAL DATA,NORTH RHINE-WESTPHALIA,INDEX,CITY,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,AACHEN,GERMANY,WALKING


In [100]:
ger['Geography Type'].value_counts()

CITY          20090
SUB-REGION    15216
Name: Geography Type, dtype: int64

In [85]:
ger['Sub Region'].value_counts()

NORTH RHINE-WESTPHALIA    6150
BADEN-W++RTTEMBERG        3280
BAVARIA                   2870
SAXONY                    1640
HESSE                     1640
LOWER SAXONY              1230
BREMEN (STATE)             820
Name: Sub Region, dtype: int64

In [86]:
ger['Region'].value_counts()

BAVARIA                   1234
LOWER SAXONY              1234
RHINELAND-PALATINATE      1234
NORTH RHINE-WESTPHALIA    1234
BADEN-W++RTTEMBERG        1234
BRANDENBURG               1234
SCHLESWIG-HOLSTEIN        1234
HESSE                     1234
SAXONY                    1234
BERLIN                    1230
DUSSELDORF                1230
HANNOVER                  1230
COLOGNE                   1230
STUTTGART                 1230
MANNHEIM                  1230
HAMBURG                   1230
NUREMBERG                 1230
BOCHUM - DORTMUND         1230
BREMEN (STATE)             822
SAXONY-ANHALT              822
THURINGIA                  822
SAARLAND                   822
MECKLENBURG-VORPOMMERN     822
AUGSBURG                   820
KARLSRUHE                  820
M+¦NCHENGLADBACH           820
DRESDEN                    820
MUNICH                     820
M++NSTER                   820
LEIPZIG                    820
AACHEN                     820
FRANKFURT                  820
BREMEN  

In [87]:
ger.isnull().sum()

Value                      0
Date                       0
Super Region               0
Sub Region             17676
Unit                       0
Geography Type             0
Source                     0
Sub-Sector                 0
Frequency                  0
Region                     0
Country                    0
Transportation Type        0
dtype: int64

I'll have to divide into regions since sub-region has so many nulls that I can't reasonably impute.

Altogether I'd like to have a 3 columns for each region in the dataframe similar to this:

| Date | Apple-(country)-(region)-transit | Apple-(country)-(region)-walking | Apple-(country)-(region)-driving |

The country signifier doesn't matter NOW but it will matter when we join this dataframe with other countries' mobility data.


In [88]:
country = ger['Country'][0]

In [89]:
types = list(dict(ger['Transportation Type'].value_counts()).keys())

In [90]:
regions = list(dict(ger['Region'].value_counts()).keys())

In [91]:
ger[(ger['Country']==country)&
    (ger['Region']==regions[0])&
    (ger['Transportation Type']==types[0])  
]['Value'].copy()

21766     63.98
21767    167.52
21768     67.65
21769    166.75
21770     71.01
          ...  
22173     96.92
22174    172.48
22175    158.09
22176    147.82
22177    135.92
Name: Value, Length: 412, dtype: float64

In [92]:
ger['test_column'] = ger[(ger['Country']==country)&
    (ger['Region']==regions[0])&
    (ger['Transportation Type']==types[0])  
]['Value'].copy() #test

In [93]:
ger[['Date', 'test_column']].notnull().sum()

Date           35306
test_column      412
dtype: int64

In [94]:
ger[ger['test_column'].notnull()]

Unnamed: 0,Value,Date,Super Region,Sub Region,Unit,Geography Type,Source,Sub-Sector,Frequency,Region,Country,Transportation Type,test_column
21766,63.98,2021-01-24,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,63.98
21767,167.52,2020-08-07,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,167.52
21768,67.65,2021-01-25,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,67.65
21769,166.75,2020-08-06,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,166.75
21770,71.01,2021-01-22,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,71.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22173,96.92,2020-08-30,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,96.92
22174,172.48,2020-09-04,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,172.48
22175,158.09,2020-09-03,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,158.09
22176,147.82,2020-09-02,GLOBAL DATA,,INDEX,SUB-REGION,"APPLE, INC.",APPLE MOBILITY INDEX,DAILY,BAVARIA,GERMANY,WALKING,147.82


In [95]:
ger.shape

(35306, 13)

In [104]:
def transmogrify(some_df):
    country = some_df['Country'][0] #each df is already limited to a single country
    types = list(dict(some_df['Transportation Type'].value_counts()).keys()) #between 1 and 3 depending on data size
    regions = list(dict(some_df['Region'].value_counts()).keys()) #list of all regions
    #print(country, types, regions[:5]) #preview
    #new_cols = []
    for transport in types:
        for region in regions:
            #new_cols.append((f'apple_{country}_{region}_{transport}')) #collect each new column name
            some_df[f'apple_{country}_{region}_{transport}'] = some_df[
                (some_df['Country'] == country)&
                (some_df['Region'] == region)&
                (some_df['Transportation Type'] == transport)
            ]['Value'] #create a new column of values from that country + region + type and name it accordingly
    return some_df#, new_cols
    

In [97]:
new_ger, cols = transmogrify(ger)

In [98]:
new_ger[cols].isnull().sum()

apple_GERMANY_BAVARIA_WALKING                   34894
apple_GERMANY_LOWER SAXONY_WALKING              34894
apple_GERMANY_RHINELAND-PALATINATE_WALKING      34894
apple_GERMANY_NORTH RHINE-WESTPHALIA_WALKING    34894
apple_GERMANY_BADEN-W++RTTEMBERG_WALKING        34894
                                                ...  
apple_GERMANY_LEIPZIG_TRANSIT                   35306
apple_GERMANY_AACHEN_TRANSIT                    35306
apple_GERMANY_FRANKFURT_TRANSIT                 35306
apple_GERMANY_BREMEN_TRANSIT                    35306
apple_GERMANY_WIESBADEN_TRANSIT                 35306
Length: 102, dtype: int64

In [99]:
print(
ger.shape,
new_ger.shape)

(35306, 115) (35306, 115)


In [102]:
usa_w = pd.read_excel('./data/Transportation/apple_mobility_US_Walking.xlsx')
usa_d = pd.read_excel('./data/Transportation/apple_mobility_US_Driving.xlsx')
usa_t = pd.read_excel('./data/Transportation/apple_mobility_US_Transit.xlsx')

In [103]:
usa_w.head() #preview

Unnamed: 0,Value,Date,Sub-Sector,Super Region,Frequency,Region,Country,Unit,Geography Type,Transportation Type,Source
0,82.56,2021-01-24,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,"APPLE, INC."
1,185.41,2020-08-07,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,"APPLE, INC."
2,78.86,2021-01-25,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,"APPLE, INC."
3,152.63,2020-08-06,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,"APPLE, INC."
4,108.2,2021-01-22,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,"APPLE, INC."


In [105]:
usa_w = transmogrify(usa_w)

In [106]:
usa_w

Unnamed: 0,Value,Date,Sub-Sector,Super Region,Frequency,Region,Country,Unit,Geography Type,Transportation Type,...,apple_UNITED STATES_TEXAS_WALKING,apple_UNITED STATES_KENTUCKY_WALKING,apple_UNITED STATES_OKLAHOMA_WALKING,apple_UNITED STATES_SOUTH DAKOTA_WALKING,apple_UNITED STATES_UTAH_WALKING,apple_UNITED STATES_MINNESOTA_WALKING,apple_UNITED STATES_ARIZONA_WALKING,apple_UNITED STATES_RHODE ISLAND_WALKING,apple_UNITED STATES_MONTANA_WALKING,apple_UNITED STATES_NEVADA_WALKING
0,82.56,2021-01-24,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
1,185.41,2020-08-07,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
2,78.86,2021-01-25,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
3,152.63,2020-08-06,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
4,108.20,2021-01-22,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,ILLINOIS,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21007,211.33,2020-08-30,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,DELAWARE,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
21008,277.22,2020-09-04,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,DELAWARE,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
21009,214.72,2020-09-03,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,DELAWARE,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,
21010,215.89,2020-09-02,APPLE MOBILITY INDEX,GLOBAL DATA,DAILY,DELAWARE,UNITED STATES,INDEX,SUB-REGION,WALKING,...,,,,,,,,,,


In [107]:
usa_w.isnull().sum()

Value                                           0
Date                                            0
Sub-Sector                                      0
Super Region                                    0
Frequency                                       0
                                            ...  
apple_UNITED STATES_MINNESOTA_WALKING       20600
apple_UNITED STATES_ARIZONA_WALKING         20600
apple_UNITED STATES_RHODE ISLAND_WALKING    20600
apple_UNITED STATES_MONTANA_WALKING         20600
apple_UNITED STATES_NEVADA_WALKING          20600
Length: 62, dtype: int64

In [109]:
usa_w.shape

(21012, 62)