In [29]:
import pandas as pd
import fiona
import matplotlib.pyplot as plt
# Let pandas render up to 4000 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 4000)

**churn_usersprofile**: Socio-demographic characterization and residence area of the bus users. Information such as the gender, age class, and county of origin of bus users can be very useful for this challenge.  
**churn_od**: Demand for public transportation in each county of origin and its respective parish of destination.  

**Goals**: Create a framework for measuring increases and decreases in public transport passengers, in support of cities that are more inclusive and sustainable.  
**Outcome**: Given the public transport users’ profiles in two periods, the aggregated OD matrix, and any other external or derived data, this exercise should produce the following major outcomes: identify churn profiles and drivers, and propose measures to win back lost segments together with their expected impact.

In [9]:
# Both extracts are read with the same parser settings: '|' as the field
# separator and ISO-8859-1 as the text encoding.
_csv_opts = {'encoding': "ISO-8859-1", 'sep': '|'}

up = pd.read_csv(
    'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_UsersProfile.txt',
    **_csv_opts,
)
churn_od = pd.read_csv(
    'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_OD.txt',
    **_csv_opts,
)

In [10]:
# Peek at the first two rows of the user-profile table.
up.head(n=2)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Period,GenderDescription,AgeClassDescription,Average_BusUsers_per_Day
0,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,15-24,294.194206
1,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,25-34,1081.652817


In [7]:
# Peek at the first two rows of the origin/destination table.
churn_od.head(n=2)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997


In [11]:
# (rows, columns) of the user-profile table, then of the OD table.
print(*(frame.shape for frame in (up, churn_od)))

(544, 7) (2253, 8)


In [12]:
# Distinct values of the Period column in the user-profile table.
set(up['Period'])

{'Sep-19 to Feb-20', 'Sep-20 to Jan-21'}

# Analyzing Users Profile

In [13]:
# Column labels of the user-profile table.
up.columns

Index(['Region_of_Origin', 'District_of_Origin', 'County_of_Origin', 'Period',
       'GenderDescription', 'AgeClassDescription', 'Average_BusUsers_per_Day'],
      dtype='object')

In [14]:
# Distinct counties of origin, in order of first appearance.
up['County_of_Origin'].unique()

array(['Amadora', 'Cascais', 'Lisboa', 'Loures', 'Mafra', 'Odivelas',
       'Oeiras', 'Sintra', 'Vila Franca de Xira', 'Alcochete', 'Almada',
       'Barreiro', 'Moita', 'Montijo', 'Palmela', 'Seixal', 'Sesimbra',
       'Setúbal', 'Espinho', 'Oliveira de Azeméis',
       'Santa Maria da Feira', 'Gondomar', 'Maia', 'Matosinhos',
       'Paredes', 'Porto', 'Póvoa de Varzim', 'Santo Tirso', 'Trofa',
       'Valongo', 'Vila do Conde', 'Vila Nova de Gaia', 'Arouca',
       'São João da Madeira', 'Vale de Cambra'], dtype=object)

 - We have single values for each combination of Period, County, Gender and Age

In [55]:
# Count how many rows each (county, gender, age class) profile has.
# `to_frame()` on the unnamed size Series yields a single column labelled 0.
profile_keys = ['County_of_Origin', 'GenderDescription', 'AgeClassDescription']
profiles_gb = up.groupby(profile_keys).size().to_frame()

In [56]:
# Profiles that occur exactly once in `up`; column 0 holds the row count
# per profile. The group keys are moved back into ordinary columns.
occurs_once = profiles_gb[0] == 1
single_period = profiles_gb.loc[occurs_once].reset_index()

## Find Variation Whenever Possible

In [90]:
# Anti-join: keep the rows of `up` whose profile is NOT listed in
# `single_period`. The merge joins on the columns the two frames share and
# tags every row with `_merge`; rows tagged "both" matched a single-occurrence
# profile and are discarded.
# The helper columns (`_merge` and the count column labelled 0) are removed
# with `columns=`; the original positional axis (`drop([...], 1)`) raises
# TypeError on pandas >= 2.0.
two_periods = (
    up.merge(single_period, how='outer', indicator=True)
      .query('_merge != "both"')
      .drop(columns=['_merge', 0])
)

In [91]:
# Sanity check: number of retained rows in each period.
two_periods.groupby(by='Period').size()

Period
Sep-19 to Feb-20    239
Sep-20 to Jan-21    239
dtype: int64

In [92]:
def _period_slice(frame, period, suffix):
    """Return the rows of `frame` for one period, ready for a side-by-side join.

    The `Period` column is dropped and `Average_BusUsers_per_Day` is renamed
    to `Average_BusUsers_per_Day_<suffix>`.
    """
    value_col = 'Average_BusUsers_per_Day'
    in_period = frame['Period'] == period
    return (
        frame.loc[in_period]
             .drop(columns='Period')
             .rename(columns={value_col: f'{value_col}_{suffix}'})
    )


df_first = _period_slice(two_periods, 'Sep-19 to Feb-20', 'first')
df_second = _period_slice(two_periods, 'Sep-20 to Jan-21', 'second')

In [99]:
# Put the two periods side by side, with the `_first` and `_second` averages
# as separate columns. The join uses every column the two frames share, and
# `how='outer'` keeps a profile found in only one frame (the other average
# is then NaN).
# The original requested a merge indicator only to remove it straight away
# with `drop('_merge', 1)`; that positional axis argument raises TypeError on
# pandas >= 2.0, so the indicator is simply not requested.
both_averages = df_first.merge(df_second, how='outer')

In [107]:
# "Change" is the first-period average minus the second-period average, so a
# positive value means fewer average daily bus users in the later period.
# The column is written onto `both_averages` itself, exactly as the original
# alias (`change_df = both_averages`) did; `change_df` is then a separate
# frame without the two raw averages.
both_averages['Change'] = (
    both_averages['Average_BusUsers_per_Day_first']
    - both_averages['Average_BusUsers_per_Day_second']
)
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# raises TypeError on pandas >= 2.0.
change_df = both_averages.drop(
    columns=['Average_BusUsers_per_Day_first', 'Average_BusUsers_per_Day_second']
)

In [108]:
# Largest "Change" values first.
change_df.sort_values(by='Change', ascending=False)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,GenderDescription,AgeClassDescription,Change
173,R2 - AM Porto,Porto,Gondomar,Female,65+,32475.790502
190,R2 - AM Porto,Porto,Maia,Male,65+,30134.699342
211,R2 - AM Porto,Porto,Porto,Male,65+,6568.967914
205,R2 - AM Porto,Porto,Porto,Female,55-64,4525.266767
179,R2 - AM Porto,Porto,Gondomar,Male,65+,4442.24135
206,R2 - AM Porto,Porto,Porto,Female,65+,3907.464606
17,R1 - AM Lisboa,Lisboa,Cascais,Male,15-24,3294.683979
194,R2 - AM Porto,Porto,Matosinhos,Female,55-64,3184.585351
232,R2 - AM Porto,Porto,Vila Nova de Gaia,Female,45-54,3175.84717
207,R2 - AM Porto,Porto,Porto,Male,25-34,2988.483325
