In [26]:
import pandas as pd
import fiona
import matplotlib.pyplot as plt
pd.options.display.max_rows = 4000
from IPython.display import display

**churn_usersprofile**: Socio-demographic characterization and residence area of the bus users. Information such as the gender, age class, and county of origin of bus users can be very useful for this challenge.  
**churn_od**: Demand for public transportation in each county of origin and its respective parish of destination.  

**Goals**: Create a framework to measure increases and decreases in public transport ridership, in support of more inclusive and sustainable cities.  
**Outcome**: Given the public transport users' profiles in two periods, the aggregated OD matrix, and any other external or derived data, this exercise should produce the following major outcomes: identify churn profiles and drivers, and propose measures to win back lost segments together with their expected impact.

In [72]:
# Both challenge files are pipe-delimited text encoded as ISO-8859-1.
USERS_PROFILE_URL = 'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_UsersProfile.txt'
OD_URL = 'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_OD.txt'

# Users-profile table (one row per period / county / gender / age class).
up = pd.read_csv(
    USERS_PROFILE_URL,
    sep='|',
    encoding="ISO-8859-1",
)

# Origin-destination demand table. The parish code is forced to `str` so it is
# not parsed as a number (presumably to keep it joinable with text parish codes).
od = pd.read_csv(
    OD_URL,
    sep='|',
    encoding="ISO-8859-1",
    dtype={'Dicofre_ParishCode_of_Public_Transportation': str},
)

In [3]:
# Preview the first two rows of the users-profile table.
up.head(n=2)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Period,GenderDescription,AgeClassDescription,Average_BusUsers_per_Day
0,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,15-24,294.194206
1,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,25-34,1081.652817


In [4]:
# Preview the first two rows of the origin-destination table.
# The loader cell binds this frame to `od`; `churn_od` is never defined, so the
# old name raises NameError on a fresh kernel.
od.head(2)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997


In [5]:
# (rows, columns) of the users-profile and origin-destination tables.
# Uses `od`, the name the loader cell actually defines (`churn_od` does not exist).
print(up.shape, od.shape)

(544, 7) (2253, 8)


In [6]:
# Distinct observation periods present in the users-profile table.
set(up['Period'])

{'Sep-19 to Feb-20', 'Sep-20 to Jan-21'}

# Analyzing Users Profile

In [7]:
# List the column labels of the users-profile table.
up.columns

Index(['Region_of_Origin', 'District_of_Origin', 'County_of_Origin', 'Period',
       'GenderDescription', 'AgeClassDescription', 'Average_BusUsers_per_Day'],
      dtype='object')

In [8]:
# Distinct counties of origin, in order of first appearance.
up['County_of_Origin'].unique()

array(['Amadora', 'Cascais', 'Lisboa', 'Loures', 'Mafra', 'Odivelas',
       'Oeiras', 'Sintra', 'Vila Franca de Xira', 'Alcochete', 'Almada',
       'Barreiro', 'Moita', 'Montijo', 'Palmela', 'Seixal', 'Sesimbra',
       'Setúbal', 'Espinho', 'Oliveira de Azeméis',
       'Santa Maria da Feira', 'Gondomar', 'Maia', 'Matosinhos',
       'Paredes', 'Porto', 'Póvoa de Varzim', 'Santo Tirso', 'Trofa',
       'Valongo', 'Vila do Conde', 'Vila Nova de Gaia', 'Arouca',
       'São João da Madeira', 'Vale de Cambra'], dtype=object)

 - We have single values for each combination of Period, County, Gender and Age

In [9]:
# Number of rows per (county, gender, age class) combination.
# `.to_frame()` on the unnamed size Series yields a single column labelled 0.
profiles_gb = (
    up
    .groupby(['County_of_Origin', 'GenderDescription', 'AgeClassDescription'])
    .size()
    .to_frame()
)

In [10]:
# Combinations that occur exactly once in `up`, with the group keys restored as
# ordinary columns. The count lives in the column labelled 0.
occurs_once = profiles_gb[0] == 1
single_period = profiles_gb[occurs_once].reset_index()

## Find Variation Whenever Possible

In [11]:
# Anti-join: keep only the rows of `up` whose (county, gender, age class)
# combination is NOT in `single_period`, i.e. combinations that are not single-row.
# The merge runs on the columns the two frames share; `_merge == "both"` marks
# rows that matched a single-row combination and are therefore discarded.
# `columns=` replaces the positional `axis` argument, which pandas 2.0 rejects.
two_periods = (
    up
    .merge(single_period, how='outer', indicator=True)
    .query('_merge != "both"')
    .drop(columns=['_merge', 0])  # 0 is the count column carried by `single_period`
)

In [12]:
# Sanity check: number of retained rows in each period.
two_periods.groupby('Period').size()

Period
Sep-19 to Feb-20    239
Sep-20 to Jan-21    239
dtype: int64

In [13]:
# Split the retained rows by period and give the ridership column a
# period-specific name so the two halves can be placed side by side.
df_first = (
    two_periods
    .loc[two_periods['Period'] == 'Sep-19 to Feb-20']
    .drop(columns='Period')
    .rename(columns={'Average_BusUsers_per_Day': 'Average_BusUsers_per_Day_first'})
)
df_second = (
    two_periods
    .loc[two_periods['Period'] == 'Sep-20 to Jan-21']
    .drop(columns='Period')
    .rename(columns={'Average_BusUsers_per_Day': 'Average_BusUsers_per_Day_second'})
)

In [14]:
# Put both periods' averages on one row per profile. The merge keys are the
# columns the two frames share (everything except the renamed ridership columns).
# The previous `indicator=True` was dropped again immediately, via a positional
# `axis` that pandas 2.0 rejects, so it is simply omitted; the result is the same.
both_averages = df_first.merge(df_second, how='outer')

In [15]:
# Change = first-period average minus second-period average, so a positive value
# means fewer daily bus users in the second period.
# `.assign` builds a new frame; the former `change_df = both_averages` alias made
# the column assignment silently add `Change` to `both_averages` as well.
# `columns=` replaces the positional `axis` argument, which pandas 2.0 rejects.
change_df = (
    both_averages
    .assign(
        Change=lambda df: df['Average_BusUsers_per_Day_first']
        - df['Average_BusUsers_per_Day_second']
    )
    .drop(columns=['Average_BusUsers_per_Day_first', 'Average_BusUsers_per_Day_second'])
)

In [16]:
# Order profiles from the largest to the smallest value of Change.
change_df = change_df.sort_values(by='Change', ascending=False)

# Insights from Change table

In [32]:
# For every descriptive column (all columns except Change), show the mean Change
# per category, largest first.
for col in change_df.columns.drop('Change'):
    mean_change = change_df.groupby(col)['Change'].mean().to_frame()
    display(mean_change.sort_values('Change', ascending=False))

Unnamed: 0_level_0,Change
Region_of_Origin,Unnamed: 1_level_1
R2 - AM Porto,1367.856185
R1 - AM Lisboa,331.812747


Unnamed: 0_level_0,Change
District_of_Origin,Unnamed: 1_level_1
Porto,1580.489648
Lisboa,434.074012
Aveiro,222.90677
Setúbal,168.194724


Unnamed: 0_level_0,Change
County_of_Origin,Unnamed: 1_level_1
Gondomar,3655.146825
Maia,2776.060462
Porto,2189.972712
Matosinhos,861.287204
Lisboa,843.418032
Oliveira de Azeméis,726.730692
Vila Nova de Gaia,705.161995
Odivelas,656.785136
Cascais,580.510053
Oeiras,441.192978


Unnamed: 0_level_0,Change
GenderDescription,Unnamed: 1_level_1
Male,702.848084
Female,680.833001


Unnamed: 0_level_0,Change
AgeClassDescription,Unnamed: 1_level_1
65+,3521.85701
15-24,548.103674
55-64,534.336945
25-34,400.486989
45-54,380.80139
35-44,165.373038


# Analyzing Demand Table

In [35]:
# Preview the first two rows of the origin-destination demand table.
od.head(n=2)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997


In [36]:
# Open a shapefile with fiona (judging by the file name, presumably GADM 3.6
# level-3 boundaries for Portugal — TODO confirm).
# NOTE(review): the path is absolute and machine-specific, so this cell fails on
# any other machine; consider a path relative to a configurable data directory.
# NOTE(review): the handle is never closed and `asa` is not used in the cells
# visible here — confirm whether this cell is still needed.
asa = fiona.open("/home/primity/terras/gadm36_PRT_shp/gadm36_PRT_3.shp")

In [79]:
# Parish lookup table from dados.gov.pt, reduced to the parish code (`dicofre`)
# and the parish name (`freguesia`).
FREGUESIAS_URL = 'https://dados.gov.pt/pt/datasets/r/ec6ef805-c278-4b4d-ba9b-3116264f68b4'
parish_columns = ['dicofre', 'freguesia']
freguesias = pd.read_excel(FREGUESIAS_URL, engine="openpyxl")[parish_columns]

In [None]:
# Outer-join the demand table to the parish lookup on the parish code.
# The key has a different name on each side, so it must go through
# `left_on` / `right_on`; passing two Series from different frames via `on=` is
# not a valid merge key. The indicator column was dropped right after being
# created (with a positional `axis` that pandas 2.0 rejects), so it is omitted.
dem = od.merge(
    freguesias,
    how='outer',
    left_on='Dicofre_ParishCode_of_Public_Transportation',
    right_on='dicofre',
)

In [73]:
# Inspect the column dtypes of the demand table (checks the type of the parish-code join key).
od.dtypes

Region_of_Origin                                object
District_of_Origin                              object
County_of_Origin                                object
Region_of_Public_Transportation                 object
District_of_Public_Transportation               object
County_of_Public_Transportation                 object
Dicofre_ParishCode_of_Public_Transportation     object
Demand_weight                                  float64
dtype: object

In [74]:
# Inspect the column dtypes of the parish lookup table (checks the type of `dicofre`).
freguesias.dtypes

nivel         int64
distrito     object
concelho     object
freguesia    object
dicofre      object
brasao       object
dtype: object

In [87]:
# Attach the parish name to every origin-destination row.
# The outer join followed by `dropna()` discards every row that has a missing
# value in any column, which includes rows whose parish code has no match on
# the other side. Both code columns are removed once the name is attached.
# `columns=` replaces the positional `axis` argument, which pandas 2.0 rejects.
Both_DFs = (
    pd.merge(
        od,
        freguesias,
        how='outer',
        left_on=['Dicofre_ParishCode_of_Public_Transportation'],
        right_on=['dicofre'],
    )
    .dropna()
    .drop(columns=['dicofre', 'Dicofre_ParishCode_of_Public_Transportation'])
)

In [88]:
# Show a preview only: `display.max_rows` is set to 4000 in this notebook, so a
# bare `Both_DFs` would render the whole merged table.
Both_DFs.head(10)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Demand_weight,freguesia
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,0.307323,Benfica
1,R1 - AM Lisboa,Lisboa,Cascais,R1 - AM Lisboa,LISBOA,LISBOA,0.068546,Benfica
2,R1 - AM Lisboa,Lisboa,Lisboa,R1 - AM Lisboa,LISBOA,LISBOA,0.039357,Benfica
3,R1 - AM Lisboa,Lisboa,Loures,R1 - AM Lisboa,LISBOA,LISBOA,0.016486,Benfica
4,R1 - AM Lisboa,Lisboa,Mafra,R1 - AM Lisboa,LISBOA,LISBOA,0.033026,Benfica
5,R1 - AM Lisboa,Lisboa,Odivelas,R1 - AM Lisboa,LISBOA,LISBOA,0.067301,Benfica
6,R1 - AM Lisboa,Lisboa,Oeiras,R1 - AM Lisboa,LISBOA,LISBOA,0.063808,Benfica
7,R1 - AM Lisboa,Lisboa,Sintra,R1 - AM Lisboa,LISBOA,LISBOA,0.076509,Benfica
8,R1 - AM Lisboa,Lisboa,Vila Franca de Xira,R1 - AM Lisboa,LISBOA,LISBOA,0.020385,Benfica
9,R1 - AM Lisboa,Setúbal,Alcochete,R1 - AM Lisboa,LISBOA,LISBOA,0.592367,Benfica
