In [1]:
import pandas as pd
import fiona
import matplotlib.pyplot as plt
pd.options.display.max_rows = 4000
from IPython.display import display
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch
from shapely.geometry import Polygon, MultiPolygon, shape

**churn_usersprofile**: Socio-demographic characterization and area of residence of the bus users. Information such as the gender, age class, and county of origin of bus users can be very useful for this challenge.  
**churn_od**: Demand for public transportation in each county of origin and its respective parish of destination.  

**Goals**: Create a framework for measuring increases and decreases in the number of public transport passengers, in support of cities that are more inclusive and sustainable.  
**Outcome**: Given the public transport users’ profiles in two periods, the aggregated OD matrix, and any other external or derived data, this exercise should produce the following major outcomes: identify churn profiles and drivers, and propose measures to win back lost segments along with their expected impact.

In [2]:
# Both extracts are pipe-delimited text files encoded as ISO-8859-1 (Latin-1).
up = pd.read_csv(
    'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_UsersProfile.txt',
    sep='|',
    encoding="ISO-8859-1",
)
od = pd.read_csv(
    'https://wdl-data.fra1.digitaloceanspaces.com/pse/Churn_OD.txt',
    sep='|',
    encoding="ISO-8859-1",
    # Read the parish code as text rather than letting pandas infer its type.
    dtype={'Dicofre_ParishCode_of_Public_Transportation': str},
)

In [5]:
# Report (rows, columns) for the user-profile table and then the OD table.
print(*(frame.shape for frame in (up, od)))

(544, 7) (2253, 8)


# Change DF

In [89]:
# Read the change table from variation.csv (path is relative to the working directory).
change_df = pd.read_csv(filepath_or_buffer='variation.csv')

# Churn User Profiles Table

In [67]:
# Churn rows are those whose Change value is strictly negative.
churn_df = change_df.loc[change_df['Change'] < 0]

In [84]:
# Preview the first ten churn rows.
churn_df.head(n=10)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,GenderDescription,AgeClassDescription,Change,Change_pct
86,R1 - AM Lisboa,Lisboa,Vila Franca de Xira,Female,25-34,-2.505004,-0.043873
219,R2 - AM Porto,Porto,Trofa,Male,35-44,-2.618801,-0.125641
120,R1 - AM Lisboa,Setúbal,Moita,Male,55-64,-4.481175,-0.263158
225,R2 - AM Porto,Porto,Valongo,Male,45-54,-7.422021,-0.051826
122,R1 - AM Lisboa,Setúbal,Montijo,Female,45-54,-9.367287,-0.278861
146,R1 - AM Lisboa,Setúbal,Sesimbra,Male,25-34,-12.161867,-0.109695
164,R2 - AM Porto,Aveiro,Santa Maria da Feira,Female,45-54,-12.527012,-0.18537
92,R1 - AM Lisboa,Lisboa,Vila Franca de Xira,Male,35-44,-12.933766,-0.094115
216,R2 - AM Porto,Porto,Santo Tirso,Female,35-44,-13.14312,-0.307346
137,R1 - AM Lisboa,Setúbal,Seixal,Female,55-64,-15.365188,-0.064906


In [82]:
# Show a frequency table for every object-dtype column of the churn table.
for col in (label for label, kind in churn_df.dtypes.items() if kind == 'object'):
    counts = churn_df.value_counts(col)
    display(counts.to_frame())

Unnamed: 0_level_0,0
Region_of_Origin,Unnamed: 1_level_1
R1 - AM Lisboa,124
R2 - AM Porto,67


Unnamed: 0_level_0,0
District_of_Origin,Unnamed: 1_level_1
Lisboa,79
Porto,56
Setúbal,45
Aveiro,11


Unnamed: 0_level_0,0
County_of_Origin,Unnamed: 1_level_1
Lisboa,11
Odivelas,10
Sintra,10
Cascais,10
Vila Franca de Xira,10
Porto,10
Gondomar,9
Oeiras,8
Seixal,8
Maia,8


Unnamed: 0_level_0,0
GenderDescription,Unnamed: 1_level_1
Female,98
Male,93


Unnamed: 0_level_0,0
AgeClassDescription,Unnamed: 1_level_1
35-44,45
45-54,43
25-34,37
55-64,30
15-24,20
65+,16


# Analyzing Demand Table

In [90]:
# Read the demand table from the intermediate-data directory (relative path).
od_freguesia = pd.read_csv(filepath_or_buffer='intermediate-data/demand.csv')

In [30]:
# Demands sum to a value very close to 1 on the county level.
# Spot-check five counties. The sample is seeded so the same rows are shown on
# every run, and no sort is applied first because random sampling would discard
# that ordering anyway.
(
    od_freguesia
    .groupby('County_of_Origin')['Demand_weight']
    .sum()
    .to_frame()
    .sample(5, random_state=0)
)

Unnamed: 0_level_0,Demand_weight
County_of_Origin,Unnamed: 1_level_1
Alcochete,1.000004
Oliveira de Azeméis,1.0
Seixal,1.000001
Gondomar,1.000001
Trofa,1.000001
