In [1]:
import pandas as pd
import os

In [2]:
land_use_numerization= {'amenity/restaurants':1,
                        'Amenities/ restaurants':1,
                        'building/schools':2,
                        'amenity/hospitals':3,
                        'amenity/place of worship':4,
                        'leisure/parks':5,
                        'leisure-parks':5,
                        'shop/supermarkets':6,
                        'shop/convenience':6,
                        'amenity/fuel':8}

# 1. Train the model

## 1.1 Read reference cities csv file

In [3]:
reference_df = pd.read_csv(os.path.join('data', 'ReferenceCities.csv'), encoding='iso-8859-1')

In [4]:
reference_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Neighbourhoods             33 non-null     object 
 1   Population                 33 non-null     int64  
 2   Area (km2)                 32 non-null     float64
 3   Population Density         32 non-null     float64
 4   Average distance of binsÿ  33 non-null     float64
 5   Predominant land use       33 non-null     object 
 6   Num. of street segments    33 non-null     int64  
 7   Number of bins             33 non-null     int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 2.2+ KB


In [5]:
reference_df['Predominant land use'].unique()

array(['amenity/restaurants', 'leisure/parks', 'Amenities/ restaurants',
       'leisure-parks', 'building/schools', 'shop/supermarkets'],
      dtype=object)

In [6]:
reference_df['Predominant land use_num']= reference_df['Predominant land use'].map(land_use_numerization)

In [7]:
reference_df = reference_df.fillna(0)

In [8]:
reference_df

Unnamed: 0,Neighbourhoods,Population,Area (km2),Population Density,Average distance of binsÿ,Predominant land use,Num. of street segments,Number of bins,Predominant land use_num
0,Bronx,2717758,110.0,24707.0,4.78,amenity/restaurants,17338,108,1
1,Manhattan,3123068,59.1,52844.0,5.82,amenity/restaurants,9702,184,1
2,Queens,4460101,280.0,15929.0,8.63,amenity/restaurants,55192,117,1
3,Brooklyn,4970026,180.0,27611.0,6.35,leisure/parks,22709,94,5
4,Staten Island,912458,152.0,6003.0,6.71,leisure/parks,16060,42,5
5,Salamanca,145344,5.41,26865.0,1.22,Amenities/ restaurants,2050,736,1
6,Chamartin,141527,9.19,15400.0,1.83,Amenities/ restaurants,2935,1095,1
7,Moratalaz,92958,6.34,14662.0,1.08,leisure-parks,1728,1933,5
8,Ciudad-Lineal,212565,11.36,18711.0,2.02,Amenities/ restaurants,4201,2258,1
9,Hortaleza,185738,28.0,6633.0,1.9,leisure/parks,7294,2838,5


## 1.2 Split data in test train 

In [9]:
X = reference_df.drop(columns=['Neighbourhoods', 'Number of bins', 'Predominant land use', 'Average distance of binsÿ'])
y= reference_df['Number of bins']

In [10]:
X

Unnamed: 0,Population,Area (km2),Population Density,Num. of street segments,Predominant land use_num
0,2717758,110.0,24707.0,17338,1
1,3123068,59.1,52844.0,9702,1
2,4460101,280.0,15929.0,55192,1
3,4970026,180.0,27611.0,22709,5
4,912458,152.0,6003.0,16060,5
5,145344,5.41,26865.0,2050,1
6,141527,9.19,15400.0,2935,1
7,92958,6.34,14662.0,1728,5
8,212565,11.36,18711.0,4201,1
9,185738,28.0,6633.0,7294,5


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

## 1.3 Generate SVM model

In [12]:
from sklearn.svm import SVR

In [13]:
svm_model_train_test = SVR()

In [14]:
svm_model_train_test.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [15]:
y_pred= svm_model_train_test.predict(X_test)

In [16]:
from sklearn.metrics import r2_score, mean_squared_error

### 1.3.1 Train test error

In [17]:
r2_score(y_test,y_pred)

-0.47291893497643733

In [18]:
mean_squared_error(y_test,y_pred)

13893.307854165245

## 1.4 Generate RF model

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
rf_model_train_test = RandomForestRegressor()

rf_model_train_test.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [21]:
y_pred= rf_model_train_test.predict(X_test)

### 1.4.1 Train test error

In [22]:
r2_score(y_test,y_pred)

-6.104041129075008

In [23]:
mean_squared_error(y_test,y_pred)

67008.86795000001

# 2. Test model in target cities

## 2.1 Generate models with all reference cities data

In [24]:
svm_model_all = SVR()
svm_model_all.fit(X, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [25]:
rf_model_all = RandomForestRegressor()
rf_model_all.fit(X, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## 2.2 Quezon city

### 2.2.1 Read quezon csv file

In [26]:
quezon_df = pd.read_csv(os.path.join('data', 'Quezon City grp1&2.csv'), encoding='iso-8859-1')
quezon_df = quezon_df.drop(columns='Average distance of binsÿ')

In [27]:
quezon_df['Predominant land use'].unique()

array(['amenity/restaurants', 'building/schools', 'amenity/fuel',
       'amenity/place of worship', 'shop/convenience'], dtype=object)

In [28]:
quezon_df['Predominant land use_num']= quezon_df['Predominant land use'].map(land_use_numerization)

In [29]:
quezon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Neighbourhoods            14 non-null     object 
 1   Population                14 non-null     int64  
 2   Area (km2)                14 non-null     float64
 3   Population Density        14 non-null     float64
 4   Predominant land use      14 non-null     object 
 5   Num. of street segments   14 non-null     int64  
 6   Predominant land use_num  14 non-null     int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 912.0+ bytes


In [30]:
quezon_df.head()

Unnamed: 0,Neighbourhoods,Population,Area (km2),Population Density,Predominant land use,Num. of street segments,Predominant land use_num
0,Don Manuelÿ,3753,0.238,157.689,amenity/restaurants,33,1
1,Dona Josefa,2909,0.282,103.15,amenity/restaurants,46,1
2,Dona Aurora,5636,0.128,440.0,building/schools,37,2
3,Dona Imelda,16915,0.929,182.077,amenity/restaurants,138,1
4,San Isidroÿ,8578,0.132,649.0,building/schools,173,2


In [31]:
X_quezon = quezon_df.drop(columns=['Neighbourhoods', 'Predominant land use'])

In [32]:
X_quezon

Unnamed: 0,Population,Area (km2),Population Density,Num. of street segments,Predominant land use_num
0,3753,0.238,157.689,33,1
1,2909,0.282,103.15,46,1
2,5636,0.128,440.0,37,2
3,16915,0.929,182.077,138,1
4,8578,0.132,649.0,173,2
5,10278,0.193,532.5,299,1
6,63129,0.925,682.4,152,2
7,53151,3.12,17035.0,430,1
8,41154,2.01,20474.0,200,8
9,22764,4.24,5368.86,427,1


### 2.2.2 SVM results

In [33]:
y_pred= svm_model_all.predict(X_quezon)

In [34]:
for n,o in zip(quezon_df['Neighbourhoods'],y_pred):
    print('{} --> {}'.format(n,o))

Don Manuelÿ --> 187.89484217113284
Dona Josefa --> 187.89449205710036
Dona Aurora --> 187.89563007972052
Dona Imelda --> 187.89988270680647
San Isidroÿ --> 187.89681388267488
Santo Nino --> 187.89746080345864
Tatalon --> 187.91245624218482
Fairview --> 187.91161912958665
North Fairview --> 187.90882565364691
Greater Lagro --> 187.90238936496723
Pasong Putik Proper --> 187.90589449952435
Kaligayahan --> 187.9121264750285
Santa Monica --> 187.91039228463302
Santa Lucia --> 187.90429826586572


### 2.2.3 RF results

In [35]:
y_pred= rf_model_all.predict(X_quezon)

In [36]:
for n,o in zip(quezon_df['Neighbourhoods'],y_pred):
    print('{} --> {}'.format(n,o))

Don Manuelÿ --> 174.56
Dona Josefa --> 174.56
Dona Aurora --> 156.54
Dona Imelda --> 193.64
San Isidroÿ --> 156.33
Santo Nino --> 181.43
Tatalon --> 231.68
Fairview --> 422.0
North Fairview --> 517.33
Greater Lagro --> 180.88
Pasong Putik Proper --> 155.77
Kaligayahan --> 602.41
Santa Monica --> 576.29
Santa Lucia --> 509.68


## 2.3 Indian cities

### 2.3.1 Read Indian cities csv file

In [37]:
india_df = pd.read_csv(os.path.join('data', 'India grp1&2.csv'), encoding='iso-8859-1')
india_df = india_df.drop(columns='Average distance of binsÿ')

In [38]:
india_df['Predominant land use'].unique()

array(['amenity/hospitals', nan, 'leisure/parks',
       'amenity/place of worship'], dtype=object)

In [39]:
india_df['Predominant land use_num']= india_df['Predominant land use'].map(land_use_numerization)

In [40]:
india_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Neighbourhoods            8 non-null      object 
 1   Population                8 non-null      int64  
 2   Area (km2)                8 non-null      float64
 3   Population Density        8 non-null      int64  
 4   Predominant land use      7 non-null      object 
 5   Num. of street segments   8 non-null      int64  
 6   Predominant land use_num  7 non-null      float64
dtypes: float64(2), int64(3), object(2)
memory usage: 576.0+ bytes


In [41]:
india_df= india_df.dropna()

In [42]:
X_india = india_df.drop(columns=['Neighbourhoods', 'Predominant land use'])

In [43]:
X_india

Unnamed: 0,Population,Area (km2),Population Density,Num. of street segments,Predominant land use_num
0,190619,11.0,17329,134,3.0
2,1618879,159.0,10182,5145,5.0
3,298432,2230.0,134,156,4.0
4,1391753,2140.0,650,3760,5.0
5,7045313,6968.0,1011,17275,3.0
6,6081322,4549.0,1336,4472,3.0
7,1047635,6607.0,159,1060,3.0


### 2.3.2 SVM results

In [44]:
y_pred= svm_model_all.predict(X_india)

In [45]:
for n,o in zip(india_df['Neighbourhoods'],y_pred):
    print('{} --> {}'.format(n,o))

Barnala --> 187.90553122047598
Ludhiana --> 185.16910302344823
Moga --> 187.8479351234792
Gandhinagar --> 185.64775397339423
Ahmedabad --> 184.8857553544863
Surat --> 184.25939667014032
Jamnagar --> 186.47805562883042


### 2.3.3 RF results

In [46]:
y_pred= rf_model_all.predict(X_india)

In [47]:
for n,o in zip(india_df['Neighbourhoods'],y_pred):
    print('{} --> {}'.format(n,o))

Barnala --> 1370.55
Ludhiana --> 1075.85
Moga --> 907.82
Gandhinagar --> 855.24
Ahmedabad --> 500.46
Surat --> 807.5
Jamnagar --> 738.86


In [48]:
print("That's all folks")

That's all folks
