# Geospatial Analysis - Results 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import os
sb.set()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
import holoviews as hv
import geoviews as gv
import datashader as ds
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr
from datashader.colors import colormap_select, Greys9
from holoviews.streams import RangeXY
from holoviews.operation.datashader import datashade, dynspread, rasterize
from bokeh.io import push_notebook, show, output_notebook

In [None]:
# Load the raw Olist tables. These eight need no dtype overrides, so they are
# read in a single pass; the target names line up one-to-one with the files.
(
    customers,
    order_items,
    order_payment,
    review,
    orders,
    products,
    seller,
    products_translation,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "olist_customers_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_order_payments_dataset.csv",
        "olist_order_reviews_dataset.csv",
        "olist_orders_dataset.csv",
        "olist_products_dataset.csv",
        "olist_sellers_dataset.csv",
        "product_category_name_translation.csv",
    )
)

# The zip-code prefix is read as text because a later cell slices it by
# character position.
geo = pd.read_csv("olist_geolocation_dataset.csv", dtype={'geolocation_zip_code_prefix': str})

In [None]:
# Display the geolocation table as loaded.
geo

In [None]:
# Derive a coarser key: the first three characters of the zip-code prefix.
geo['geolocation_zip_code_prefix_3_digits'] = geo['geolocation_zip_code_prefix'].str.slice(0, 3)
geo.head(n=3)

In [None]:
# Show the columns at positions 3, 4 and 5 (presumably city, state and the
# derived 3-digit prefix -- confirm against the CSV column order).
geo[geo.columns[3:6]]

In [None]:
# Summary statistics of how many rows each zip-code prefix contributes.
pd.DataFrame(geo['geolocation_zip_code_prefix'].value_counts()).describe()

In [None]:
# Remove outliers: keep only coordinates inside Brazil's bounding box.
# Northernmost point of Brazil: 5 deg 16' 27.8" N latitude.
geo = geo.loc[geo['geolocation_lat'].le(5.27438888)]
# Westernmost point: 73 deg 58' 58.19" W longitude.
geo = geo.loc[geo['geolocation_lng'].ge(-73.98283055)]
# Southernmost point: 33 deg 45' 04.21" S latitude.
geo = geo.loc[geo['geolocation_lat'].ge(-33.75116944)]
# Easternmost point: 34 deg 47' 35.33" W longitude.
geo = geo.loc[geo['geolocation_lng'].le(-34.79314722)]

In [None]:
# Project longitude/latitude into Web Mercator coordinates so the points can
# be drawn on top of web map tiles.
from datashader.utils import lnglat_to_meters as webm

x, y = webm(geo['geolocation_lng'], geo['geolocation_lat'])
# Append the projected coordinates as the `x` and `y` columns.
geo = geo.assign(x=pd.Series(x), y=pd.Series(y))

In [None]:
# Cast both zip-code columns from text to integers for plotting.
geo = geo.astype({
    'geolocation_zip_code_prefix': int,
    'geolocation_zip_code_prefix_3_digits': int,
})

In [None]:
# `brazil` is a second name for the same DataFrame object as `geo`
# (no copy is made).
brazil = geo
# Name of the column holding the full zip-code prefix.
zipcode = 'geolocation_zip_code_prefix'

In [None]:
# Read the order-level tables used for the revenue-by-location analysis.
orders_df, order_items, order_reviews = (
    pd.read_csv(csv_name)
    for csv_name in (
        'olist_orders_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_reviews_dataset.csv',
    )
)
# Customer zip-code prefixes are read as text so they can be sliced below.
customer = pd.read_csv('olist_customers_dataset.csv', dtype={'customer_zip_code_prefix': str})

# First three characters of the customer zip-code prefix, stored as an integer
# so it is comparable with the integer 3-digit prefix on the geolocation table.
customer['customer_zip_code_prefix_3_digits'] = (
    customer['customer_zip_code_prefix'].str.slice(0, 3).astype(int)
)

# Geolocation rows indexed by their 3-digit prefix, ready to be joined on.
brazil_geo = geo.set_index('geolocation_zip_code_prefix_3_digits').copy()

In [None]:
# Display the geolocation table indexed by 3-digit zip-code prefix.
brazil_geo

In [None]:
# Combine orders with their items, customers and reviews into one wide table.
orders_copy = (
    orders_df
    .merge(order_items, on='order_id')
    .merge(customer, on='customer_id')
    .merge(order_reviews, on='order_id')
)

In [None]:
# Display the merged orders / items / customers / reviews table.
orders_copy

In [None]:
# Total item price per 3-digit customer zip-code prefix.
#
# `orders_copy` was built by merging the reviews table on `order_id`, so an
# order that has more than one review repeats each of its item rows. Summing
# `price` over the raw frame would count those items several times, so keep
# exactly one row per order item first.
# NOTE(review): assumes the order-items table carries an `order_item_id`
# column identifying the item within its order -- confirm against the CSV.
gp = (
    orders_copy
    .drop_duplicates(subset=['order_id', 'order_item_id'])
    .groupby('customer_zip_code_prefix_3_digits')['price']
    .sum()
    .to_frame()
)

# Attach the per-prefix total to every geolocation row that shares the prefix
# (join on the index of `brazil_geo`, which is the 3-digit prefix).
revenue = brazil_geo.join(gp)

# Name of the value column that the map cells aggregate and display.
average = 'revenue'
revenue[average] = revenue.price

In [None]:
# Send Bokeh output to the notebook and select Bokeh as the HoloViews backend.
output_notebook()
hv.extension('bokeh')

# Default plot options, set through the HoloViews %opts line magic.
# Overlay: 800x600 canvas, toolbar above the plot, both axes hidden.
%opts Overlay [width=800 height=600 toolbar='above' xaxis=None yaxis=None]
# QuadMesh: hover tool and colorbar enabled; alpha=0 with hover_alpha=0.2
# (presumably keeps the mesh invisible until a cell is hovered -- confirm).
%opts QuadMesh [tools=['hover'] colorbar=True] (alpha=0 hover_alpha=0.2)

def plot_map(data, label, datavalue, zipcode):
    """Build an interactive tile map of ``data`` shaded by an aggregated value.

    The value column is taken from the ``zipcode`` argument rather than from a
    module-level variable, so the function works for any column the caller
    names.

    Parameters
    ----------
    data
        Table with ``x`` and ``y`` coordinate columns and the value column
        named by ``zipcode``.
    label : str
        Title given to the returned overlay.
    datavalue
        Aggregator handed to ``rasterize`` and ``datashade`` (for example
        ``ds.mean('revenue')``); the column it reduces should be the one
        named by ``zipcode``.
    zipcode : str
        Name of the value column exposed to the aggregator.

    Returns
    -------
    The overlay ``tiles * shaded points * hover mesh``, relabelled with
    ``label``.
    """
    points = hv.Points(gv.Dataset(data, kdims=['x', 'y'], vdims=[zipcode]))

    # Coarse 40x30 mesh, re-aggregated whenever the visible range changes,
    # that carries the hover read-out.
    hover = hv.util.Dynamic(
        rasterize(points, aggregator=datavalue, width=40, height=30, streams=[RangeXY]),
        operation=hv.QuadMesh,
    )
    hover = hover.options(cmap=rainbow)

    # Dark-grey base map tiles.
    url = "http://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{Z}/{Y}/{X}.png"
    geomap = gv.WMTS(url)

    # Shaded point layer, passed through dynspread.
    shaded = datashade(points, element_type=gv.Image, aggregator=datavalue, cmap=rainbow)
    zip_codes = dynspread(shaded, threshold=0.05, max_px=1)

    img = geomap * zip_codes * hover
    return img.relabel(label)

In [None]:
# Draw the revenue map: each cell shows the mean of the `revenue` column.
plot_map(revenue, 'Revenue from Orders Based on location (zipcode)', ds.mean(average), average)

In [None]:
pip install plotly

In [None]:
pip install cufflinks

In [None]:
import plotly.express as px

In [None]:
# Density heat map of revenue by location.
# The map may not load; a video recording of the results is shown in the
# PowerPoint slides.
fig = px.density_mapbox(
    revenue,
    lat='geolocation_lat',
    lon='geolocation_lng',
    z='revenue',
    radius=2,
    # Centre the view on the data itself; every point was filtered to Brazil's
    # bounding box earlier, so a fixed centre at lat 0 / lon 180 would open on
    # the wrong side of the globe.
    center=dict(
        lat=revenue['geolocation_lat'].mean(),
        lon=revenue['geolocation_lng'].mean(),
    ),
    zoom=3,
    # Token-free base map; the Stamen styles need a Stadia Maps account.
    mapbox_style="open-street-map",
)
fig.show()

#### Again, this map confirms our findings: most revenue is generated in SP, RJ, and ES. Since this is the general trend, we recommend that sellers cater their products to these regions.

### Based on the geospatial analysis, we gain a good understanding of customer demographic trends in this e-commerce marketplace.