# Description

## Features used
Every feature used has to do with human circulation as I think they are very likely to influence given criterias. It includes :
 - Number of taxis and their average speed (within 50m, 100m and 200m radiuses)
 - Number of mobike starting and ending points (within 50m, 100m and 200m radiuses)
 - Number of subway stations within a 1km radius and the distance from the nearest one (or 2km if over 1km)
 
## Target
The target in conglomerate (stocked in the *conglomerate column*) of the clean, noise and smell factor in a string like so "clean, noise, smell".

# Imports

In [192]:
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.spatial import distance
import math
from random import sample
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
from IPython.display import clear_output
import time

# Taxis

In [8]:
import pickle5 as pickle
with open ('pickles/utseus-shanghai-taxi-speed-position-cleaned-new.pkl','rb') as fp:
    taxis = pickle.load(fp)
with open ('pickles/TARGET-communities-extract-wellbeing-3params-cleaned.pkl','rb') as fp:
    target = pickle.load(fp)


In [3]:
taxis = pd.read_pickle("pickles/utseus-shanghai-taxi-speed-position-cleaned-new.pkl")
target = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-cleaned.pkl")

In [191]:
from pyproj import CRS

mercator = CRS.from_epsg(4326)
china = CRS.from_epsg(4479)

from pyproj import Transformer

transformer = Transformer.from_crs(mercator, china, always_xy=True)

In [6]:
target["transformed_longitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[0], axis = 1)
target["transformed_latitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[1], axis = 1)
taxis["transformed_longitude"] = taxis.apply(lambda x : transformer.transform(x["lon"], x["lat"])[0], axis = 1)
taxis["transformed_latitude"] = taxis.apply(lambda x : transformer.transform(x["lon"], x["lat"])[1], axis = 1)

In [16]:
len(taxis["transformed_latitude"])

774794

In [7]:
gdf_taxis = gpd.GeoDataFrame(taxis,geometry=gpd.points_from_xy(taxis.transformed_longitude,taxis.transformed_latitude))
gdf_target = gpd.GeoDataFrame(target,geometry=gpd.points_from_xy(target.transformed_longitude,target.transformed_latitude))

#### 50

In [14]:
%%time

influenceRadius = 50
influenceRadiusSq = influenceRadius ** 2 

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 19.3 µs


In [15]:
%%time

filtered_taxis=[]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target.iterrows():
    gdf_f = gdf_taxis[gdf_taxis.geometry.within(g.geometry.buffer(influenceRadius))]
    avg_speed = gdf_f['speed'].mean()
    count = len(gdf_f)
    filtered_taxis.append([avg_speed,count])
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

421 / 421  done (~ 0h0m1s remaining)
CPU times: user 1h 4min 36s, sys: 3.45 s, total: 1h 4min 39s
Wall time: 1h 4min 50s


In [16]:
filtered_taxis = np.array(filtered_taxis)

target["nearTaxisAmount"] = filtered_taxis[:,1]
target["meanTaxisSpeed"] = filtered_taxis[:,0]
target.head(n=2)

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,transformed_longitude,transformed_latitude,geometry,nearTaxisAmount,meanTaxisSpeed
1,万寿社区居委会,3,0.0,0.0,盈浦街道,青浦区,121.104287016932,31.1558823799679,-2822208.0,4677636.0,POINT (-2822207.587 4677635.627),0.0,
2,万泰花园第一居委会,3,1.0,0.0,七宝镇,闵行区,121.351529089795,31.144326843509,-2842711.0,4665980.0,POINT (-2842710.938 4665979.726),136.0,0.547059


In [17]:
target[["district", "town", "nearTaxisAmount", "meanTaxisSpeed", "noise", "clean", "smell", "longitude", "latitude"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-50m.pkl")

#### 100

In [33]:
influenceRadius = 100
influenceRadiusSq = influenceRadius ** 2 

In [34]:
%%time

filtered_taxis=[]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target.iterrows():
    gdf_f = gdf_taxis[gdf_taxis.geometry.within(g.geometry.buffer(influenceRadius))]
    avg_speed = gdf_f['speed'].mean()
    count = len(gdf_f)
    filtered_taxis.append([avg_speed,count])
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

5 / 421  done (~ 2h1m29s remaining)


KeyboardInterrupt: 

In [35]:
filtered_taxis = np.array(filtered_taxis)

target["nearTaxisAmount"] = filtered_taxis[:,1]
target["meanTaxisSpeed"] = filtered_taxis[:,0]
target.head(n=2)

ValueError: Length of values does not match length of index

In [None]:
target[["district", "town", "nearTaxisAmount", "meanTaxisSpeed", "noise", "clean", "smell", "longitude", "latitude"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-100m.pkl")

#### 200

In [22]:
influenceRadius = 200
influenceRadiusSq = influenceRadius ** 2 

In [23]:
%%time

filtered_taxis=[]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target.iterrows():
    gdf_f = gdf_taxis[gdf_taxis.geometry.within(g.geometry.buffer(influenceRadius))]
    avg_speed = gdf_f['speed'].mean()
    count = len(gdf_f)
    filtered_taxis.append([avg_speed,count])
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

421 / 421  done (~ 0h0m1s remaining)
CPU times: user 1h 3min 13s, sys: 2.99 s, total: 1h 3min 16s
Wall time: 1h 3min 24s


In [24]:
filtered_taxis = np.array(filtered_taxis)

target["nearTaxisAmount"] = filtered_taxis[:,1]
target["meanTaxisSpeed"] = filtered_taxis[:,0]
target.head(n=2)

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,transformed_longitude,transformed_latitude,geometry,nearTaxisAmount,meanTaxisSpeed
1,万寿社区居委会,3,0.0,0.0,盈浦街道,青浦区,121.104287016932,31.1558823799679,-2822208.0,4677636.0,POINT (-2822207.587 4677635.627),0.0,
2,万泰花园第一居委会,3,1.0,0.0,七宝镇,闵行区,121.351529089795,31.144326843509,-2842711.0,4665980.0,POINT (-2842710.938 4665979.726),136.0,0.547059


In [25]:
target[["district", "town", "nearTaxisAmount", "meanTaxisSpeed", "noise", "clean", "smell", "longitude", "latitude"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-200m.pkl")

In [27]:
target["nearTaxisAmount200"] = target["nearTaxisAmount"]
target["meanTaxisSpeed200"] = target["meanTaxisSpeed"]

In [30]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-100m.pkl")
target["nearTaxisAmount100"] = temp["nearTaxisAmount"]
target["meanTaxisSpeed100"] = temp["meanTaxisSpeed"]

In [31]:
target[target["meanTaxisSpeed50"] < target[]]

Unnamed: 0,Unnamed: 1,clean,smell,noise,town,district,longitude,latitude,transformed_longitude,transformed_latitude,geometry,nearTaxisAmount,meanTaxisSpeed,nearTaxisAmount200,meanTaxisSpeed200,nearTaxisAmount50,meanTaxisSpeed50,nearTaxisAmount100,meanTaxisSpeed100
1,万寿社区居委会,3,0.0,0.0,盈浦街道,青浦区,121.104287016932,31.1558823799679,-2822208.0,4677636.0,POINT (-2822207.587 4677635.627),0.0,,0.0,,0.0,,0.0,
2,万泰花园第一居委会,3,1.0,0.0,七宝镇,闵行区,121.351529089795,31.144326843509,-2842711.0,4665980.0,POINT (-2842710.938 4665979.726),136.0,0.547059,136.0,0.547059,136.0,0.547059,136.0,0.547059
3,万科阳光苑居委会,2,0.0,0.0,吴泾镇,闵行区,121.468520387505,31.0422853979528,-2855282.0,4665149.0,POINT (-2855282.425 4665148.864),0.0,,0.0,,0.0,,0.0,
4,万豪居委会,3,0.0,0.0,长征镇,普陀区,121.367237458941,31.2407251034807,-2841109.0,4660474.0,POINT (-2841108.749 4660473.749),15.0,15.24,15.0,15.24,15.0,15.24,15.0,15.24
5,万里名轩社区居委会,2,,0.0,万里街道,普陀区,121.403601417271,31.269155049366,-2843214.0,4657274.0,POINT (-2843213.856 4657273.733),0.0,,0.0,,0.0,,0.0,


# Mobikes

In [34]:
mobikes = pd.read_pickle("pickles/utseus-mobike-2016-cleaned.pk")
target = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-cleaned.pkl")

In [37]:
target["transformed_longitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[0], axis = 1)
target["transformed_latitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[1], axis = 1)
mobikes["new_start_x"] = mobikes.apply(lambda x : transformer.transform(x["start_location_x"], x["start_location_y"])[0], axis = 1)
mobikes["new_start_y"] = mobikes.apply(lambda x : transformer.transform(x["start_location_x"], x["start_location_y"])[1], axis = 1)
mobikes["new_end_x"] = mobikes.apply(lambda x : transformer.transform(x["end_location_x"], x["end_location_y"])[0], axis = 1)
mobikes["new_end_y"] = mobikes.apply(lambda x : transformer.transform(x["end_location_x"], x["end_location_y"])[1], axis = 1)

In [40]:
gdf_mobikes = gpd.GeoDataFrame(mobikes)
gdf_target = gpd.GeoDataFrame(target,geometry=gpd.points_from_xy(target.transformed_longitude,target.transformed_latitude))

In [41]:
gdf_mobikes["startGeometry"] = gpd.points_from_xy(gdf_mobikes["new_start_x"],gdf_mobikes["new_start_y"])
gdf_mobikes["endGeometry"] = gpd.points_from_xy(gdf_mobikes["new_end_x"],gdf_mobikes["new_end_y"])

### 50

In [48]:
influenceRadius = 50

In [48]:
%%time 

mobikesNumberStart=[]
gdf_mobikes["geometry"] = gdf_mobikes["startGeometry"]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target[0:50].iterrows():
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    start = len(gdf_f)
    
    mobikesNumberStart.append(start)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

13 / 421  done (~ 0h42m54s remaining)


KeyboardInterrupt: 

In [58]:
%%time 

done = 1
total = len(target)
startTime = time.time()

mobikesNumberEnd=[]
gdf_mobikes["geometry"] = gdf_mobikes["endGeometry"]

for i, g in gdf_target.iterrows():
    
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    end = len(gdf_f)
    
    mobikesNumberEnd.append(end)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

8 / 421  done (~ 0h44m37s remaining)


KeyboardInterrupt: 

In [56]:
target["nearStartingMobikes"] = mobikesNumberStart
target["nearEndingMobikes"] = mobikesNumberEnd

In [None]:
target[["district", "town", "noise", "clean", "smell", "longitude", "latitude", "nearStartingMobikes", "nearEndingMobikes"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-50m.pkl")

### 100

In [48]:
influenceRadius = 100

In [48]:
%%time 

mobikesNumberStart=[]
gdf_mobikes["geometry"] = gdf_mobikes["startGeometry"]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target[0:50].iterrows():
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    start = len(gdf_f)
    
    mobikesNumberStart.append(start)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

13 / 421  done (~ 0h42m54s remaining)


KeyboardInterrupt: 

In [58]:
%%time 

done = 1
total = len(target)
startTime = time.time()

mobikesNumberEnd=[]
gdf_mobikes["geometry"] = gdf_mobikes["endGeometry"]

for i, g in gdf_target.iterrows():
    
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    end = len(gdf_f)
    
    mobikesNumberEnd.append(end)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

8 / 421  done (~ 0h44m37s remaining)


KeyboardInterrupt: 

In [56]:
target["nearStartingMobikes"] = mobikesNumberStart
target["nearEndingMobikes"] = mobikesNumberEnd

In [None]:
target[["district", "town", "noise", "clean", "smell", "longitude", "latitude", "nearStartingMobikes", "nearEndingMobikes"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-100m.pkl")

### 200

In [48]:
influenceRadius = 200

In [48]:
%%time 

mobikesNumberStart=[]
gdf_mobikes["geometry"] = gdf_mobikes["startGeometry"]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target[0:50].iterrows():
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    start = len(gdf_f)
    
    mobikesNumberStart.append(start)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

13 / 421  done (~ 0h42m54s remaining)


KeyboardInterrupt: 

In [58]:
%%time 

done = 1
total = len(target)
startTime = time.time()

mobikesNumberEnd=[]
gdf_mobikes["geometry"] = gdf_mobikes["endGeometry"]

for i, g in gdf_target.iterrows():
    
    gdf_f = gdf_mobikes[gdf_mobikes.geometry.within(g.geometry.buffer(influenceRadius))]
    end = len(gdf_f)
    
    mobikesNumberEnd.append(end)
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

8 / 421  done (~ 0h44m37s remaining)


KeyboardInterrupt: 

In [56]:
target["nearStartingMobikes"] = mobikesNumberStart
target["nearEndingMobikes"] = mobikesNumberEnd

In [None]:
target[["district", "town", "noise", "clean", "smell", "longitude", "latitude", "nearStartingMobikes", "nearEndingMobikes"]].to_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-200m.pkl")

## Try ML

In [2]:
target = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-50m.pkl")
target.head()

Unnamed: 0,district,town,nearTaxisAmount,meanTaxisSpeed,noise,clean,smell,longitude,latitude
1,青浦区,盈浦街道,0.0,,0.0,3,0.0,121.104287016932,31.1558823799679
2,闵行区,七宝镇,136.0,0.547059,0.0,3,1.0,121.351529089795,31.144326843509
3,闵行区,吴泾镇,0.0,,0.0,2,0.0,121.468520387505,31.0422853979528
4,普陀区,长征镇,15.0,15.24,0.0,3,0.0,121.367237458941,31.2407251034807
5,普陀区,万里街道,0.0,,0.0,2,,121.403601417271,31.269155049366


In [3]:
target["nearTaxisAmount50"] = target["nearTaxisAmount"]
target["meanTaxisSpeed50"] = target["meanTaxisSpeed"]

In [4]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-100m.pkl")
target["nearTaxisAmount100"] = temp["nearTaxisAmount"]
target["meanTaxisSpeed100"] = temp["meanTaxisSpeed"]

In [5]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-taxis-new-200m.pkl")
target["nearTaxisAmount200"] = temp["nearTaxisAmount"]
target["meanTaxisSpeed200"] = temp["meanTaxisSpeed"]

In [7]:
target.head()

Unnamed: 0,district,town,nearTaxisAmount,meanTaxisSpeed,noise,clean,smell,longitude,latitude,nearTaxisAmount50,meanTaxisSpeed50,nearTaxisAmount100,meanTaxisSpeed100,nearTaxisAmount200,meanTaxisSpeed200
1,青浦区,盈浦街道,0.0,,0.0,3,0.0,121.104287016932,31.1558823799679,0.0,,79.0,0.036709,200.0,0.111
2,闵行区,七宝镇,136.0,0.547059,0.0,3,1.0,121.351529089795,31.144326843509,136.0,0.547059,291.0,0.794845,566.0,9.174028
3,闵行区,吴泾镇,0.0,,0.0,2,0.0,121.468520387505,31.0422853979528,0.0,,7.0,0.0,285.0,0.312982
4,普陀区,长征镇,15.0,15.24,0.0,3,0.0,121.367237458941,31.2407251034807,15.0,15.24,97.0,15.676289,430.0,14.56814
5,普陀区,万里街道,0.0,,0.0,2,,121.403601417271,31.269155049366,0.0,,51.0,12.829412,317.0,17.354259


In [10]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-50m.pkl")
target["nearStartingMobikes50"] = temp["nearStartingMobikes"]
target["nearEndingMobikes50"] = temp["nearEndingMobikes"]

In [11]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-100m.pkl")
target["nearStartingMobikes100"] = temp["nearStartingMobikes"]
target["nearEndingMobikes100"] = temp["nearEndingMobikes"]

In [12]:
temp = pd.read_pickle("pickles/TARGET-communities-extract-wellbeing-3params-with-mobikes-200m.pkl")
target["nearStartingMobikes200"] = temp["nearStartingMobikes"]
target["nearEndingMobikes200"] = temp["nearEndingMobikes"]

In [15]:
target.head()

Unnamed: 0,district,town,nearTaxisAmount,meanTaxisSpeed,noise,clean,smell,longitude,latitude,nearTaxisAmount50,...,nearTaxisAmount100,meanTaxisSpeed100,nearTaxisAmount200,meanTaxisSpeed200,nearStartingMobikes50,nearEndingMobikes50,nearStartingMobikes100,nearEndingMobikes100,nearStartingMobikes200,nearEndingMobikes200
1,青浦区,盈浦街道,0.0,,0.0,3,0.0,121.104287016932,31.1558823799679,0.0,...,79.0,0.036709,200.0,0.111,0,0,0,0,0,0
2,闵行区,七宝镇,136.0,0.547059,0.0,3,1.0,121.351529089795,31.144326843509,136.0,...,291.0,0.794845,566.0,9.174028,3,1,9,4,50,43
3,闵行区,吴泾镇,0.0,,0.0,2,0.0,121.468520387505,31.0422853979528,0.0,...,7.0,0.0,285.0,0.312982,1,1,8,3,27,23
4,普陀区,长征镇,15.0,15.24,0.0,3,0.0,121.367237458941,31.2407251034807,15.0,...,97.0,15.676289,430.0,14.56814,47,42,58,54,543,563
5,普陀区,万里街道,0.0,,0.0,2,,121.403601417271,31.269155049366,0.0,...,51.0,12.829412,317.0,17.354259,0,2,86,89,848,858


In [36]:
def doConglomerate(x) :
    c = x["clean"]
    if (x["noise"] == "1.0") :
        n = 1
    else :
        n = 0
    if (x["smell"] == "1.0") :
        s = 1
    else :
        s = 0
    return str(c) + ", " + str(n) + ", " + str(s) 

In [37]:
target["conglomerate"] = target.apply(doConglomerate, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target["conglomerate"] = target.apply(doConglomerate, axis = 1)


In [38]:
target.head()

Unnamed: 0,district,town,nearTaxisAmount,meanTaxisSpeed,noise,clean,smell,longitude,latitude,nearTaxisAmount50,...,meanTaxisSpeed100,nearTaxisAmount200,meanTaxisSpeed200,nearStartingMobikes50,nearEndingMobikes50,nearStartingMobikes100,nearEndingMobikes100,nearStartingMobikes200,nearEndingMobikes200,conglomerate
1,青浦区,盈浦街道,0.0,,0.0,3,0.0,121.104287016932,31.1558823799679,0.0,...,0.036709,200.0,0.111,0,0,0,0,0,0,"3, 0, 0"
2,闵行区,七宝镇,136.0,0.547059,0.0,3,1.0,121.351529089795,31.144326843509,136.0,...,0.794845,566.0,9.174028,3,1,9,4,50,43,"3, 0, 1"
3,闵行区,吴泾镇,0.0,,0.0,2,0.0,121.468520387505,31.0422853979528,0.0,...,0.0,285.0,0.312982,1,1,8,3,27,23,"2, 0, 0"
4,普陀区,长征镇,15.0,15.24,0.0,3,0.0,121.367237458941,31.2407251034807,15.0,...,15.676289,430.0,14.56814,47,42,58,54,543,563,"3, 0, 0"
6,浦东新区,川沙新镇,2.0,23.6,0.0,3,0.0,121.668505522212,31.1953057158861,2.0,...,20.3,73.0,63.193151,0,0,4,5,7,8,"3, 0, 0"


In [39]:
for value in target[(target["noise"] != "") & (target["smell"] != "")]["conglomerate"].unique() :
    print(value) 

In [45]:
target = target[['district', 'town', 'noise', 'clean', 'smell', 'longitude', 'latitude', 'nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200', 'conglomerate']]

In [213]:
target.to_pickle("pickles/target_final-v2.pkl")

In [51]:
target = target.fillna(0)

In [214]:
target.columns

Index(['district', 'town', 'noise', 'clean', 'smell', 'longitude', 'latitude',
       'nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100',
       'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200',
       'nearStartingMobikes50', 'nearEndingMobikes50',
       'nearStartingMobikes100', 'nearEndingMobikes100',
       'nearStartingMobikes200', 'nearEndingMobikes200', 'conglomerate',
       'transformed_longitude', 'transformed_latitude', 'geometry',
       'nSubwayStations1km', 'nearestSubwayStation'],
      dtype='object')

In [187]:
sbwstt = pd.read_pickle("pickles/subway_meters.pk")

In [188]:
sbwstt

Unnamed: 0.1,Unnamed: 0,_id,exploreid,name_en,name_zh_Hans,lines,lat,lng,x,y,code,phonetic,multiline,transformed_longitude,transformed_latitude
0,0,449,279,Xinzhuang,莘庄,15,31.113035,121.380655,10240708,-42935534,,Xīnzhuāng,0,-2.846016e+06,4.666065e+06
1,1,450,3,Waihuanlu,外环路,1,31.122854,121.388599,15814733,-37358774,,Wàihuánlù,0,-2.846370e+06,4.665190e+06
2,2,451,4,Lianhua Road,莲花路,1,31.133039,121.398481,21208778,-31956624,,Liánhuā Lù,0,-2.846871e+06,4.664201e+06
3,3,452,5,Jinjiang Park,锦江乐园,1,31.143994,121.409714,26638853,-26527039,,Jǐnjiāng Lèyuán,0,-2.847458e+06,4.663107e+06
4,4,453,639,Shanghai South Railway Station,上海南站,13,31.155858,121.425249,32104868,-20912684,,Shànghǎi Nánzhàn,0,-2.848367e+06,4.661754e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,282,743,1609,Huinan,惠南,S,31.055915,121.758416,146469255,-69867890,,Huìnán,0,-2.878439e+06,4.649980e+06
283,283,744,1610,East Huinan,惠南东,S,31.028795,121.789590,150638644,-74013108,,Huìnán Dōng,0,-2.881786e+06,4.649731e+06
284,284,745,1611,Shuyuan,书院,S,30.961529,121.846678,154808033,-78158327,,Shūyuàn,0,-2.888444e+06,4.650120e+06
285,285,746,1612,Lingang Avenue,临港大道,S,30.925973,121.906598,158977423,-82303545,,Língǎng Dàdào,0,-2.894377e+06,4.648817e+06


In [193]:
target["transformed_longitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[0], axis = 1)
target["transformed_latitude"] = target.apply(lambda x : transformer.transform(x["longitude"], x["latitude"])[1], axis = 1)

In [194]:
gdf_sbwstt = gpd.GeoDataFrame(sbwstt, geometry=gpd.points_from_xy(sbwstt.transformed_longitude,sbwstt.transformed_latitude))

In [195]:
gdf_target = gpd.GeoDataFrame(target,geometry=gpd.points_from_xy(target.transformed_longitude,target.transformed_latitude))

In [206]:
%%time

influenceRadius = 1000
influenceRadiusSq = influenceRadius ** 2 

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 18.1 µs


In [209]:
%%time

filtered_stations=[]

done = 1
total = len(target)
startTime = time.time()

for i, g in gdf_target.iterrows():
    gdf_f = gdf_sbwstt[gdf_sbwstt.geometry.within(g.geometry.buffer(influenceRadius))]
    count = len(gdf_f)
    if (count != 0) :
        minDist = gdf_f.distance(g.geometry).min()
    else :
        minDist = influenceRadius * 2 
    filtered_stations.append([count, minDist])
    
    clear_output(wait=True)
    timer = time.time()
    remainingTime = int((timer - startTime) / done * (total - done)) + 1
    print(done,"/",total," done (~ " + str(remainingTime//3600) + "h" + str((remainingTime%3600)//60) + "m" + str(remainingTime%60) + "s " + "remaining)")
    done+=1 

358 / 358  done (~ 0h0m1s remaining)
CPU times: user 4.29 s, sys: 408 ms, total: 4.7 s
Wall time: 4.64 s


In [212]:
filtered_stations = np.array(filtered_stations)

target["nSubwayStations1km"] = filtered_stations[:,0]
target["nearestSubwayStation"] = filtered_stations[:,1]
target.head(n=2)

Unnamed: 0,district,town,noise,clean,smell,longitude,latitude,nearTaxisAmount50,meanTaxisSpeed50,nearTaxisAmount100,...,nearStartingMobikes100,nearEndingMobikes100,nearStartingMobikes200,nearEndingMobikes200,conglomerate,transformed_longitude,transformed_latitude,geometry,nSubwayStations1km,nearestSubwayStation
1,青浦区,盈浦街道,0.0,3,0.0,121.104287016932,31.1558823799679,0.0,0.0,79.0,...,0,0,0,0,"3, 0, 0",-2822208.0,4677636.0,POINT (-2822207.587 4677635.627),0.0,2000.0
2,闵行区,七宝镇,0.0,3,1.0,121.351529089795,31.144326843509,136.0,0.547059,291.0,...,9,4,50,43,"3, 0, 1",-2842711.0,4665980.0,POINT (-2842710.938 4665979.726),0.0,2000.0


In [211]:
target.head()

Unnamed: 0,district,town,noise,clean,smell,longitude,latitude,nearTaxisAmount50,meanTaxisSpeed50,nearTaxisAmount100,...,nearStartingMobikes50,nearEndingMobikes50,nearStartingMobikes100,nearEndingMobikes100,nearStartingMobikes200,nearEndingMobikes200,conglomerate,transformed_longitude,transformed_latitude,geometry
1,青浦区,盈浦街道,0.0,3,0.0,121.104287016932,31.1558823799679,0.0,0.0,79.0,...,0,0,0,0,0,0,"3, 0, 0",-2822208.0,4677636.0,POINT (-2822207.587 4677635.627)
2,闵行区,七宝镇,0.0,3,1.0,121.351529089795,31.144326843509,136.0,0.547059,291.0,...,3,1,9,4,50,43,"3, 0, 1",-2842711.0,4665980.0,POINT (-2842710.938 4665979.726)
3,闵行区,吴泾镇,0.0,2,0.0,121.468520387505,31.0422853979528,0.0,0.0,7.0,...,1,1,8,3,27,23,"2, 0, 0",-2855282.0,4665149.0,POINT (-2855282.425 4665148.864)
4,普陀区,长征镇,0.0,3,0.0,121.367237458941,31.2407251034807,15.0,15.24,97.0,...,47,42,58,54,543,563,"3, 0, 0",-2841109.0,4660474.0,POINT (-2841108.749 4660473.749)
6,浦东新区,川沙新镇,0.0,3,0.0,121.668505522212,31.1953057158861,2.0,23.6,4.0,...,0,0,4,5,7,8,"3, 0, 0",-2866945.0,4647692.0,POINT (-2866944.966 4647691.900)


### Imports

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

### Goal

In [93]:
"Random = " + str(1/len(target.conglomerate.unique()))

'Random = 0.07692307692307693'

### Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier

In [215]:
%%time
X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200','nSubwayStations1km', 'nearestSubwayStation']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)

paramGrid = {"max_depth" : [3,5,10,20,25,50], "n_estimators" : [2, 5, 10, 25, 50] , "criterion" : ["gini", "entropy"]}
search = GridSearchCV(RandomForestClassifier(random_state = 666),paramGrid, cv=ShuffleSplit(n_splits = 5))
search.fit(X, Y)

CPU times: user 15.7 s, sys: 37.9 ms, total: 15.7 s
Wall time: 16.6 s


GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                          

In [216]:
search.best_score_

0.43888888888888883

In [217]:
search.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 25}

In [218]:
search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=666,
                       verbose=0, warm_start=False)

### Nearest Neighbors

In [219]:
%%time

X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200','nSubwayStations1km', 'nearestSubwayStation']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)

from sklearn.neighbors import KNeighborsClassifier
paramGrid = {"n_neighbors" : [1, 2, 5, 10, 20, 25,50, 100,150, 200]}
search = GridSearchCV(KNeighborsClassifier(),paramGrid, cv=ShuffleSplit(n_splits = 5))
search.fit(X,Y)

CPU times: user 576 ms, sys: 6.45 ms, total: 582 ms
Wall time: 957 ms


GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 5, 10, 20, 25, 50, 100, 150,
                                         200]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [220]:
search.best_score_

0.3833333333333333

In [221]:
search.best_params_

{'n_neighbors': 150}

In [222]:
search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=150, p=2,
                     weights='uniform')

### AdaBoost

In [88]:
%%time

X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)
xTrain, xValidate, yTrain, yValidate = train_test_split(X, Y, test_size = 0.20)

from sklearn.ensemble import GradientBoostingClassifier
#paramGrid = {"n_estimators" : [1, 2, 5, 10, 20, 25, 50, 100, 150], "learning_rate" : [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0],"max_depth" : [1, 2, 3, 5, 10, 25]}
search = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
search.fit(xTrain,yTrain)

CPU times: user 1.61 s, sys: 13.1 ms, total: 1.63 s
Wall time: 1.64 s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=1.0, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [89]:
search.score(xValidate, yValidate)

0.013888888888888888

### KKN+NCA (Pipe)

In [149]:
X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)
xTrain, xTest, yTrain, yTest = train_test_split(Xrescaled, Y, test_size = 0.20)
xTrain, xValidate, yTrain, yValidate = train_test_split(xTrain, yTrain, test_size = 0.20)

from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
#paramGrid = {"n_neighbors" : [1, 2, 5, 10, 20, 25]}
search = Pipeline([('nca', NeighborhoodComponentsAnalysis()), ('knn', KNeighborsClassifier(n_neighbors = 2))])
search.fit(xTrain,yTrain)

Pipeline(memory=None,
         steps=[('nca',
                 NeighborhoodComponentsAnalysis(callback=None, init='auto',
                                                max_iter=50, n_components=None,
                                                random_state=None, tol=1e-05,
                                                verbose=0, warm_start=False)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)

In [150]:
search.score(xValidate, yValidate)

0.3448275862068966

## Working on ...

### KNN (CHOOSEN AND SAVED)

In [314]:
%%time

X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200','nSubwayStations1km', 'nearestSubwayStation']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)
xTrain, xValidate, yTrain, yValidate = train_test_split(X, Y, test_size = 0.20)

from sklearn.neighbors import KNeighborsClassifier
paramGrid = {"n_neighbors" : [1, 2, 5, 10, 20, 25,50, 100,150, 200]}
search = KNeighborsClassifier(n_neighbors = 50)
search.fit(xTrain,yTrain)

CPU times: user 26.6 ms, sys: 12 µs, total: 26.7 ms
Wall time: 26.5 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                     weights='uniform')

In [315]:
yPred = search.predict(xValidate)

In [316]:
from sklearn.metrics import classification_report

In [317]:
print(classification_report(yValidate, yPred, zero_division=1))

              precision    recall  f1-score   support

     1, 0, 0       1.00      0.00      0.00         4
     1, 0, 1       1.00      0.00      0.00         1
     2, 0, 0       0.20      0.04      0.07        23
     2, 1, 0       1.00      0.00      0.00         3
     3, 0, 0       0.51      0.94      0.66        36
     3, 1, 0       1.00      0.00      0.00         4
     4, 0, 0       1.00      0.00      0.00         1

    accuracy                           0.49        72
   macro avg       0.10      0.14      0.10        72
weighted avg       0.32      0.49      0.35        72



  _warn_prf(average, modifier, msg_start, len(result))


In [318]:
yPred

array(['3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '2, 0, 0', '2, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '2, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '2, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0'],
      dtype=object)

### Random Forest

In [251]:
%%time

X = target[['nearTaxisAmount50', 'meanTaxisSpeed50', 'nearTaxisAmount100', 'meanTaxisSpeed100', 'nearTaxisAmount200', 'meanTaxisSpeed200', 'nearStartingMobikes50', 'nearEndingMobikes50', 'nearStartingMobikes100', 'nearEndingMobikes100', 'nearStartingMobikes200', 'nearEndingMobikes200','nSubwayStations1km', 'nearestSubwayStation']]
Y = target["conglomerate"]
Xrescaled = MinMaxScaler().fit_transform(X)
xTrain, xValidate, yTrain, yValidate = train_test_split(X, Y, test_size = 0.20)

from sklearn.neighbors import KNeighborsClassifier
paramGrid = {"n_neighbors" : [1, 2, 5, 10, 20, 25,50, 100,150, 200]}
search = RandomForestClassifier(criterion = 'entropy', max_depth = 5, n_estimators= 50)
search.fit(xTrain,yTrain)

CPU times: user 248 ms, sys: 6.31 ms, total: 255 ms
Wall time: 260 ms


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [252]:
yPred = search.predict(xValidate)

In [253]:
from sklearn.metrics import classification_report

In [254]:
print(classification_report(yValidate, yPred))

              precision    recall  f1-score   support

     1, 1, 0       0.00      0.00      0.00         1
     2, 0, 0       0.60      0.22      0.32        27
     2, 0, 1       0.00      0.00      0.00         2
     2, 1, 0       0.00      0.00      0.00         7
     2, 1, 1       0.00      0.00      0.00         1
     3, 0, 0       0.39      0.89      0.55        27
     3, 0, 1       0.00      0.00      0.00         1
     3, 1, 0       0.00      0.00      0.00         4
     4, 0, 0       0.00      0.00      0.00         1
     4, 1, 0       0.00      0.00      0.00         1

    accuracy                           0.42        72
   macro avg       0.10      0.11      0.09        72
weighted avg       0.37      0.42      0.33        72



  _warn_prf(average, modifier, msg_start, len(result))


In [255]:
yPred

array(['3, 0, 0', '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '2, 0, 0', '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '2, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '2, 0, 0', '3, 0, 0', '2, 0, 0', '3, 0, 0', '3, 1, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0',
       '3, 0, 0', '3, 0, 0', '2, 0, 0', '3, 0, 0', '3, 0, 0', '3, 0, 0'],
      dtype=object)

In [None]:
search.

## Save

In [319]:
import pickle as pkl

In [321]:
pkl.dump(search, open("knn-machine-learning.pkl", "wb"))

# Conclusion

The KNN algorithm gave me a f1 score of 0.49 (over 13 unsorted categories). I did not manage to find a way to sort them and, even if I am not fully satisfied with the result, it is not a bad outcome.