In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from geopy.geocoders import ArcGIS
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from category_encoders import LeaveOneOutEncoder
from category_encoders.wrapper import NestedCVWrapper

%matplotlib inline

**warning**: este notebook usa un servicio de geocoding y tarda **mucho** en correr. Para facilitar las cosas guardo los datos obtenidos en `geo.cvs` y hago el análisis en un notebook separado (`analyze_location.ipynb`).

In [2]:
train_dataset = pd.read_csv('../dataset/train.csv', index_col=0)
test_dataset = pd.read_csv('../dataset/test.csv', index_col=0)
data = pd.concat([test_dataset, train_dataset])
data = data.drop(columns=['text', 'keyword'])
data = data[data['location'].notnull()]
data['location'] = data['location'].map(lambda x: x.lower())
data.sort_index(inplace=True)
data

Unnamed: 0_level_0,location,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
46,london,
47,niall's place | saf 12 squad |,
48,birmingham,1.0
49,est. september 2012 - bristol,0.0
50,africa,1.0
...,...,...
10826,tn,0.0
10829,#newcastleupontyne #uk,0.0
10831,"vancouver, canada",0.0
10832,london,0.0


### Coordenadas
Se puede ver que algunas ubicaciones son coordenadas. Trato de idenificarlas y buscar sus datos

In [3]:
def to_coord(loc):
    try:
        x, y = loc.split(',')
        x = float(re.findall(r'-?[\d\.]+', x)[0])
        if not (-90 < x < 90): return np.nan
        y = float(re.findall(r'-?[\d\.]+', y)[0])
        if not (-180 < y <180): return np.nan
        return x, y
    except:
        return np.nan

coords = data['location'].map(to_coord).dropna()
coords

id
196                       (19.600858, -99.047821)
1350                      (30.307558, -81.403118)
1433                      (39.982988, -75.261624)
1794                      (41.373061, -71.942237)
1973                     (-27.499212, 153.011072)
2460                      (-6.152261, 106.775995)
2499                      (41.252426, -96.072013)
2568                     (21.462446, -158.022017)
2616                       (52.479722, 62.184971)
2984                      (-26.695807, 27.837865)
3101     (10.614817868480726, 12.195582811791382)
3114                        (46.950109, 7.439469)
3256                                   (0.0, 0.0)
3389                      (36.142163, -95.979189)
3402                      (40.736324, -73.990062)
3569                      (33.209923, -87.545328)
3725                            (6.4682, 3.18287)
3787                       (19.123127, 72.825133)
4065                        (48.870833, 2.399227)
4847                       (43.631838, -79.5580

In [4]:
geocoder = ArcGIS(
    timeout=500,
    user_agent='tp1_datos_1c2020',
    username='tp1_datos_1c2020',
    password='datos_1c2020',
    referer='www.datos_1c2020.com'
)

In [5]:
def geo_info_by_coords(lat, lon, geocoder):
    response = geocoder.reverse(query=f'{lat},{lon}')
    return {
            'lat': lat,
            'lon': lon,
            'country': response.raw['CountryCode'],
            'city': response.raw.get('City', np.nan),
            'match_score': 100,
        }

In [6]:
coords_info = pd.DataFrame(list(coords.map(lambda x: geo_info_by_coords(x[0], x[1], geocoder))), index=coords.index)
coords_info

Unnamed: 0_level_0,lat,lon,country,city,match_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,19.600858,-99.047821,MEX,Ecatepec de Morelos,100
1350,30.307558,-81.403118,USA,Neptune Beach,100
1433,39.982988,-75.261624,USA,Philadelphia,100
1794,41.373061,-71.942237,USA,Stonington,100
1973,-27.499212,153.011072,AUS,Brisbane,100
2460,-6.152261,106.775995,IDN,Grogol Petamburan,100
2499,41.252426,-96.072013,USA,Omaha,100
2568,21.462446,-158.022017,USA,Mililani,100
2616,52.479722,62.184971,KAZ,Денисов ауданы,100
2984,-26.695807,27.837865,ZAF,Emfuleni,100


### Dirección
El resto de las ubicaciones se buscan como string

In [7]:
data['coords'] = coords
names = data[data['coords'].isnull()]['location']
names

id
46                               london
47       niall's place | saf 12 squad |
48                           birmingham
49        est. september 2012 - bristol
50                               africa
                      ...              
10826                                tn
10829            #newcastleupontyne #uk
10831                 vancouver, canada
10832                           london 
10833                           lincoln
Name: location, Length: 7197, dtype: object

In [8]:
def geo_info_by_string(query, geocoder):
    response = geocoder.geocode(query=query, out_fields=['Country', 'Score', 'City'])
    try:
        return {
            'lat': response.point.latitude,
            'lon': response.point.longitude,
            'country': response.raw['attributes'].get('Country', np.nan),
            'city': response.raw['attributes'].get('City', np.nan),
            'match_score': response.raw['attributes'].get('Score', np.nan),
        }
    except (KeyError, AttributeError):
        return {
            'lat': np.nan,
            'lon': np.nan,
            'country': np.nan,
            'city': np.nan,
            'match_score': 0,
        }

In [9]:
names_info = pd.DataFrame(list(names.map(lambda x: geo_info_by_string(x, geocoder))), index=names.index)
names_info

Unnamed: 0_level_0,lat,lon,country,city,match_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
46,51.506420,-0.127210,GBR,London,100.00
47,19.633330,-72.516670,HTI,Place,68.60
48,52.478910,-1.905920,GBR,Birmingham,100.00
49,51.453790,-2.591680,GBR,Bristol,73.00
50,7.188100,21.093750,,,100.00
...,...,...,...,...,...
10826,34.116318,9.608516,TUN,,100.00
10829,54.973280,-1.613960,GBR,نیوکاسل,99.98
10831,49.260380,-123.113360,CAN,Vancouver,100.00
10832,51.506420,-0.127210,GBR,London,100.00


In [10]:
all = pd.concat([names_info, coords_info, train_dataset[train_dataset['location'].isnull()], test_dataset[test_dataset['location'].isnull()]])[['country', 'city', 'match_score', 'location']]
all['has_location'] = all['location'].notnull().apply(lambda x: 1 if x else 0)
all = all.drop(columns=['location'])
all = all.merge(train_dataset['target'], how="outer", left_index=True, right_index=True)
all

Unnamed: 0_level_0,country,city,match_score,has_location,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,,,0,
1,,,,0,1.0
2,,,,0,
3,,,,0,
4,,,,0,1.0
...,...,...,...,...,...
10871,,,,0,1.0
10872,,,,0,1.0
10873,,,,0,1.0
10874,,,,0,


In [11]:
all.describe()

Unnamed: 0,match_score,has_location,target
count,7238.0,10876.0,7613.0
mean,93.003531,0.0,0.42966
std,17.108331,0.0,0.49506
min,0.0,0.0,0.0
25%,96.5025,0.0,0.0
50%,100.0,0.0,0.0
75%,100.0,0.0,1.0
max,100.0,0.0,1.0


In [12]:
train = all[all['target'].notnull()]
train

Unnamed: 0_level_0,country,city,match_score,has_location,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,,0,1.0
4,,,,0,1.0
5,,,,0,1.0
6,,,,0,1.0
7,,,,0,1.0
...,...,...,...,...,...
10869,,,,0,1.0
10870,,,,0,1.0
10871,,,,0,1.0
10872,,,,0,1.0


In [13]:
test = all[all['target'].isnull()]
test

Unnamed: 0_level_0,country,city,match_score,has_location,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,,,0,
2,,,,0,
3,,,,0,
9,,,,0,
11,,,,0,
...,...,...,...,...,...
10861,,,,0,
10865,,,,0,
10868,,,,0,
10874,,,,0,


In [14]:
enc_nested = NestedCVWrapper(LeaveOneOutEncoder(cols=['country', 'city']), random_state=42)

X_train_enc, X_test_enc = enc_nested.fit_transform(train[['country', 'city']].fillna("no_info"), train['target'], X_test=test[['country', 'city']].fillna("no_info"))
X_train_enc = X_train_enc.set_index(train_dataset.index)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [15]:
X_train_enc

Unnamed: 0_level_0,country,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.422934,0.422934
4,0.420188,0.420188
5,0.420188,0.420188
6,0.420188,0.420188
7,0.414843,0.414843
...,...,...
10869,0.414843,0.414843
10870,0.414843,0.414843
10871,0.414843,0.414843
10872,0.414843,0.414843


In [16]:
X_test_enc

Unnamed: 0_level_0,country,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.421769,0.421769
2,0.421769,0.421769
3,0.421769,0.421769
9,0.421769,0.421769
11,0.421769,0.421769
...,...,...
10861,0.421769,0.421769
10865,0.421769,0.421769
10868,0.421769,0.421769
10874,0.421769,0.421769


In [17]:
features = all[['has_location',	'match_score']].merge(pd.concat([X_train_enc, X_test_enc]), left_index=True, right_index=True)
features = features.rename(columns={'country': 'encoded_country', 'city': 'encoded_city'})
features.head(50)

Unnamed: 0_level_0,has_location,match_score,encoded_country,encoded_city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,,0.421769,0.421769
1,0,,0.422934,0.422934
2,0,,0.421769,0.421769
3,0,,0.421769,0.421769
4,0,,0.420188,0.420188
5,0,,0.420188,0.420188
6,0,,0.420188,0.420188
7,0,,0.414843,0.414843
8,0,,0.422934,0.422934
9,0,,0.421769,0.421769


In [19]:
features.fillna(0).to_csv('../features/features_location.csv', index=True)