# Litterati dataset

In [22]:
import pandas as pd
from pathlib import Path

PATH_DATA = Path('/data/datasets/catdd/')
PATH_DATA_CLEAN = PATH_DATA / 'clean'

In [2]:
ITEMS = pd.read_csv(PATH_DATA / 'litterati-location.csv')
ITEMS.sample()

Unnamed: 0,id,email,lat,lon,time,tags,url,club,username,display_name,country_code
11360,406480.0,jeff@litterati.org,37.813399838,-122.265747443,2017-04-11 16:21:09,lid+plastic,https://firebasestorage.googleapis.com/v0/b/pr...,,,Jeff Kirschner,US


In [20]:
ITEMS = (ITEMS
 [['lat', 'lon', 'time', 'tags']]
 .dropna(subset=['tags'])
 .astype({'lat': float, 'lon': float})
 .assign(time=lambda d: pd.to_datetime(d.time))
 .rename(columns={'lon': 'longitude', 'lat': 'latitude'})
)
ITEMS.sample()

Unnamed: 0,latitude,longitude,time,tags
5185,37.809876,-122.243678,2018-03-10 23:51:08,plastic+lid


In [21]:
from pandas.io.json import build_table_schema
import json

def get_table_schema(df):
    return build_table_schema(df, index=False, version=False)

get_table_schema(ITEMS)

{'fields': [{'name': 'latitude', 'type': 'number'},
  {'name': 'longitude', 'type': 'number'},
  {'name': 'time', 'type': 'datetime'},
  {'name': 'tags', 'type': 'string'}]}

In [23]:
ITEMS.to_csv(PATH_DATA_CLEAN / 'litterati.csv')

Get the labels table, try to do some matching

In [16]:
def process_tags(df):
    return (df
            .assign(type=lambda d: d.type.map({'m': 'material', 'b': 'brand', 'c': 'category', 'o': 'object', 'x': 'misc'}))
           )

In [17]:
TAGS = pd.read_csv(PATH_DATA / 'litterati-comb.csv')
TAGS.pipe(process_tags)

Unnamed: 0,tag,count,type
0,plastic,253790,material
1,cigarette,87931,object
2,paper,72836,material
3,wrapper,56905,object
4,can,46871,object
5,bottle,31962,object
6,metal,31332,material
7,candy,20875,object
8,beer,17720,category
9,bottlecap,17027,object


In [8]:
ITEMS.tags.str.get_dummies(sep='+')

Unnamed: 0,100grand,1220pst,25,2for99,365,3m,3musketeers,5,7eleven,7up,...,yogurt,yolk,yongsheng,yoplait,york,zaza,zbar,zen,zigzag,zingers
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
