# Basic Stats

In [3]:
# Raw tweets from streaming: /data/muntean/food-tweets
# Wordcount per day and whole (4-grams): /home/muntean/food-wordcount
import os
import sys
from collections import defaultdict
os.chdir("/Users/muntean/Documents/workspace/food101/")
# from twitter.Tweet import Tweet

## Processing

### Simple counts

In [2]:
# Single pass over the raw food tweets, tallying how many carry each attribute.
# NOTE(review): `Tweet` is only imported by a commented-out line in the setup cell;
# confirm it is in scope on a fresh kernel.
tweetsAsDict = Tweet.getTweetAsDictionary("/Users/muntean/food-tweets/")

countTweets = countCoords = countPlace = 0
countUserLocation = countImage = countMedia = 0

for tweet, fileName in tweetsAsDict:
    countCoords += int(tweet["coordinates"] is not None)
    countPlace += int(tweet["place"] is not None)
    countUserLocation += int(tweet['user']['location'] is not None)

    # Only media attached to the tweet itself is counted here; links to
    # Instagram are not inspected.
    entities = tweet["entities"]
    if "media" in entities:
        countMedia += 1
        # The type can be photo, video ...; the first entry decides.
        countImage += int(entities["media"][0]["type"] == "photo")

    countTweets += 1

### Intersections of tweet attributes

In [2]:
def getLocationData(tweet):
    """
    Pull the location-related fields out of a tweet dictionary.

    Every element of the result can be None: no coordinates, no place, a place
    that is not a city (city is None, country fields are still filled), or no
    user location.
    :param tweet: tweet as a dictionary with "coordinates", "place" and "user" keys
    :return: tuple (coordinates, city, country, country code, user location)
    """
    geo = tweet["coordinates"]
    # the inner list is [longitude, latitude]
    tweet_coords = geo['coordinates'] if geo is not None else None

    tweet_place_city = tweet_place_country = tweet_place_country_code = None
    place = tweet["place"]
    if place is not None:
        # the place name is only kept when the place is a city
        if place["place_type"] == "city":
            tweet_place_city = place["name"]
        tweet_place_country = place["country"]
        tweet_place_country_code = place["country_code"]

    return (tweet_coords, tweet_place_city, tweet_place_country,
            tweet_place_country_code, tweet['user']['location'])

In [3]:
# Second pass over the tweets: index every attribute by tweet id so the
# attribute sets can be intersected later.
tweetsAsDict = Tweet.getTweetAsDictionary("/Users/muntean/food-tweets")

# Plain dicts: the previous `defaultdict()` had no default factory, so it
# behaved exactly like a dict while suggesting otherwise.
dictCoords = {}
dictPlace = {}
dictUserLocation = {}
dictImage = {}

for tweet, fileName in tweetsAsDict:
    tweet_coords, tweet_place_city, tweet_place_country, tweet_place_country_code, user_location = getLocationData(tweet)
    tweetId = tweet["id_str"]

    if tweet_coords is not None:
        dictCoords[tweetId] = tweet_coords
    if tweet_place_country_code is not None:
        # the country name is stored; the country code or the city could be
        # used instead, but the city is available for fewer tweets
        dictPlace[tweetId] = tweet_place_country
    if user_location is not None:
        dictUserLocation[tweetId] = user_location

    # Guard against a missing or empty media list before taking the first entry.
    foundMedia = tweet["entities"].get("media")
    if foundMedia:
        dictImage[tweetId] = foundMedia[0]["media_url"]

## Attributes

### 1. How many tweets in total?

In [5]:
# Total number of food related tweets seen by the counting loop
# (stream filtered on relevant keywords matching #food).
print "Total Tweets: ", countTweets

Total Tweets:  69808


### 2. How many tweets with Place or GEO? How many with valid user location?

In [6]:
# Tweets with Place, Geo and User location
print "Tweets with Coords: ", countCoords
print "Tweets with Place: ", countPlace
print "Tweets with User Location: ", countUserLocation

Tweets with Coords:  4801
Tweets with Place:  6242
Tweets with User Location:  49621


### 3. How many tweets with image?

In [7]:
# Tweets with images or other media.
# countMedia: tweets with any attached media entry;
# countImage: those whose first media entry has type "photo".
print "Tweets with Image: ", countImage, " all media: ", countMedia

Tweets with Image:  18234  all media:  18234


### 4. How many with combinations of all of the above?

In [8]:
# How many with coords and images
a = set(dictCoords.keys()).intersection(dictImage.keys())
print len(a)
# How many with Country code and images
b = set(dictPlace.keys()).intersection(dictImage.keys())
print len(b)
# How many with user location and images
c = set(dictUserLocation.keys()).intersection(dictImage.keys())
print len(c)


# print dictCoords["673842284794765312"]
# print dictPlace["673842284794765312"]
# print dictImage["673842284794765312"]

52
643
11961


## Distribution by countries

In [None]:
reversed_dictPlace = defaultdict(list)
for key,value in dictPlace.iteritems():
    reversed_dictPlace[value].append(key)
print "Total countries", len(reversed_dictPlace) 

#order dict per popularity
for k in sorted(reversed_dictPlace, key=lambda k: len(reversed_dictPlace[k]), reverse=True):
        print k, len(reversed_dictPlace[k])

In [None]:
### only for the ones with country and photo
reversed_b = defaultdict(list)
for tweetID in b:
    reversed_b[dictPlace[tweetID]].append(tweetID)
print "Total countries", len(reversed_b) 

#order dict per popularity
for k in sorted(reversed_b, key=lambda k: len(reversed_b[k]), reverse=True):
        print k, len(reversed_b[k])

## Visualize wordcount

In [None]:
# better look on the server

## Test user location field on the first 1000 tweets

In [4]:
def getLocationsFromToken(token, citiesIndex, citiesInfo, countriesIndex, countriesInfo):
    """
    Look a single token up in the city index and in the country index.

    The indexes are queried with .get(), so an unknown token neither raises
    KeyError (plain dict) nor leaves an empty entry behind (defaultdict).

    :param token: candidate location string, used as the index key as-is
    :param citiesIndex: mapping token -> list of city ids (geonameids)
    :param citiesInfo: mapping city id -> record whose element 0 is the city name
    :param countriesIndex: mapping token -> list of country ids (geonameids)
    :param countriesInfo: mapping country id -> record whose element 0 is the country name
    :return: tuple (city, country); each is "" when the token did not match
    """
    city = ""
    country = ""

    cityIds = citiesIndex.get(token)
    if cityIds:
        # TODO: several cities can share a name; only the first id is used
        city = citiesInfo[cityIds[0]][0]

    countryIds = countriesIndex.get(token)
    if countryIds:
        # the name is taken from the record, which collapses aliases such as
        # "UK" and "United Kingdom" onto one value
        country = countriesInfo[countryIds[0]][0]

    return city, country



def cleanLists(potentialCities, potentialCountries):
    """
    Drop the empty-string placeholder from both collections, in place.

    :param potentialCities: collection of city names, possibly containing ""
    :param potentialCountries: collection of country names, possibly containing ""
    :return: the same two collections, as a tuple (cities, countries)
    """
    for candidates in (potentialCities, potentialCountries):
        if "" in candidates:
            candidates.remove("")
    return potentialCities, potentialCountries

In [5]:
def inferCountryFromCity(citiesList, citiesIndex, citiesInfo, ccDict):
    """
    Derive the candidate countries of a list of cities via their country codes.

    :param citiesList: iterable of city names; each is lower-cased before the lookup
    :param citiesIndex: mapping lower-cased city name -> list of city ids
    :param citiesInfo: mapping city id -> record whose element 4 is the country code
    :param ccDict: mapping country code -> country
    :return: set of the countries the cities belong to
    """
    # TODO: this is tricky, several cities can share a name; only the first
    # id registered for the name is considered.
    return set(
        ccDict[citiesInfo[citiesIndex[cityName.lower()][0]][4]]
        for cityName in citiesList
    )

In [6]:
from tokenizer import twokenize, ngrams
def _addLocationsFromTokens(tokens, potentialCities, potentialCountries,
                            citiesIndex, citiesInfo, countriesIndex, countriesInfo):
    """Look every token up and record the city/country of the ones that match."""
    for token in tokens:
        city, country = getLocationsFromToken(token.strip(), citiesIndex, citiesInfo, countriesIndex, countriesInfo)
        if city or country:
            # one of the two may still be ""; cleanLists removes it at the end
            potentialCities.add(city)
            potentialCountries.add(country)


def getUserLocation(locationField, citiesIndex, citiesInfo, countriesIndex, countriesInfo):
    """
    Collect the cities and countries mentioned in a free-text user location.

    The field is matched in two ways: split on "/" and on "," (when present),
    and tokenized into unigrams, bigrams and trigrams. Every candidate is
    lower-cased and stripped before the index lookup.

    :param locationField: user location string (the original note says it can
        be an empty string)
    :param citiesIndex: city index passed through to getLocationsFromToken
    :param citiesInfo: city records passed through to getLocationsFromToken
    :param countriesIndex: country index passed through to getLocationsFromToken
    :param countriesInfo: country records passed through to getLocationsFromToken
    :return: tuple (cities, countries) of sets of matched names
    """
    potentialCities = set()
    potentialCountries = set()
    lookupArgs = (potentialCities, potentialCountries,
                  citiesIndex, citiesInfo, countriesIndex, countriesInfo)

    # 1. split by "/" (the only char that is not in the tokenizer) and by ","
    for separator in ("/", ","):
        if separator in locationField:
            pieces = (piece.lower() for piece in locationField.split(separator))
            _addLocationsFromTokens(pieces, *lookupArgs)

    # 2. tokenize the lower-cased field and try unigrams, bigrams and trigrams
    tokenList = twokenize.tokenize(locationField.lower())
    for windowSize in (1, 2, 3):
        _addLocationsFromTokens(ngrams.window_no_twitter_elems(tokenList, windowSize), *lookupArgs)

    return cleanLists(potentialCities, potentialCountries)

In [7]:
def getFinalUserLocation(user_cities, user_countries, inferred_countries):
    """
    Reduce the candidate sets to one (city, country) pair.

    The country is the single user-stated country when at most one country
    was inferred from the cities, or the single inferred country when the
    user stated none; any other combination is ambiguous and gives "".
    The city is kept only when it is unique and a country was resolved.

    :param user_cities: cities found in the user location
    :param user_countries: countries found in the user location
    :param inferred_countries: countries inferred from the cities
    :return: tuple (city, country); "" stands for "unknown"
    """
    statedCount = len(user_countries)
    inferredCount = len(inferred_countries)

    if statedCount == 1 and inferredCount in (0, 1):
        country = next(iter(user_countries))
    elif statedCount == 0 and inferredCount == 1:
        country = next(iter(inferred_countries))
    else:
        country = ""

    city = next(iter(user_cities)) if len(user_cities) == 1 else ""

    # A city with ambiguous country tagging: since the country cannot be
    # inferred, the city is dismissed as well.
    if len(city) > 0 and len(country) == 0:
        city = ""

    return city, country

In [9]:
from location import locations
import location.get_place_from_user_location 

citiesIndex, citiesInfo = locations.Cities.loadFromFile()
countriesIndex, countriesInfo = locations.Countries.loadFromFile()
ccDict = locations.Countries.countryCodeDict(countriesInfo)

i = 0
for tweet_id, user_loc in dictUserLocation.iteritems():
    i += 1
    print
    print user_loc
    user_cities, user_countries = getUserLocation(user_loc, citiesIndex, citiesInfo, countriesIndex, countriesInfo)
    inferred_countries = inferCountryFromCity(user_cities, citiesIndex, citiesInfo, ccDict)
    
    for city in user_cities:
        geonameidCity = citiesIndex[city.lower()][0] # this is a list
        #print citiesInfo[geonameidCity], ccDict[citiesInfo[geonameidCity][4]]
    print user_cities, user_countries, inferred_countries   
    city, country = getFinalUserLocation(user_cities, user_countries, inferred_countries)
    print city, country
    if i % 1000 == 0:
        break

All cities with all name:  22310
All cities unique geonameid:   23516
All countries with all names:  24782
All countries unique geonameid:   251
Len of country code dict 251

Johannesburg, South Africa
set([u'johannesburg']) set([u'south africa']) set([u'south africa'])
johannesburg south africa

UK
set([]) set([u'united kingdom']) set([])
 united kingdom

Los Angeles, CA
set([u'los angeles']) set([]) set([u'united states'])
los angeles united states

Southend
set([]) set([]) set([])
 

Florida, USA
set([]) set([u'united states']) set([])
 united states

everywhere
set([]) set([]) set([])
 

Madrid, España
set([u'madrid']) set([u'spain']) set([u'spain'])
madrid spain

Bolton Abbey, Skipton
set([u'bolton']) set([]) set([u'united kingdom'])
bolton united kingdom

Ventura, CA
set([u'ventura']) set([]) set([u'united states'])
ventura united states

 North Sioux City, SD.
set([u'sioux city']) set([]) set([u'united states'])
sioux city united states

Brighton, England
set([u'brighton']) set(

In [14]:
# Inspect the raw country-index entries for an abbreviation and a full name.
# NOTE(review): the recorded output lists the same id several times per key --
# confirm whether the index should be de-duplicated.
print countriesIndex["uk"]
print countriesIndex["south africa"]

[2635167, 2635167]
[953987, 953987, 953987]


# Image Classifier stats

In [16]:
from itertools import izip
from collections import defaultdict

true2pred=list()
true2predDict = defaultdict(list)
with open("./resources/testset.txt") as testfile, open("./resources/predicted.txt") as predfile: 
    for x, y in izip(testfile, predfile):
        x = x.strip().split("/")[0]
        y = y.strip()
        true2pred.append((x,y))
        true2predDict[x].append((x,y))

print len(true2predDict) 
print len(true2predDict["apple_pie"]), true2predDict["apple_pie"]
print len(true2pred) # 101*250

101
250 [('apple_pie', 'baklava'), ('apple_pie', 'french_toast'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'chicken_curry'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'french_onion_soup'), ('apple_pie', 'tiramisu'), ('apple_pie', 'chocolate_cake'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'baklava'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'bread_pudding'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'pizza'), ('apple_pie', 'apple_pie'), ('apple_pie', 'apple_pie'), ('apple_pie', 'macaroni_and_cheese'), ('apple_pie', 'apple_pie'), ('apple_pie', 'bread_pudding'), ('apple_pie', 'apple_pie'), ('apple_pie', 'french_toast'), ('apple_pie', 'apple_pie'), ('apple_pie', 'bread_pudding'), ('apple_pie', 'waffles'), ('app

In [40]:
# recall
def recallPerClass(somelistOfTuples):
    """
    Recall of one class: the share of its instances that were predicted correctly.

    :param somelistOfTuples: list of (true label, predicted label) pairs that
        all belong to the same true class
    :return: tuple (true positives, number of pairs, recall); the recall is
        0.0 for an empty list instead of raising ZeroDivisionError
    """
    total = len(somelistOfTuples)
    truepos = sum(1 for (x, y) in somelistOfTuples if x == y)
    recall = float(truepos) / total if total else 0.0
    return truepos, total, recall

In [41]:
def precisionPerClass(className, true2pred, true2predDict):
    """
    Precision of one class: the share of its predictions that are correct.

    :param className: label of the class to evaluate
    :param true2pred: list of all (true label, predicted label) pairs
    :param true2predDict: mapping true label -> list of its (true, predicted) pairs
    :return: tuple (true positives, times the class was predicted, precision);
        the precision is 0.0 when the class was never predicted instead of
        raising ZeroDivisionError
    """
    # .get() so that an unknown class does not add an entry to a defaultdict
    classInstances = true2predDict.get(className, [])
    truepos = sum(1 for (x, y) in classInstances if x == y)
    numPredicted = sum(1 for (x, y) in true2pred if y == className)
    precision = float(truepos) / numPredicted if numPredicted else 0.0
    return truepos, numPredicted, precision

In [42]:
# Recall for the "apple_pie" class: (true positives, class size, recall).
recallPerClass(true2predDict["apple_pie"])

(144, 250, 0.576)

In [43]:
# Precision for the "apple_pie" class: (true positives, times predicted, precision).
precisionPerClass("apple_pie", true2pred, true2predDict)

(144, 253, 0.5691699604743083)

In [None]:
import numpy as np  # the alias was missing, which made the whole cell a syntax error

# One (true positives, class size, recall) tuple per true class.
metricList = [recallPerClass(true2predDict[className]) for className in true2predDict.keys()]
    
