# Basic Stats

In [1]:
# Raw tweets from streaming: /data/muntean/food-tweets
# Wordcount per day and whole (4-grams): /home/muntean/food-wordcount
import os
import sys
from collections import defaultdict
os.chdir("/Users/muntean/Documents/workspace/food101/")
from twitter.Tweet import Tweet

## Processing

### Simple counts

In [2]:
tweetsAsDict = Tweet.getTweetAsDictionary("/Users/muntean/food-tweets/")

countTweets = 0
countCoords = 0
countPlace = 0
countUserLocation = 0
countImage = 0
countMedia =0 
for tweet, fileName in tweetsAsDict:
    if tweet["coordinates"] is not None:
        countCoords +=1
    if tweet["place"] is not None:
        countPlace +=1
    if tweet['user']['location'] is not None:
        countUserLocation +=1
    # we have two situations: either media is attached to the tweet or we have a link to instagram
    if "media" in tweet["entities"]:
        countMedia +=1
        foundMedia = tweet["entities"]["media"]
        # the type can be photo, video ...
        if foundMedia[0]["type"] == "photo":
            countImage += 1
    countTweets +=1

### Intersections with tweets attributes

In [2]:
def getLocationData(tweet):
    """
    These can always have None values; e.g no coordinates, no city, no user location
    :param tweet:
    :return:
    """
    if tweet["coordinates"] is not None:
        tweet_coords = tweet['coordinates']['coordinates']  # returns a list [longitude, latitude]
    else:
        tweet_coords = None

    if tweet["place"] is not None:
        if tweet["place"]["place_type"] == "city":
            tweet_place_city = tweet["place"]["name"]  # if place type == city
            tweet_place_country = tweet["place"]["country"]
            tweet_place_country_code = tweet["place"]["country_code"]
        else:
            tweet_place_city = None
            tweet_place_country = tweet["place"]["country"]
            tweet_place_country_code = tweet["place"]["country_code"]
    else:
        tweet_place_city = None
        tweet_place_country = None
        tweet_place_country_code = None

    user_location = tweet['user']['location']

    return tweet_coords, tweet_place_city, tweet_place_country, tweet_place_country_code, user_location

In [3]:
tweetsAsDict = Tweet.getTweetAsDictionary("/Users/muntean/food-tweets")

dictCoords = defaultdict()
dictPlace = defaultdict()
dictUserLocation = defaultdict()
dictImage = defaultdict()

for tweet, fileName in tweetsAsDict:
    tweet_coords, tweet_place_city, tweet_place_country, tweet_place_country_code, user_location = getLocationData(tweet)
    if tweet_coords is not None:
        dictCoords[tweet["id_str"]] = tweet_coords
    if tweet_place_country_code is not None:
        dictPlace[tweet["id_str"]] = tweet_place_country # or country code or even city but may be less
    if user_location is not None:
        dictUserLocation[tweet["id_str"]] = user_location 
    if "media" in tweet["entities"]:
        foundMedia = tweet["entities"]["media"]
        dictImage[tweet["id_str"]] = foundMedia[0]["media_url"]

## Atributes

### 1. How many tweets in total?

In [4]:
# Food related tweets (filtered on relevant kw matching #food): 
print "Total Tweets: ", countTweets

Total Tweets:  69808


### 2. How many tweets with Place or GEO? How many with valid user location?

In [5]:
# Tweets with Place, Geo and User location
print "Tweets with Coords: ", countCoords
print "Tweets with Place: ", countPlace
print "Tweets with User Location: ", countUserLocation

Tweets with Coords:  4801
Tweets with Place:  6242
Tweets with User Location:  49621


### 3. How many tweets with image?

In [6]:
# Tweets with images or other media
print "Tweets with Image: ", countImage, " all media: ", countMedia

Tweets with Image:  18234  all media:  18234


### 4. How many with combinations of all of the above?

In [13]:
# How many with coords and images
a = set(dictCoords.keys()).intersection(dictImage.keys())
print len(a)
# How many with Country code and images
b = set(dictPlace.keys()).intersection(dictImage.keys())
print len(b)
# How many with user location and images
c = set(dictUserLocation.keys()).intersection(dictImage.keys())
print len(c)


# print dictCoords["673842284794765312"]
# print dictPlace["673842284794765312"]
# print dictImage["673842284794765312"]

52
643
11961


## Distribution by countries

In [14]:
reversed_dictPlace = defaultdict(list)
for key,value in dictPlace.iteritems():
    reversed_dictPlace[value].append(key)
print "Total countries", len(reversed_dictPlace) 

#order dict per popularity
for k in sorted(reversed_dictPlace, key=lambda k: len(reversed_dictPlace[k]), reverse=True):
        print k, len(reversed_dictPlace[k])

Total countries 194
United States 2282
United Kingdom 523
日本 302
Republic of the Philippines 262
Italia 244
Canada 243
Indonesia 183
Malaysia 148
España 132
México 122
Australia 117
India 114
Thailand 83
France 79
Brasil 78
Japan 69
Argentina 62
Deutschland 50
Singapore 48
Ireland 48
Mexico 46
South Africa 45
Italy 42
Spain 40
Türkiye 34
United Arab Emirates 34
Chile 33
Germany 30
Colombia 28
The Netherlands 23
Brazil 23
Vietnam 22
Brunei 21
Peru 19
Estados Unidos 18
ประเทศไทย 18
Republic of Korea 18
Hong Kong 17
Nederland 15
Sverige 15
台灣 15
People's Republic of China 15
New Zealand 13
Pakistan 13
Taiwan 13
中华人民共和国 13
Portugal 10
Ecuador 10
Venezuela 10
Polska 9
Österreich 9
Francia 8
대한민국 8
Turkey 8
Belgium 8
Россия 8
Sri Lanka 8
Reino Unido 7
Costa Rica 7
Kingdom of Saudi Arabia 7
Greece 7
België 7
Regno Unito 6
Bahrain 6
Kenya 6
Sweden 6
Espanya 6
Republic of Slovenia 6
Suomi 6
Poland 6
Etats-Unis 6
Tanzania 5
Republic of Serbia 5
Dominican Republic 5
Bulgaria 5
Nigeria 5
Denmark 5

In [15]:
### only for the ones with country and photo
reversed_b = defaultdict(list)
for tweetID in b:
    reversed_b[dictPlace[tweetID]].append(tweetID)
print "Total countries", len(reversed_b) 

#order dict per popularity
for k in sorted(reversed_b, key=lambda k: len(reversed_b[k]), reverse=True):
        print k, len(reversed_b[k])

Total countries 76
United States 262
United Kingdom 89
Canada 39
Italia 23
日本 19
Australia 15
Malaysia 15
Ireland 13
India 12
Republic of the Philippines 10
España 9
Colombia 8
México 7
France 6
Indonesia 6
Deutschland 6
South Africa 6
Japan 6
Italy 5
Argentina 4
Nederland 4
भारत 3
Germany 3
Thailand 3
Österreich 3
Espanya 3
Mexico 3
Peru 3
Kingdom of Saudi Arabia 2
Lithuania 2
Brasil 2
Venezuela 2
Magyarország 2
Chile 2
Belgium 2
The Netherlands 2
Türkiye 2
ประเทศไทย 2
Brazil 1
Qatar 1
Crna Gora 1
Canadá 1
Costa Rica 1
Morocco 1
中华人民共和国 1
Sri Lanka 1
Nigeria 1
Ecuador 1
Bangladesh 1
Singapore 1
Frankreich 1
Cameroon 1
Zambia 1
Spain 1
Schweiz 1
Sverige 1
Paraguay 1
Новая Зеландия 1
Suomi 1
Россия 1
Estados Unidos 1
Kenya 1
Switzerland 1
New Zealand 1
België 1
Bulgaria 1
Pakistan 1
台灣 1
United Arab Emirates 1
Guam 1
Austria 1
Japon 1
Republic of Serbia 1
République populaire de Chine 1
Albania 1
Việt Nam 1


## Visualize wordcount

In [None]:
# better look on the server

## Test user location field for 1000

In [10]:
from location import locations
from location.get_place_from_user_location import getUserLocation, inferCountryFromCity

countries = locations.Countries.loadFromFile()
cities = locations.Cities.loadFromFile()
citiesAscii = locations.Cities.loadFromFile(ascii=True)
ccDict = locations.Countries.countryCodeDict(countries)

i = 0
for tweet_id, user_loc in dictUserLocation.iteritems():
    i += 1
    user_cities, user_countries = getUserLocation(user_loc, citiesAscii, countries)
    inferred_countries = inferCountryFromCity(user_cities, citiesAscii, ccDict)
    print
    for city in user_cities:
        print citiesAscii[city], ccDict[citiesAscii[city][4]]
    print user_loc, user_cities, user_countries, inferred_countries   
    if i % 1000 == 0:
        break

 current dir:  /Users/muntean/Documents/workspace/food101
./location/countryInfo.txt
All countries:  254
All cities:  22312
All cities:  22251

(u'johannesburg', u'Johannesburg', 28.04363, -26.20227, u'ZA', 2026469, u'Africa/Johannesburg') south africa
Johannesburg, South Africa set([u'johannesburg']) set([u'south africa']) set([u'south africa'])

UK set([]) set([u'united kingdom']) None

(u'los angeles', u'Los Angeles', -118.24368, 34.05223, u'US', 3792621, u'America/Los_Angeles') united states
Los Angeles, CA set([u'los angeles']) set([]) set([u'united states'])

Southend set([]) set([]) None

(u'florida', u'Florida', -56.21417, -34.09556, u'UY', 32234, u'America/Montevideo') uruguay
Florida, USA set([u'florida']) set([]) set([u'uruguay'])

everywhere set([]) set([]) None

(u'madrid', u'Madrid', -3.70256, 40.4165, u'ES', 3255944, u'Europe/Madrid') spain
Madrid, España set([u'madrid']) set([]) set([u'spain'])

(u'bolton', u'Bolton', -2.43333, 53.58333, u'GB', 141331, u'Europe/London')