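# Topic Modeling of Tweets

This notebook collects the training data for topic modeling: it fetches Twitter's suggested-user categories, the suggested users in each category and up to 200 recent tweets per user, labels every tweet with its category slug, and saves the result to `data/topic/us-topics-181109/train.json`.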

## Setup

In [1]:
from tep.accountCollector import AccountCollector

In [2]:
# Build the project's AccountCollector and keep a handle on its `api`
# attribute; every later cell issues its Twitter requests through this object.
# NOTE(review): the JSON profile shown in this cell's output appears to be
# emitted while the collector is constructed - it contains account details,
# so consider clearing the output before sharing the notebook.
api = AccountCollector().api

{"created_at": "Thu May 01 12:37:22 +0000 2014", "description": "PhD Student @TUDarmstadt, doing research on Artificial Intelligence and Machine Learning. Supporting IT security startups @startupsec_da", "favourites_count": 401, "followers_count": 58, "friends_count": 235, "id": 2472450259, "id_str": "2472450259", "lang": "en", "listed_count": 6, "location": "Darmstadt, Germany", "name": "Felix Peters", "profile_background_color": "C0DEED", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_image_url": "http://pbs.twimg.com/profile_images/600953861629734913/7y_RkdW4_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/600953861629734913/7y_RkdW4_normal.jpg", "profile_link_color": "224F82", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image

## Get user categories

In [3]:
# Fetch the suggested-user categories; the following cells use each
# category's name and slug to look up users and label tweets.
categories = api.GetUserSuggestionCategories()

In [4]:
# Sanity check: how many categories came back, then the type of one element.
category_count = len(categories)
print(category_count)
first_category_type = type(categories[0])
print(first_category_type)

13
<class 'twitter.models.Category'>


In [5]:
# Display the full category list (the repr shows each category's name, slug and size).
categories

[Category(Name='Sports', Slug=sports, Size=16),
 Category(Name='Television', Slug=television, Size=13),
 Category(Name='Music', Slug=music, Size=15),
 Category(Name='Fashion', Slug=fashion, Size=15),
 Category(Name='Entertainment', Slug=entertainment, Size=14),
 Category(Name='Books', Slug=books, Size=12),
 Category(Name='Gaming', Slug=gaming, Size=15),
 Category(Name='Family', Slug=family, Size=9),
 Category(Name='Food & Drink', Slug=food-drink, Size=17),
 Category(Name='Funny', Slug=funny, Size=15),
 Category(Name='Business', Slug=business, Size=10),
 Category(Name='Government', Slug=government, Size=14),
 Category(Name='News', Slug=news, Size=18)]

## Get users for categories

In [8]:
# Map every category slug to the ids of the users suggested for that category,
# printing a per-category count as progress.
user_dict = {}
for category in categories:
    suggested = api.GetUserSuggestion(category=category)
    print("Got {} users for category {}".format(len(suggested), category.name))
    user_dict[category.slug] = [account.id for account in suggested]

Got 16 users for category Sports
Got 13 users for category Television
Got 15 users for category Music
Got 15 users for category Fashion
Got 14 users for category Entertainment
Got 12 users for category Books
Got 15 users for category Gaming
Got 8 users for category Family
Got 17 users for category Food & Drink
Got 15 users for category Funny
Got 10 users for category Business
Got 14 users for category Government
Got 18 users for category News


## Get user tweets

In [11]:
# Build the labelled dataset: one entry per tweet, keyed by the tweet id as a
# string, holding the id, the text and the category slug it was fetched under.
#
# count=200 asks for up to 200 tweets per user in a single request; there is
# no paging, so anything older than that window is not fetched.
#
# Entries are keyed by tweet id only, so a tweet reached through two
# categories (e.g. a user suggested in both) can carry just one topic. The
# last topic seen is kept, as before, but such collisions are now counted and
# reported instead of being overwritten silently.
tweet_dict = {}
relabeled_ids = set()  # tweet ids that were seen under more than one topic
for cat, users in user_dict.items():
    for u in users:
        tweets = api.GetUserTimeline(user_id=u, count=200)
        print("Got {} tweets for user {}".format(len(tweets), u))
        for t in tweets:
            key = str(t.id)
            previous = tweet_dict.get(key)
            if previous is not None and previous['topic'] != cat:
                relabeled_ids.add(key)
            tweet_dict[key] = {
                'id': t.id,
                'text': t.text,
                'topic': cat,
            }
# Only emitted when a collision actually happened, so the cell's output is
# unchanged for a collision-free run.
if relabeled_ids:
    print("Warning: {} tweets appeared under more than one topic; "
          "the last topic seen was kept".format(len(relabeled_ids)))
print("Final dataset size: {}".format(len(tweet_dict)))

Got 200 tweets for user 300392950
Got 199 tweets for user 23083404
Got 200 tweets for user 52422878
Got 200 tweets for user 21254264
Got 200 tweets for user 6446742
Got 200 tweets for user 265483421
Got 200 tweets for user 42562446
Got 199 tweets for user 107146095
Got 200 tweets for user 1059194370
Got 198 tweets for user 98809456
Got 199 tweets for user 145107843
Got 200 tweets for user 2181233851
Got 198 tweets for user 783763632
Got 200 tweets for user 170759111
Got 200 tweets for user 131948686
Got 200 tweets for user 22449367
Got 200 tweets for user 586198217
Got 200 tweets for user 6480682
Got 200 tweets for user 23544596
Got 200 tweets for user 28785486
Got 200 tweets for user 31080039
Got 200 tweets for user 26585095
Got 198 tweets for user 205302299
Got 93 tweets for user 3353566654
Got 200 tweets for user 3303293865
Got 197 tweets for user 197598287
Got 200 tweets for user 55117855
Got 200 tweets for user 46296304
Got 200 tweets for user 25460615
Got 200 tweets for user 2144

## Save tweets to file

In [14]:
# Create the output directory together with any missing parents. os.makedirs
# replaces the `mkdir -p` shell escape so the cell also runs where no POSIX
# shell is available; exist_ok=True keeps the "no error if it already exists"
# behaviour of `-p`, so re-running the cell is safe.
import os

os.makedirs('data/topic/us-topics-181109', exist_ok=True)

In [16]:
import json

In [17]:
# Write the collected tweets to disk as pretty-printed JSON; sorting the keys
# gives a stable ordering of the entries in the file.
with open('data/topic/us-topics-181109/train.json', 'w') as out_file:
    json.dump(tweet_dict, out_file, sort_keys=True, indent=2)

In [19]:
# List the output directory to confirm train.json was written and to see its size.
!ls -lh data/topic/us-topics-181109/

total 14848
-rw-r--r--  1 felix  staff   7.2M Nov  9 11:55 train.json
