In [1]:
import pandas as pd
# lift the cap on displayed columns so wide dataframes are shown in full
pd.set_option("display.max_columns", None)

# load the tweet dataset from disk (file is stored in pandas' "split" JSON orientation)
df = pd.read_json("all_tweets.json", orient="split")

In [2]:
# summarise the tweet dataframe: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264542 entries, 0 to 264541
Data columns (total 9 columns):
id_str            264542 non-null int64
screen_name       264542 non-null object
created_at        264542 non-null datetime64[ns]
lang              264542 non-null object
source            264542 non-null object
retweet_count     264542 non-null int64
favorite_count    264542 non-null int64
is_retweet        264542 non-null bool
full_text         264542 non-null object
dtypes: bool(1), datetime64[ns](1), int64(3), object(4)
memory usage: 18.4+ MB


In [3]:
# distinct screen names found in the tweet data, in sorted order
screen_names = sorted(df["screen_name"].unique())
len(screen_names)

86

In [4]:
from data_gather import load_env
import tweepy

In [5]:
# gather keys and tokens to access Twitter API
# (load_env comes from the project-local data_gather module; it is unpacked here
# into four values — presumably consumer key/secret and access token/secret,
# judging by how they are passed to Tweepy below; verify against data_gather)
key, secret_key, token, token_secret = load_env()

In [7]:
# Tweepy authorization
auth = tweepy.OAuthHandler(key, secret_key)

# set Tweepy access tokens
auth.set_access_token(token, token_secret)

# build the Twitter API client; with the rate-limit flags set, Tweepy waits
# (and prints a notice) when a rate limit is hit instead of failing
api = tweepy.API(auth_handler=auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Check the credentials before making any real requests.
# - The return value is inspected because verify_credentials() can return a
#   falsy value rather than raise when the credentials are rejected
#   (Tweepy 3.x behaviour — TODO confirm against the installed version).
# - `except Exception` (not a bare `except:`) lets KeyboardInterrupt / kernel
#   interrupts propagate, and the caught error is printed so that network or
#   configuration failures are not silently mislabelled.
try:
    credentials_ok = bool(api.verify_credentials())
except Exception as err:
    credentials_ok = False
    print("Credential check raised:", repr(err))

if credentials_ok:
    print("Credentials verified")
else:
    print("Credentials unverified, check to see keys/tokens being used are correct.")

Credentials verified


In [9]:
# get list of keys for the information in each User-object
# (fetches one example account and lists the fields of its raw JSON payload;
# note that `_json` is a private Tweepy attribute)
api.get_user("earny_joe")._json.keys()

dict_keys(['id', 'id_str', 'name', 'screen_name', 'location', 'profile_location', 'description', 'url', 'entities', 'protected', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', 'status', 'contributors_enabled', 'is_translator', 'is_translation_enabled', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_background_tile', 'profile_image_url', 'profile_image_url_https', 'profile_banner_url', 'profile_link_color', 'profile_sidebar_border_color', 'profile_sidebar_fill_color', 'profile_text_color', 'profile_use_background_image', 'has_extended_profile', 'default_profile', 'default_profile_image', 'can_media_tag', 'followed_by', 'following', 'follow_request_sent', 'notifications', 'translator_type'])

In [10]:
# User-object fields to keep for each account; the order here is the column
# order of the resulting users dataframe
user_keys = [
    "name", "id_str", "location", "created_at",
    "statuses_count", "followers_count", "friends_count",
    "description",
]

In [11]:
%%time 
from tqdm import tqdm

# request each account's User-object (one API call per screen name, with a
# progress bar) and keep its raw JSON payload
user_list = [api.get_user(name)._json for name in tqdm(screen_names)]

100%|██████████| 86/86 [00:16<00:00,  5.35it/s]

CPU times: user 1.35 s, sys: 107 ms, total: 1.45 s
Wall time: 16.1 s





In [13]:
# create dataframe from the list of User JSON payloads gathered above,
# keeping only the columns named in user_keys (in that order)
users = pd.DataFrame(user_list)[user_keys]

In [14]:
# preview the first five rows of the users dataframe
users.head()

Unnamed: 0,name,id_str,location,created_at,statuses_count,followers_count,friends_count,description
0,50cent,18222378,NYC,Thu Dec 18 19:18:49 +0000 2008,14941,11164421,5,It's the kid 50 Cent | G-Unit -- Official 'Get...
1,ABC News,28785486,New York City / Worldwide,Sat Apr 04 12:40:32 +0000 2009,280596,14745606,532,"All the news and information you need to see, ..."
2,Alexandria Ocasio-Cortez,138203134,"Bronx + Queens, NYC",Wed Apr 28 22:38:40 +0000 2010,9532,6027471,1879,"US Representative,NY-14 (BX & Queens). In a mo..."
3,The Associated Press,51241574,Global,Fri Jun 26 21:48:52 +0000 2009,256798,13639172,7088,"News from The Associated Press, and a taste of..."
4,Neil Patrick Harris,90420314,Manhattanland,Mon Nov 16 15:54:48 +0000 2009,3060,26857298,144,"Husband, father, random sayer of stuff."


In [15]:
# list every row whose name repeats an earlier row; an empty result means
# there are no duplicated entries
users.loc[users.duplicated(subset="name", keep="first")]

Unnamed: 0,name,id_str,location,created_at,statuses_count,followers_count,friends_count,description


In [17]:
# ensure that the number of unique names matches the number of screen names
# that were requested (compared against len(screen_names) rather than a
# hard-coded count, so the check stays valid if the tweet dataset changes)
len(users["name"].unique()) == len(screen_names)

True

In [22]:
# summarise the users dataframe: column dtypes, non-null counts and memory usage
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 8 columns):
name               86 non-null object
id_str             86 non-null object
location           86 non-null object
created_at         86 non-null object
statuses_count     86 non-null int64
followers_count    86 non-null int64
friends_count      86 non-null int64
description        86 non-null object
dtypes: int64(3), object(5)
memory usage: 5.5+ KB


In [23]:
# set the index to the id_str for each user
# (rebinding instead of inplace=True, and guarding on the column still being
# present, keeps this cell safe to re-run: a second run would otherwise raise
# a KeyError because id_str has already been moved into the index)
if "id_str" in users.columns:
    users = users.set_index("id_str")

In [25]:
# write the users dataframe to disk in pandas' "split" JSON orientation
users.to_json("user_info.json", orient="split")

In [27]:
# read the saved file back and summarise it to check the export;
# NOTE(review): read_json infers dtypes on load, so the index and created_at
# can come back with different dtypes than the in-memory frame had — compare
# with the users.info() output above
pd.read_json("user_info.json", orient="split").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86 entries, 18222378 to 20322929
Data columns (total 7 columns):
name               86 non-null object
location           86 non-null object
created_at         86 non-null datetime64[ns, UTC]
statuses_count     86 non-null int64
followers_count    86 non-null int64
friends_count      86 non-null int64
description        86 non-null object
dtypes: datetime64[ns, UTC](1), int64(3), object(3)
memory usage: 5.4+ KB
