In [248]:
# library imports
import numpy as np
import pandas as pd
import matplotlib as plt
import requests
import os
import tweepy
import json
from twitter_api import api # imports tweepy api with keys
from pprint import pprint
from shutil import move
from datetime import datetime
from collections import defaultdict

# Gather

* Create and/or set directory for data files to be stored

In [2]:
# Folder that holds every data file used by this notebook.
folder_name = 'project_data'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race; it still raises if 'project_data' exists but is not a directory.
os.makedirs(folder_name, exist_ok=True)

* Check for enhanced twitter archive file (provided): `twitter-archive-enhanced.csv`
    + If it doesn't exist, this file needs to be downloaded from Udacity
    + Load data into Pandas DataFrame: `tweet_archive_df` 

In [103]:
# Expected name and in-project location of the provided archive CSV
tweet_archive_file_name = 'twitter-archive-enhanced.csv'
tweet_archive_file_path = os.path.join(folder_name, tweet_archive_file_name)

# A copy found beside the notebook is relocated into the data folder first
if os.path.isfile(tweet_archive_file_name):
    move(tweet_archive_file_name, tweet_archive_file_path)
    print("'{}' moved to '{}'.".format(tweet_archive_file_name, tweet_archive_file_path))

# Stop with download instructions when the archive is still missing
assert os.path.isfile(tweet_archive_file_path), (
    "Download 'twitter-archive-enhanced.csv' to '{}'.".format(tweet_archive_file_path)
)

# Read the archive into a DataFrame
tweet_archive_df = pd.read_csv(tweet_archive_file_path)

* Check for image prediction data file: `image-predictions.tsv`
    + If it doesn't exist, download it programmatically from the URL
    + Load data into Pandas DataFrame: `image_predictions_df`

In [4]:
# Check for image-predictions.tsv
# if image-predictions.tsv doesn't exist, download it programmatically
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
predictions_file_path = os.path.join(folder_name, url.split('/')[-1])
if not os.path.isfile(predictions_file_path):
    # Fetch first and only then create the file: a failed request must not
    # leave an empty or error-page file behind, because its mere existence
    # would skip the download on every later run.
    response = requests.get(url)
    response.raise_for_status()
    with open(predictions_file_path, mode='wb') as file:
        file.write(response.content)
# Load into DataFrame; the file is tab-separated, and `sep` is passed by
# keyword because newer pandas no longer accepts it positionally.
image_predictions_df = pd.read_csv(predictions_file_path, sep='\t')

* Check for twitter json file: `tweet_json.txt`
    + If it doesn't exist, download the tweet JSON data using the Tweepy library
    + Load data into Pandas DataFrame: `tweet_json_df`

In [82]:
# Name and in-project path of the line-delimited JSON cache of tweet statuses
text_file_name = 'tweet_json.txt'
text_file_path = os.path.join(folder_name, text_file_name)

def write_tweets(tweepy_statuses, file_path):
    """Append tweepy Status objects to ``file_path`` as JSON, one per line.

    Parameters
    ----------
    tweepy_statuses : iterable
        Objects exposing the raw status dictionary as ``._json``.
    file_path : str
        File to append to; it is created when it does not exist yet.
    """
    # Write to the path the caller passed in, not the module-level
    # `text_file_path`, so the function honours its own argument.
    with open(file_path, 'a') as file:
        for tweepy_status in tweepy_statuses:
            file.write(json.dumps(tweepy_status._json) + '\n')
            
# list to sublist generator from:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
        
def create_tweet_json_file(tweet_ids, file_path):
    """Download the statuses for ``tweet_ids`` and append their JSON to ``file_path``.

    Parameters
    ----------
    tweet_ids : sequence
        Tweet ids to look up; must support ``len()`` and slicing.
    file_path : str
        File that receives one JSON string per retrieved status.
    """
    # The ids are looked up in batches of 100 per API call.
    # Options tried earlier and dropped: include_entities=False (media data
    # not needed), trim_user=False (keeps follower data) and map_=True
    # (noted as not working).
    for chunk in chunks(tweet_ids, 100):
        chunk_statuses = api.statuses_lookup(id_=list(chunk))
        write_tweets(chunk_statuses, file_path)
    
# Download the tweet JSON only when no cached copy exists yet.
downloaded = not os.path.isfile(text_file_path)
if downloaded:
    tweet_ids = tweet_archive_df.tweet_id
    create_tweet_json_file(tweet_ids, text_file_path) #first run contained 2343 JSON strings

# Count the stored tweets (one JSON string per line); the `with` block makes
# sure the file handle is closed again.
with open(text_file_path) as file:
    tweet_count = sum(1 for _ in file)

if downloaded:
    print("Downloaded {} tweets to '{}' as JSON".format(tweet_count, text_file_path))
else:
    print("{} tweets exist in '{}'".format(tweet_count, text_file_path))

# load into DataFrame
tweet_json_df = pd.read_json(text_file_path, lines=True)

Downloaded 2343 tweets to 'project_data/tweet_json.txt' as JSON


# Assess

## `tweet_archive_df`

In [62]:
tweet_archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

The DataFrame contains tweet statuses that are replies and retweets, which may contain quality issues. The `timestamp` doesn't seem to be imported as a datetime.

Before looking at those, check example data of what's in the column fields.

In [194]:
tweet_archive_df.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


Let's revisit retweets and replies.

In [None]:
# tweets that are replies
# .info() says there should be 78
78 == sum(tweet_archive_df.in_reply_to_status_id.notnull())

In [None]:
# tweets that are retweets
# .info() says there should be 181
181 == sum(tweet_archive_df.retweeted_status_id.notnull())

In [195]:
tweet_archive_df[tweet_archive_df.retweeted_status_id.notnull()][0:3]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
19,888202515573088257,,,2017-07-21 01:02:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Canela. She attempted s...,8.87474e+17,4196984000.0,2017-07-19 00:47:34 +0000,https://twitter.com/dog_rates/status/887473957...,13,10,Canela,,,,
32,886054160059072513,,,2017-07-15 02:45:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...,8.860537e+17,19607400.0,2017-07-15 02:44:07 +0000,https://twitter.com/dog_rates/status/886053434...,12,10,,,,,
36,885311592912609280,,,2017-07-13 01:35:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Lilly. She just paralle...,8.305833e+17,4196984000.0,2017-02-12 01:04:29 +0000,https://twitter.com/dog_rates/status/830583320...,13,10,Lilly,,,,


In [197]:
tweet_archive_df[tweet_archive_df.rating_numerator.notnull()][0:3]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


Revisit the `timestamp` column to see what we're working with.

In [126]:
# check timestamp format
tweet_archive_df.timestamp[0:5]

0    2017-08-01 16:23:56 +0000
1    2017-08-01 00:17:27 +0000
2    2017-07-31 00:18:03 +0000
3    2017-07-30 15:58:51 +0000
4    2017-07-29 16:00:24 +0000
Name: timestamp, dtype: object

Issues:
* tweets archive includes retweets and replies, which could contain quality issues
* datetime column `timestamp` is an `object` type (string)

## `image_predictions_df`

In [128]:
image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


Nothing stands out from `.info()`, but `img_num` may create problems on inspection.

In [129]:
image_predictions_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


Still looking okay.

Let's check into `img_num` and see if there's anything other than 1.

In [150]:
# image_predictions_df[image_predictions_df.img_num != 1]

# there's a lot; let's narrow it down to a slice
image_predictions_df[image_predictions_df.img_num != 1][0:5]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
144,668623201287675904,https://pbs.twimg.com/media/CUdtP1xUYAIeBnE.jpg,4,Chihuahua,0.708163,True,Pomeranian,0.091372,True,titi,0.067325,False
312,671547767500775424,https://pbs.twimg.com/media/CVHRIiqWEAAj98K.jpg,2,Loafer,0.255088,False,platypus,0.090019,False,cowboy_boot,0.066536,False
315,671735591348891648,https://pbs.twimg.com/media/CVJ79MzW4AEpTom.jpg,2,stone_wall,0.271121,False,Irish_wolfhound,0.063078,True,poncho,0.048226,False
319,671768281401958400,https://pbs.twimg.com/media/CVKZsHtWwAA6gPj.jpg,2,Chihuahua,0.500373,True,French_bulldog,0.112796,True,Italian_greyhound,0.062893,True
345,672272411274932228,https://pbs.twimg.com/media/CVRkLuJWUAAhhYp.jpg,2,pug,0.914685,True,Norwegian_elkhound,0.014982,True,Siamese_cat,0.009221,False


Let's check by the `tweet_id` of an entry with higher `img_num` to see if there are multiple entries (one for each `img_num`) in case it is a way to code for tweets with multiple images.

In [152]:
image_predictions_df[image_predictions_df.tweet_id == 668623201287675904]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
144,668623201287675904,https://pbs.twimg.com/media/CUdtP1xUYAIeBnE.jpg,4,Chihuahua,0.708163,True,Pomeranian,0.091372,True,titi,0.067325,False


Not sure exactly what `img_num` is since there is no indication on how the images were parsed to identify only 3 objects, but there aren't multiple entries to account for higher image numbers.

Looking at the rows above where `img_num` is not 1, some of the predictions (e.g. `Loafer`, `stone_wall`) are not dogs.

In [153]:
image_predictions_df[image_predictions_df.p1_dog == False][0:5]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,0.033919,False,partridge,5.2e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,0.085547,False,bookcase,0.07948,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,0.01525,False,great_grey_owl,0.013207,False


In [151]:
len(image_predictions_df[image_predictions_df.p1_dog == False])

543

There are some images that don't have dogs as the first identified object in the picture.

Are there any where none of the top 3 predictions is a dog?

In [145]:
# Rows where none of the three predictions is flagged as a dog
image_predictions_df[~(image_predictions_df.p1_dog |
                       image_predictions_df.p2_dog |
                       image_predictions_df.p3_dog)]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,4.588540e-02,False,terrapin,1.788530e-02,False
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,3.391940e-02,False,partridge,5.206580e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,8.554740e-02,False,bookcase,7.947970e-02,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,1.525000e-02,False,great_grey_owl,1.320720e-02,False
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,2.402450e-03,False,hamster,4.608630e-04,False
29,666411507551481857,https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg,1,coho,0.404640,False,barracouta,2.714850e-01,False,gar,1.899450e-01,False
45,666786068205871104,https://pbs.twimg.com/media/CUDmZIkWcAAIPPe.jpg,1,snail,0.999888,False,slug,5.514170e-05,False,acorn,2.625800e-05,False
50,666837028449972224,https://pbs.twimg.com/media/CUEUva1WsAA2jPb.jpg,1,triceratops,0.442113,False,armadillo,1.140710e-01,False,common_iguana,4.325530e-02,False
51,666983947667116034,https://pbs.twimg.com/media/CUGaXDhW4AY9JUH.jpg,1,swab,0.589446,False,chain_saw,1.901420e-01,False,wig,3.450970e-02,False
53,667012601033924608,https://pbs.twimg.com/media/CUG0bC0U8AAw2su.jpg,1,hyena,0.987230,False,African_hunting_dog,1.260080e-02,False,coyote,5.735010e-05,False


In [154]:
# Number of rows where none of the three predictions is flagged as a dog
len(image_predictions_df[~(image_predictions_df.p1_dog |
                           image_predictions_df.p2_dog |
                           image_predictions_df.p3_dog)])

324

There seem to be a lot of images that don't contain dogs. Looking at random images, a lot of them legitimately don't contain a dog, but some, like 666268910803644416, do actually have a dog.

Since the twitter account is humorous and playful, rating other animals is an act of humor and not necessarily a quality issue. However, the intent of looking at the data is to examine the dog ratings or posts with dog ratings from WeRateDogs (@dog_rates), so this will be considered a quality issue in this case.

Issue:
* tweeted pictures aren't always of dogs

## `tweet_json_df`

What does this look like after import?

In [155]:
tweet_json_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2343 entries, 0 to 2342
Data columns (total 28 columns):
contributors                 0 non-null float64
coordinates                  0 non-null float64
created_at                   2343 non-null datetime64[ns]
extended_entities            1823 non-null object
favorite_count               2343 non-null int64
favorited                    2343 non-null bool
geo                          0 non-null float64
id                           2343 non-null int64
id_str                       2343 non-null int64
in_reply_to_screen_name      78 non-null object
in_reply_to_status_id        78 non-null float64
in_reply_to_status_id_str    78 non-null float64
in_reply_to_user_id          78 non-null float64
in_reply_to_user_id_str      78 non-null float64
is_quote_status              2343 non-null bool
lang                         2343 non-null object
place                        1 non-null object
possibly_sensitive           2206 non-null float64
quoted

In [105]:
tweet_json_df.head(3)

Unnamed: 0,contributors,coordinates,created_at,extended_entities,favorite_count,favorited,geo,id,id_str,in_reply_to_screen_name,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,,,2017-06-18 16:57:37,,18717,False,,876484053909872640,876484053909872640,,...,,,,2410,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Benedict. He wants to thank you for th...,True,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1,,,2017-07-15 23:25:31,,21109,False,,886366144734445568,886366144734445568,,...,,,,3203,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Roscoe. Another pupper fallen victim t...,True,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
2,,,2017-06-21 19:36:23,"{'media': [{'id': 876850756556607488, 'id_str'...",0,False,,877611172832227328,877611172832227328,,...,,,,81,False,{'created_at': 'Mon Jun 19 17:14:49 +0000 2017...,"<a href=""http://twitter.com/download/iphone"" r...",RT @rachel2195: @dog_rates the boyfriend and h...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."


Issues:
* JSON tree includes compound fields with another dictionary tree (multiple data in a column)
* contains data fields that duplicate those in `tweet_archive_df`

Check for missing entries.

In [114]:
print("tweet archive contains {} entries.".format(len(tweet_archive_df.tweet_id)))
print("tweepy api retrieved {} entries.".format(len(tweet_json_df.id)))

tweet archive contains 2356 entries.
tweepy api retrieved 2343 entries.


Issue:
* There are missing entries from the api versus the id's present in the tweet archive.

## Issues (aggregated)
* `tweet_archive_df`
    + tweets archive includes retweets and replies
    + datetime column `timestamp` is an `object` type (string)
* `image_predictions_df`
    + tweeted pictures aren't always of dogs
* `tweet_json_df`
    + multiple data in several columns
        * JSON tree includes compound fields with another dictionary tree (e.g. `user`, `extended_entities`)
    + contains data fields that duplicate those in `tweet_archive_df`
    + too much information; a lot of trash fields.
    + missing entries from the api versus the id's present in the tweet archive.

# Clean

## `tweet_archive_df`

In [174]:
# Copy to another DataFrame to clean
tweet_archive_clean = tweet_archive_df.copy()

In [187]:
tweet_archive_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

---
#### Define
The `timestamp` column contains string text. To be usable for graphing and analysis, the data in this series needs to be changed to `datetime` objects.

#### Code

In [182]:
# get example time string from index 0 to check format
example_time = tweet_archive_clean.timestamp[0]
example_time

'2017-08-01 16:23:56 +0000'

In [181]:
# test strptime format with example string from index 0
example_time = tweet_archive_clean.timestamp[0]
datetime.strptime(example_time,'%Y-%m-%d %H:%M:%S %z')

2017-08-01 16:23:56 +0000


datetime.datetime(2017, 8, 1, 16, 23, 56, tzinfo=datetime.timezone.utc)

In [185]:
# use pd.to_datetime() to change series of string date & time to datetime objects
# pd.to_datetime(tweet_archive_clean.timestamp, format='%Y-%m-%d %H:%M:%S %z')

# pd.to_datetime() doesn't accept '%z', so create function and map column
def string_to_datetime(string):
    """Parse a 'YYYY-MM-DD HH:MM:SS +ZZZZ' string into a timezone-aware datetime."""
    timestamp_format = '%Y-%m-%d %H:%M:%S %z'
    parsed = datetime.strptime(string, timestamp_format)
    return parsed
# Map from the untouched raw frame so the cell can be re-run: mapping the
# already-converted column would hand datetimes to strptime and fail.
tweet_archive_clean.timestamp = tweet_archive_df.timestamp.map(string_to_datetime)

#### Test

In [186]:
tweet_archive_clean.timestamp.head()

0   2017-08-01 16:23:56+00:00
1   2017-08-01 00:17:27+00:00
2   2017-07-31 00:18:03+00:00
3   2017-07-30 15:58:51+00:00
4   2017-07-29 16:00:24+00:00
Name: timestamp, dtype: datetime64[ns, UTC]

## `image_predictions_df`

In [None]:
# Copy to another DataFrame to clean
image_predictions_clean = image_predictions_df.copy()

## 

## `tweet_json_df`

In [None]:
# Copy to another DataFrame to clean
tweet_json_clean = tweet_json_df.copy()

Examine JSON structure to find fields of interest.

In [104]:
# use first tweet as test tweet:
test_tweet_id = tweet_archive_df.tweet_id[0]
test_tweet_api = api.statuses_lookup([test_tweet_id])
# pretty-print JSON of test tweet
pprint(test_tweet_api[0]._json)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'entities': {'hashtags': [],
              'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
                         'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
                         'id': 892420639486877696,
                         'id_str': '892420639486877696',
                         'indices': [86, 109],
                         'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'sizes': {'large': {'h': 528,
                                             'resize': 'fit',
                                             'w': 540},
                                   'medium': {'h': 528,
                                              'resize': 'fit',
                                              'w': 540},
           

Fields that are interesting:
    * 'retweet_count'
    * 'favorite_count'
    * 'user'['followers_count']
 The rest is duplicate, media data, etc.

---
#### Define
Try to piece together information about missing tweets.

In [208]:
# find missing tweet ids: archive ids for which the API returned no status.
# A vectorised membership test replaces the per-id scan of the id array.
missing_tweet_ids = tweet_archive_df.tweet_id[
    ~tweet_archive_df.tweet_id.isin(tweet_json_df.id)
].tolist()

In [249]:
# For every archive tweet the bulk lookup did not return, ask the API for that
# tweet individually and record the error code and message it answers with.
missing_tweets = defaultdict(list)
missing_tweets['tweets_id'] = list(missing_tweet_ids)
missing_tweets['code'], missing_tweets['message'] = [], []
for tweet_id in missing_tweets['tweets_id']:
    try:
        # Query the id of this iteration, not always the first missing id.
        api.get_status(tweet_id)
    except tweepy.TweepError as error:
        if error.response is None:
            # No API response to parse (e.g. a connection failure): surface it.
            raise
        # The response body is JSON; parse it as data rather than eval()-ing
        # text that came from an external service.
        details = json.loads(error.response.text)['errors'][0]
        code, message = details['code'], details['message']
    else:
        # The status could be fetched after all; keep the lists aligned.
        code, message = None, None
    missing_tweets['code'].append(code)
    missing_tweets['message'].append(message)

In [250]:
missing_tweets

defaultdict(list,
            {'tweets_id': [888202515573088257,
              873697596434513921,
              869988702071779329,
              866816280283807744,
              861769973181624320,
              845459076796616705,
              842892208864923648,
              837012587749474308,
              827228250799742977,
              802247111496568832,
              775096608509886464,
              770743923962707968,
              754011816964026368],
             'code': [144,
              144,
              144,
              144,
              144,
              144,
              144,
              144,
              144,
              144,
              144,
              144,
              144],
             'message': ['No status found with that ID.',
              'No status found with that ID.',
              'No status found with that ID.',
              'No status found with that ID.',
              'No status found with that ID.',
              'No status

In [247]:
# Inspect the API error for one missing tweet. The exception is captured in
# this cell because Python deletes the `except ... as error` name when the
# clause ends, so it cannot be reused from an earlier cell.
try:
    api.get_status(missing_tweet_ids[0])
except tweepy.TweepError as error:
    print(error.api_code)
    # The response body is JSON; parse it instead of eval()-ing external text.
    message = json.loads(error.response.text)['errors'][0]
    print(message)

144
{'code': 144, 'message': 'No status found with that ID.'}


https://developer.twitter.com/en/docs/basics/response-codes.html