In [248]:
# library imports
import numpy as np
import pandas as pd
import matplotlib as plt
import requests
import os
import tweepy
import json
from twitter_api import api # imports tweepy api with keys
from pprint import pprint
from shutil import move
from datetime import datetime
from collections import defaultdict

# Gather

* Create and/or set directory for data files to be stored

In [2]:
# Directory that holds every data file gathered by this notebook
folder_name = 'project_data'
# exist_ok=True makes this a no-op when the directory is already present,
# so no separate (race-prone) os.path.exists() check is needed
os.makedirs(folder_name, exist_ok=True)

* Check for enhanced twitter archive file (provided): `twitter-archive-enhanced.csv`
    + If doesn't exist, this file needs to be downloaded from Udacity
    + Load data into Pandas DataFrame: `tweet_archive_df` 

In [103]:
# Locate twitter-archive-enhanced.csv inside the data folder
tweet_archive_file_name = 'twitter-archive-enhanced.csv'
tweet_archive_file_path = os.path.join(folder_name, tweet_archive_file_name)

# A copy sitting in the same directory as the project file
# is relocated into the 'project_data' folder
archive_beside_notebook = os.path.isfile(tweet_archive_file_name)
if archive_beside_notebook:
    move(tweet_archive_file_name, tweet_archive_file_path)
    moved_message = "'{}' moved to '{}'.".format(tweet_archive_file_name, tweet_archive_file_path)
    print(moved_message)

# Stop here with download instructions when the archive is still missing
missing_message = "Download 'twitter-archive-enhanced.csv' to '{}'.".format(tweet_archive_file_path)
assert os.path.isfile(tweet_archive_file_path), missing_message

# Load into DataFrame
tweet_archive_df = pd.read_csv(tweet_archive_file_path)

* Check for image prediction data file: `image-predictions.tsv`
    + If it doesn't exist, download programmatically from URL
    + Load data into Pandas DataFrame: `image_predictions_df`

In [4]:
# Check for image-predictions.tsv
# if image-predictions.tsv doesn't exist, download programmatically
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
predictions_file_path = os.path.join(folder_name, url.split('/')[-1])
if not os.path.isfile(predictions_file_path):
    # Fetch before opening the file and fail loudly on an HTTP error; otherwise
    # a failed download would leave an empty/invalid file behind that later
    # runs would mistake for a valid cached copy.
    response = requests.get(url)
    response.raise_for_status()
    with open(predictions_file_path, mode='wb') as file:
        file.write(response.content)
# Load into DataFrame (separator passed by keyword; positional use is not
# accepted by newer pandas releases)
image_predictions_df = pd.read_csv(predictions_file_path, sep='\t')

* Check for twitter json file: `tweet_json.txt`
    + If doesn't exist, download Tweet json data using Tweepy library
    + Load data into Pandas DataFrame: `tweet_json_df`

In [82]:
# Local file holding the tweet JSON fetched through tweepy (one JSON string per line)
text_file_name = 'tweet_json.txt'
text_file_path = os.path.join(folder_name, text_file_name)

def write_tweets(tweepy_statuses, file_path):
    """Append tweepy Status objects to ``file_path``, one JSON string per line.

    Args:
        tweepy_statuses: iterable of objects exposing the raw tweet payload
            through a ``_json`` attribute.
        file_path: path of the text file to append to; created if missing.
    """
    # Write to the path the caller passed in rather than a module-level global,
    # so the function honours its own parameter.
    with open(file_path, 'a') as file:
        for tweepy_status in tweepy_statuses:
            file.write(json.dumps(tweepy_status._json) + '\n')
            
# list to sublist generator from:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items."""
    slice_starts = range(0, len(l), n)
    for begin in slice_starts:
        end = begin + n
        yield l[begin:end]
        
def create_tweet_json_file(tweet_ids, file_path):
    """Look up the given tweet ids in batches and append their JSON to file_path.

    Ids are sent to ``api.statuses_lookup`` 100 at a time; each batch of
    returned statuses is handed to ``write_tweets``.
    """
    batch_size = 100
    for id_batch in chunks(tweet_ids, batch_size):
        batch_ids = list(id_batch)
        statuses = api.statuses_lookup(id_ = batch_ids)
        write_tweets(statuses, file_path)
    
# Download the tweet JSON only when the local file is missing
downloaded = not os.path.isfile(text_file_path)
if downloaded:
    tweet_ids = tweet_archive_df.tweet_id
    create_tweet_json_file(tweet_ids, text_file_path) #first run contained 2343 JSON strings

# Count stored tweets (one per line); the with-block guarantees the file
# handle is closed instead of leaking it from a bare open() call
with open(text_file_path) as json_file:
    tweet_count = sum(1 for _ in json_file)

if downloaded:
    print("Downloaded {} tweets to '{}' as JSON".format(tweet_count, text_file_path))
else:
    print("{} tweets exist in '{}'".format(tweet_count, text_file_path))

# load into DataFrame
tweet_json_df = pd.read_json(text_file_path, lines = True)

Downloaded 2343 tweets to 'project_data/tweet_json.txt' as JSON


# Assess

## `tweet_archive_df`

In [62]:
tweet_archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

Issues:
* tweets archive includes retweets and replies, which create quality issues
* datetime column `timestamp` is an `object` type (string)

In [194]:
tweet_archive_df.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


Issues:
* `source` contains HTML fragments, but the column probably isn't needed.
* doggo, floofer, pupper, and puppo seem like they don't need to be separate columns

From the fields, numerators and denominators could be a source of quality errors.

In [296]:
tweet_archive_df.rating_numerator.value_counts().sort_index()

0         2
1         9
2         9
3        19
4        17
5        37
6        32
7        55
8       102
9       158
10      461
11      464
12      558
13      351
14       54
15        2
17        1
20        1
24        1
26        1
27        1
44        1
45        1
50        1
60        1
75        2
80        1
84        1
88        1
99        1
121       1
143       1
144       1
165       1
182       1
204       1
420       2
666       1
960       1
1776      1
Name: rating_numerator, dtype: int64

Since this account loves dogs and says all of them are perfect, a rating of 0 is unexpected and may point to extraction errors among the low scores.

In [364]:
tweet_archive_df[tweet_archive_df.rating_numerator == 0].sort_values(by=['rating_numerator'],  ascending=False)[['tweet_id','rating_numerator', 'rating_denominator', 'text']][0:10]

Unnamed: 0,tweet_id,rating_numerator,rating_denominator,text
315,835152434251116546,0,10,When you're so blinded by your systematic plag...
1016,746906459439529985,0,10,"PUPDATE: can't see any. Even if I could, I cou..."


The entries with zero seem fine.

The distribution of values seems to be mainly below 20, so that will be the arbitrary cutoff for starting a closer look at quality errors.

In [350]:
tweet_archive_df[tweet_archive_df.rating_numerator > 20].sort_values(by=['rating_numerator'],  ascending=False)[['tweet_id','rating_numerator', 'rating_denominator', 'text']][0:10]

Unnamed: 0,tweet_id,rating_numerator,rating_denominator,text
979,749981277374128128,1776,10,This is Atticus. He's quite simply America af....
313,835246439529840640,960,0,@jonnysun @Lin_Manuel ok jomny I know you're e...
189,855860136149123072,666,10,@s8n You tried very hard to portray this good ...
188,855862651834028034,420,10,@dhmontgomery We also gave snoop dogg a 420/10...
2074,670842764863651840,420,10,After so many requests... here you go.\n\nGood...
1120,731156023742988288,204,170,Say hello to this unbelievably well behaved sq...
290,838150277551247360,182,10,@markhoppus 182/10
902,758467244762497024,165,150,Why does this never happen at my front door......
1779,677716515794329600,144,120,IT'S PUPPERGEDDON. Total of 144/120 ...I think...
1634,684225744407494656,143,130,"Two sneaky puppers were not initially seen, mo..."


Even looking at the truncated `text` shows that some are of multiple dogs, like 677716515794329600. This view also shows that there are issues with the denominator as well, which seems to correlate with the number of dogs. To be sure, isolating the text of the top and bottom 10 of values above 20 might help confirm, since the bottom 10 still needs to be looked at anyhow.

In [373]:
# rows with a numerator above 20, ranked from both ends
above_20 = tweet_archive_df[tweet_archive_df.rating_numerator > 20]
# ten largest numerators above 20
top_10_above_20 = above_20.sort_values(by=['rating_numerator'], ascending=False)[0:10]
# ten smallest numerators above 20
bottom_10_above_20 = above_20.sort_values(by=['rating_numerator'], ascending=True)[0:10]

In [370]:
def print_tweet_text(tweet_df):
    """Print the full ``text`` value of every row in ``tweet_df``, in row order."""
    for tweet in tweet_df.text:
        print(tweet)
print("Top 10:")
print_tweet_text(top_10_above_20)
print("\n\nBottom 10:")
print_tweet_text(bottom_10_above_20)

top 10:
This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh
@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10
@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research
After so many requests... here you go.

Good dogg. 420/10 https://t.co/yfAAo1gdeY
Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
@markhoppus 182/10
Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3

bottom 10:
Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. 
Keep Sam smil

The values above 20 seem to be mostly of multiple dogs, with the exception of Snoop Dogg as well as tweet_id = 810984652412424192 that uses the colloquialism '24/7' for all the time. Looking at the text also shows that there are probably some issues with decimals, since no decimal values have popped up. Cross-checking the value with the above text for the `bottom_10_above_20` will confirm.

In [372]:
bottom_10_above_20[['tweet_id','rating_numerator', 'rating_denominator', 'text']]

Unnamed: 0,tweet_id,rating_numerator,rating_denominator,text
516,810984652412424192,24,7,Meet Sam. She smiles 24/7 &amp; secretly aspir...
1712,680494726643068929,26,10,Here we have uncovered an entire battalion of ...
763,778027034220126208,27,10,This is Sophie. She's a Jubilant Bush Pupper. ...
1433,697463031882764288,44,40,Happy Wednesday here's a bucket of pups. 44/40...
1274,709198395643068416,45,50,"From left to right:\nCletus, Jerome, Alejandro..."
1202,716439118184652801,50,50,This is Bluebert. He just saw that both #Final...
1351,704054845121142784,60,50,Here is a whole flock of puppers. 60/50 I'll ...
340,832215909146226688,75,10,"RT @dog_rates: This is Logan, the Chow who liv..."
695,786709082849828864,75,10,"This is Logan, the Chow who lived. He solemnly..."
1254,710658690886586372,80,80,Here's a brigade of puppers. All look very pre...


The regex extraction captured only the digits after the decimal point, not the full decimal value.

The '24/7' brings up the question of the denominator distribution.

In [375]:
tweet_archive_df.rating_denominator.value_counts().sort_index()

0         1
2         1
7         1
10     2333
11        3
15        1
16        1
20        2
40        1
50        3
70        1
80        2
90        1
110       1
120       1
130       1
150       1
170       1
Name: rating_denominator, dtype: int64

In [377]:
print_tweet_text(tweet_archive_df[tweet_archive_df.rating_denominator < 10])

@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. 
Keep Sam smiling by clicking and sharing this link:
https://t.co/98tB8y7y7t https://t.co/LouL5vdvxx
This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv


There aren't too many. The first one is a reply. The '24/7' was already discussed. The last one pulled the wrong pair because of the '1/2'.

In [378]:
tweet_archive_df[tweet_archive_df.rating_denominator == 2].tweet_id

2335    666287406224695296
Name: tweet_id, dtype: int64

Issues:
* numerator extraction of decimals extracted the number behind the '.' as the whole number rating
* numerators/denominator pairs with denominator above 10 are for multiple dogs
* tweet_id = 810984652412424192 isn't actually a rating but uses '24/7' as 'all the time'
* tweet_id = 666287406224695296 with the denominator of 2 was extracted from '1/2' instead of the rating of '9/10'

The other issues should be eliminated by getting rid of retweets/replies and non-dogs from the image_prediction data.

## `image_predictions_df`

In [128]:
image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


Nothing stands out from `.info()`, but `img_num` may create problems on inspection.

In [129]:
image_predictions_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


Still looking okay.

Let's check into `img_num` and see if there's anything other than 1.

In [150]:
# image_predictions_df[image_predictions_df.img_num != 1]

# there's a lot; let's narrow it down to a slice
image_predictions_df[image_predictions_df.img_num != 1][0:5]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
144,668623201287675904,https://pbs.twimg.com/media/CUdtP1xUYAIeBnE.jpg,4,Chihuahua,0.708163,True,Pomeranian,0.091372,True,titi,0.067325,False
312,671547767500775424,https://pbs.twimg.com/media/CVHRIiqWEAAj98K.jpg,2,Loafer,0.255088,False,platypus,0.090019,False,cowboy_boot,0.066536,False
315,671735591348891648,https://pbs.twimg.com/media/CVJ79MzW4AEpTom.jpg,2,stone_wall,0.271121,False,Irish_wolfhound,0.063078,True,poncho,0.048226,False
319,671768281401958400,https://pbs.twimg.com/media/CVKZsHtWwAA6gPj.jpg,2,Chihuahua,0.500373,True,French_bulldog,0.112796,True,Italian_greyhound,0.062893,True
345,672272411274932228,https://pbs.twimg.com/media/CVRkLuJWUAAhhYp.jpg,2,pug,0.914685,True,Norwegian_elkhound,0.014982,True,Siamese_cat,0.009221,False


Let's check by the `tweet_id` of an entry with higher `img_num` to see if there are multiple entries (one for each `img_num`) in case it is a way to code for tweets with multiple images.

In [380]:
image_predictions_df[image_predictions_df.tweet_id == 668623201287675904]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
144,668623201287675904,https://pbs.twimg.com/media/CUdtP1xUYAIeBnE.jpg,4,Chihuahua,0.708163,True,Pomeranian,0.091372,True,titi,0.067325,False


Not sure exactly what `img_num` is since there is no indication on how the images were parsed to identify only 3 objects, but there aren't multiple entries to account for higher image numbers.

Looking at `img_num`, there seem to be image identifications that may not be of dogs.

In [153]:
image_predictions_df[image_predictions_df.p1_dog == False][0:5]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,0.033919,False,partridge,5.2e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,0.085547,False,bookcase,0.07948,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,0.01525,False,great_grey_owl,0.013207,False


In [151]:
len(image_predictions_df[image_predictions_df.p1_dog == False])

543

There are some images that don't have dogs as the first identified object in the picture.

Are there any that don't contain dogs as the primary 3 objects?

In [145]:
# rows where none of the three predictions is flagged as a dog
image_predictions_df[~image_predictions_df.p1_dog &
                     ~image_predictions_df.p2_dog &
                     ~image_predictions_df.p3_dog]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,4.588540e-02,False,terrapin,1.788530e-02,False
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,3.391940e-02,False,partridge,5.206580e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,8.554740e-02,False,bookcase,7.947970e-02,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,1.525000e-02,False,great_grey_owl,1.320720e-02,False
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,2.402450e-03,False,hamster,4.608630e-04,False
29,666411507551481857,https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg,1,coho,0.404640,False,barracouta,2.714850e-01,False,gar,1.899450e-01,False
45,666786068205871104,https://pbs.twimg.com/media/CUDmZIkWcAAIPPe.jpg,1,snail,0.999888,False,slug,5.514170e-05,False,acorn,2.625800e-05,False
50,666837028449972224,https://pbs.twimg.com/media/CUEUva1WsAA2jPb.jpg,1,triceratops,0.442113,False,armadillo,1.140710e-01,False,common_iguana,4.325530e-02,False
51,666983947667116034,https://pbs.twimg.com/media/CUGaXDhW4AY9JUH.jpg,1,swab,0.589446,False,chain_saw,1.901420e-01,False,wig,3.450970e-02,False
53,667012601033924608,https://pbs.twimg.com/media/CUG0bC0U8AAw2su.jpg,1,hyena,0.987230,False,African_hunting_dog,1.260080e-02,False,coyote,5.735010e-05,False


In [154]:
# number of rows where none of the three predictions is flagged as a dog
len(image_predictions_df[~image_predictions_df.p1_dog &
                         ~image_predictions_df.p2_dog &
                         ~image_predictions_df.p3_dog])

324

There seem to be a lot of images that don't contain dogs. Looking at random images, a lot of them legitimately don't contain a dog, but some, like 666268910803644416, do actually have a dog.

Since the twitter account is humorous and playful, rating other animals is an act of humor and not necessarily a quality issue. However, the intent of looking at the data is to examine the dog ratings or posts with dog ratings from WeRateDogs (@dog_rates), so this will be considered a quality issue in this case.

Issue:
* tweeted pictures aren't always of dogs

## `tweet_json_df`

What does this look like after import?

In [155]:
tweet_json_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2343 entries, 0 to 2342
Data columns (total 28 columns):
contributors                 0 non-null float64
coordinates                  0 non-null float64
created_at                   2343 non-null datetime64[ns]
extended_entities            1823 non-null object
favorite_count               2343 non-null int64
favorited                    2343 non-null bool
geo                          0 non-null float64
id                           2343 non-null int64
id_str                       2343 non-null int64
in_reply_to_screen_name      78 non-null object
in_reply_to_status_id        78 non-null float64
in_reply_to_status_id_str    78 non-null float64
in_reply_to_user_id          78 non-null float64
in_reply_to_user_id_str      78 non-null float64
is_quote_status              2343 non-null bool
lang                         2343 non-null object
place                        1 non-null object
possibly_sensitive           2206 non-null float64
quoted

In [105]:
tweet_json_df.head(3)

Unnamed: 0,contributors,coordinates,created_at,extended_entities,favorite_count,favorited,geo,id,id_str,in_reply_to_screen_name,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,,,2017-06-18 16:57:37,,18717,False,,876484053909872640,876484053909872640,,...,,,,2410,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Benedict. He wants to thank you for th...,True,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1,,,2017-07-15 23:25:31,,21109,False,,886366144734445568,886366144734445568,,...,,,,3203,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Roscoe. Another pupper fallen victim t...,True,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
2,,,2017-06-21 19:36:23,"{'media': [{'id': 876850756556607488, 'id_str'...",0,False,,877611172832227328,877611172832227328,,...,,,,81,False,{'created_at': 'Mon Jun 19 17:14:49 +0000 2017...,"<a href=""http://twitter.com/download/iphone"" r...",RT @rachel2195: @dog_rates the boyfriend and h...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."


Issues:
* JSON tree includes compound fields with another dictionary tree (multiple data in a column)
* contains data fields that duplicate those in 'tweet_archive_df'

Check for missing entries.

In [114]:
print("tweet archive contains {} entries.".format(len(tweet_archive_df.tweet_id)))
print("tweepy api retrieved {} entries.".format(len(tweet_json_df.id)))

tweet archive contains 2356 entries.
tweepy api retrieved 2343 entries.


Issue:
* There are missing entries from the api versus the id's present in the tweet archive.

## Issues (aggregated)

* `tweet_archive_df`
    + tweets archive includes retweets and replies, which create quality issues
    + datetime column timestamp is an object type (string)
    + `source` contains HTML fragments, but the column probably isn't needed.
    + DataFrame contains information extraneous to the analysis
    + doggo, floofer, pupper, and puppo seem like they don't need to be separate columns
    + numerator extraction of decimals extracted the number behind the '.' as the whole number rating
    + numerators/denominator pairs with denominator above 10 are for multiple dogs
    + tweet_id = 810984652412424192 isn't actually a rating but uses '24/7' as 'all the time'
    + tweet_id = 666287406224695296 with the denominator of 2 was extracted from '1/2' instead of the rating of '9/10'
* `image_predictions_df`
    + tweeted pictures aren't always of dogs
* `tweet_json_df`
    + multiple data in several columns
        * JSON tree includes compound fields that hold another dictionary tree
    + contains duplicate data fields as 'tweet_archive_df'
    + too much information; a lot of trash fields.
    + missing entries from the api versus the id's present in the tweet archive.
* all
    + tweet_id has an inconsistent data type
    + data should be consolidated into one DataFrame

# Clean

## `tweet_archive_df`

In [408]:
# Copy to another DataFrame to clean
tweet_archive_clean = tweet_archive_df.copy()

---
#### Define
Drop the retweets and replies to eliminate quality problems from tweets that aren't primary rating pictures.

#### Code

In [393]:
# check for relevant column names
list(tweet_archive_clean.columns.values)

['tweet_id',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'timestamp',
 'source',
 'text',
 'retweeted_status_id',
 'retweeted_status_user_id',
 'retweeted_status_timestamp',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']

In [409]:
# keep only entries that have neither a reply id nor a retweet id
is_reply = tweet_archive_clean.in_reply_to_status_id.notnull()
is_retweet = tweet_archive_clean.retweeted_status_id.notnull()
tweet_archive_clean = tweet_archive_clean[~(is_reply | is_retweet)]

In [410]:
# columns that only describe replies and retweets
reply_retweet_columns = ['in_reply_to_status_id',
                         'in_reply_to_user_id',
                         'retweeted_status_id',
                         'retweeted_status_user_id',
                         'retweeted_status_timestamp']
# remove them from the cleaned archive
tweet_archive_clean = tweet_archive_clean.drop(reply_retweet_columns, axis=1)

#### Test

In [414]:
# get the replies and retweets from the untouched archive
reply_or_retweet  = tweet_archive_df[(tweet_archive_df.in_reply_to_status_id.notnull()) |
                                     (tweet_archive_df.retweeted_status_id.notnull())]
# True if any reply/retweet id is still present in the clean DataFrame (expected: False).
# isin() does the membership test in one vectorized pass instead of rebuilding a
# list for every id, and .any() checks membership itself rather than the
# truthiness of the id value.
bool(reply_or_retweet.tweet_id.isin(tweet_archive_clean.tweet_id).any())

False

In [396]:
# check for relevant column names
list(tweet_archive_clean.columns.values)

['tweet_id',
 'timestamp',
 'source',
 'text',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']

---
#### Define
The `timestamp` column contains string text. To be usable for graphing and analysis, the data in this series needs to be changed to `datetime` objects.

#### Code

In [415]:
# get example time string from the first row to check format
# (.iloc is positional; a label lookup with [0] only works while the row
# labelled 0 has not been filtered out of the DataFrame)
example_time = tweet_archive_clean.timestamp.iloc[0]
example_time

'2017-08-01 16:23:56 +0000'

In [416]:
# test strptime format with example string from the first row
# (.iloc is positional; a label lookup with [0] only works while the row
# labelled 0 has not been filtered out of the DataFrame)
example_time = tweet_archive_clean.timestamp.iloc[0]
datetime.strptime(example_time,'%Y-%m-%d %H:%M:%S %z')

datetime.datetime(2017, 8, 1, 16, 23, 56, tzinfo=datetime.timezone.utc)

In [417]:
# use pd.to_datetime() to change series of string date & time to datetime objects
# pd.to_datetime(tweet_archive_clean.timestamp, format='%Y-%m-%d %H:%M:%S %z')

# pd.to_datetime() doesn't accept '%z', so create function and map column
def string_to_datetime(string):
    """Parse a 'YYYY-MM-DD HH:MM:SS +ZZZZ' string into a timezone-aware datetime."""
    timestamp_format = '%Y-%m-%d %H:%M:%S %z'
    parsed = datetime.strptime(string, timestamp_format)
    return parsed
tweet_archive_clean.timestamp = tweet_archive_clean.timestamp.map(string_to_datetime)

#### Test

In [418]:
tweet_archive_clean.timestamp.head()

0   2017-08-01 16:23:56+00:00
1   2017-08-01 00:17:27+00:00
2   2017-07-31 00:18:03+00:00
3   2017-07-30 15:58:51+00:00
4   2017-07-29 16:00:24+00:00
Name: timestamp, dtype: datetime64[ns, UTC]

---
#### Define
Delete extraneous information not needed for analysis.

In [420]:
# check for relevant column names
list(tweet_archive_clean.columns.values)

['tweet_id',
 'timestamp',
 'source',
 'text',
 'expanded_urls',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']


Columns to be deleted:
* 'source' - contains device/interface that was used to post
    + deleting source will also fix the residual HTML tags.
* 'expanded_urls' - contains url of tweet

#### Code

In [421]:
# drop the columns not needed for the analysis: 'source' and 'expanded_urls'
tweet_archive_clean = tweet_archive_clean.drop(['source',
                                                'expanded_urls'],
                                               axis=1)

#### Test

In [422]:
# check for updated column names
list(tweet_archive_clean.columns.values)

['tweet_id',
 'timestamp',
 'text',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo']

---
#### Define
Change doggo, floofer, pupper, and puppo from separate columns into one column.

#### Code

In [444]:
# every column except the last four; this is positional, so it presumably relies
# on doggo/floofer/pupper/puppo being the final four columns -- verify column order
columns_to_keep = list(tweet_archive_clean.columns.values[:-4])
columns_to_keep

['tweet_id',
 'timestamp',
 'text',
 'rating_numerator',
 'rating_denominator',
 'name']

In [511]:
def is_stage(data):
    """Return 0 for the placeholder string 'None' and 1 for any other value."""
    if data == 'None':
        return 0
    return 1

def multiple_stages(df, stage_columns=('doggo', 'floofer', 'pupper', 'puppo')):
    """Count how many dog-stage columns are filled in on each row.

    A stage cell counts as filled when its value differs from the
    placeholder string 'None'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``stage_columns``.
    stage_columns : sequence of str, optional
        Names of the stage columns to inspect; defaults to the four
        dog-stage columns.

    Returns
    -------
    pandas.Series
        Integer count of filled stage columns per row, aligned to
        ``df.index``.

    Raises
    ------
    KeyError
        If any of ``stage_columns`` is missing from ``df``.
    """
    # One vectorized comparison over all stage columns replaces a separate
    # per-element mapping for each column.
    return df[list(stage_columns)].ne('None').sum(axis=1)

# Show the tweets that are tagged with more than one dog stage.
tweet_archive_clean.loc[multiple_stages(tweet_archive_clean).gt(1)]

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
191,855851453814013952,2017-04-22 18:31:02+00:00,Here's a puppo participating in the #ScienceMa...,13,10,,doggo,,,puppo
200,854010172552949760,2017-04-17 16:34:26+00:00,"At first I thought this was a shy doggo, but i...",11,10,,doggo,floofer,,
460,817777686764523521,2017-01-07 16:59:28+00:00,"This is Dido. She's playing the lead role in ""...",13,10,Dido,doggo,,pupper,
531,808106460588765185,2016-12-12 00:29:28+00:00,Here we have Burke (pupper) and Dexter (doggo)...,12,10,,doggo,,pupper,
575,801115127852503040,2016-11-22 17:28:25+00:00,This is Bones. He's being haunted by another d...,12,10,Bones,doggo,,pupper,
705,785639753186217984,2016-10-11 00:34:48+00:00,This is Pinot. He's a sophisticated doggo. You...,10,10,Pinot,doggo,,pupper,
733,781308096455073793,2016-09-29 01:42:20+00:00,"Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",12,10,,doggo,,pupper,
889,759793422261743616,2016-07-31 16:50:42+00:00,"Meet Maggie &amp; Lila. Maggie is the doggo, L...",12,10,Maggie,doggo,,pupper,
956,751583847268179968,2016-07-09 01:08:47+00:00,Please stop sending it pictures that don't eve...,5,10,,doggo,,pupper,
1063,741067306818797568,2016-06-10 00:39:48+00:00,This is just downright precious af. 12/10 for ...,12,10,just,doggo,,pupper,


In [494]:
# Reshape to long form: the stage columns left over after `columns_to_keep`
# become rows, with the column name in 'stages' and its value in 'dog_stage'.
test = tweet_archive_clean.melt(
    id_vars=columns_to_keep,
    var_name='stages',
    value_name='dog_stage',
)

In [None]:
# Flag repeated tweet ids among the melted rows that carry a real stage label
# (rows where the value matches the column it came from), ordered by stage.
# NOTE(review): the original filter was left unfinished; this condition is an
# assumption about the intent — confirm.
test[test.dog_stage == test.stages].sort_values(by='dog_stage').duplicated(subset='tweet_id')

In [499]:
# Keep only melted rows whose value equals the stage column it came from.
test2 = test.loc[test['dog_stage'] == test['stages']]

In [502]:
# Show every row of a tweet id that occurs more than once, grouped by id.
test2.loc[test2.duplicated('tweet_id', keep=False)].sort_values('tweet_id')

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,stages,dog_stage
897,733109485275860992,2016-05-19 01:38:16+00:00,"Like father (doggo), like son (pupper). Both 1...",12,10,,doggo,doggo
5091,733109485275860992,2016-05-19 01:38:16+00:00,"Like father (doggo), like son (pupper). Both 1...",12,10,,pupper,pupper
848,741067306818797568,2016-06-10 00:39:48+00:00,This is just downright precious af. 12/10 for ...,12,10,just,doggo,doggo
5042,741067306818797568,2016-06-10 00:39:48+00:00,This is just downright precious af. 12/10 for ...,12,10,just,pupper,pupper
748,751583847268179968,2016-07-09 01:08:47+00:00,Please stop sending it pictures that don't eve...,5,10,,doggo,doggo
4942,751583847268179968,2016-07-09 01:08:47+00:00,Please stop sending it pictures that don't eve...,5,10,,pupper,pupper
689,759793422261743616,2016-07-31 16:50:42+00:00,"Meet Maggie &amp; Lila. Maggie is the doggo, L...",12,10,Maggie,doggo,doggo
4883,759793422261743616,2016-07-31 16:50:42+00:00,"Meet Maggie &amp; Lila. Maggie is the doggo, L...",12,10,Maggie,pupper,pupper
562,781308096455073793,2016-09-29 01:42:20+00:00,"Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",12,10,,doggo,doggo
4756,781308096455073793,2016-09-29 01:42:20+00:00,"Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",12,10,,pupper,pupper


In [468]:
# Show the repeat occurrences (all but the first) of each tweet id, ordered by id.
test.loc[test.duplicated('tweet_id')].sort_values('tweet_id')

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,stages,dog_stage
5091,733109485275860992,2016-05-19 01:38:16+00:00,"Like father (doggo), like son (pupper). Both 1...",12,10,,pupper,pupper
5042,741067306818797568,2016-06-10 00:39:48+00:00,This is just downright precious af. 12/10 for ...,12,10,just,pupper,pupper
4942,751583847268179968,2016-07-09 01:08:47+00:00,Please stop sending it pictures that don't eve...,5,10,,pupper,pupper
4883,759793422261743616,2016-07-31 16:50:42+00:00,"Meet Maggie &amp; Lila. Maggie is the doggo, L...",12,10,Maggie,pupper,pupper
4756,781308096455073793,2016-09-29 01:42:20+00:00,"Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",12,10,,pupper,pupper
4730,785639753186217984,2016-10-11 00:34:48+00:00,This is Pinot. He's a sophisticated doggo. You...,10,10,Pinot,pupper,pupper
4640,801115127852503040,2016-11-22 17:28:25+00:00,This is Bones. He's being haunted by another d...,12,10,Bones,pupper,pupper
4610,808106460588765185,2016-12-12 00:29:28+00:00,Here we have Burke (pupper) and Dexter (doggo)...,12,10,,pupper,pupper
4552,817777686764523521,2017-01-07 16:59:28+00:00,"This is Dido. She's playing the lead role in ""...",13,10,Dido,pupper,pupper
2258,854010172552949760,2017-04-17 16:34:26+00:00,"At first I thought this was a shy doggo, but i...",11,10,,floofer,floofer


#### Test

---
#### Define
* Numerators containing decimals were extracted incorrectly: only the digits after the '.' were captured as the whole-number rating.
* Numerator/denominator pairs with a denominator above 10 are ratings for multiple dogs.
* tweet_id = 810984652412424192 isn't actually a rating; it uses '24/7' to mean 'all the time'.
* tweet_id = 666287406224695296: the denominator of 2 was extracted from '1/2' instead of the actual rating of '9/10'.

## `image_predictions_df`

In [None]:
# Work on an independent copy so the gathered frame stays untouched
image_predictions_clean = image_predictions_df.copy(deep=True)

## `tweet_json_df`

In [None]:
# Work on an independent copy so the gathered frame stays untouched
tweet_json_clean = tweet_json_df.copy(deep=True)

Examine JSON structure to find fields of interest.

In [312]:
# 'media_url' of the first media entry attached to the test tweet.
# NOTE(review): `test_tweet_api` is assigned in a later cell (the
# `api.statuses_lookup` call), so that cell must run first on a fresh kernel.
test_tweet_api[0].entities['media'][0]['media_url']

'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg'

In [326]:
# Render the test tweet's first attached image. Keep this call as the cell's
# last expression: an Image object created inside a loop body is discarded
# and never displayed.
# NOTE(review): `Image` is presumably IPython.display.Image; it is not among
# the imports at the top of the notebook — confirm it is imported.
Image(url=test_tweet_api[0].entities['media'][0]['media_url'])

In [313]:
# Check what kind of container the lookup call returned.
type(test_tweet_api)

tweepy.models.ResultSet

In [104]:
# look up the archive's first tweet through the API to get a sample payload
test_tweet_id = tweet_archive_df['tweet_id'][0]
test_tweet_api = api.statuses_lookup([test_tweet_id])
# pretty-print the raw JSON of that sample tweet
pprint(test_tweet_api[0]._json)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'entities': {'hashtags': [],
              'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
                         'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
                         'id': 892420639486877696,
                         'id_str': '892420639486877696',
                         'indices': [86, 109],
                         'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'sizes': {'large': {'h': 528,
                                             'resize': 'fit',
                                             'w': 540},
                                   'medium': {'h': 528,
                                              'resize': 'fit',
                                              'w': 540},
           

Fields that are interesting:
* 'retweet_count'
* 'favorite_count'
* 'user'['followers_count']

The rest is duplicate, media data, etc.

---
#### Define
Piece together information about missing tweets in `tweet_json_df` versus source of tweet ids, `tweet_archive_df`.

At the time of writing, only a small number of tweet JSONs could not be retrieved from the API. With so few missing tweets, we should be able to use `.get_status()` without fear of timeouts, etc.

In [266]:
# set up dict to ultimately create a DataFrame
missing_tweets = defaultdict(list)
# tweet ids that are in the archive but absent from the gathered API data
archive_ids = tweet_archive_df.tweet_id
missing_tweets['tweets_id'] = archive_ids[~archive_ids.isin(tweet_json_df.id)].tolist()
# ask the API about each missing tweet and record the error it reports
for tweet_id in missing_tweets['tweets_id']:
    try:
        # query the id of this iteration, not a fixed element of another list
        api.get_status(tweet_id)
    except tweepy.TweepError as error:
        if error.response is None:
            # no HTTP response to inspect (e.g. connection failure): surface it
            raise
        # the error payload is JSON; parse it rather than eval()-ing remote text
        api_error = json.loads(error.response.text)['errors'][0]
        code, message = api_error['code'], api_error['message']
    else:
        # the tweet could be fetched after all; record blanks so that every
        # column keeps one entry per tweet id
        code, message = None, None
    missing_tweets['code'].append(code)
    missing_tweets['message'].append(message)
# change `missing_tweets` to DataFrame
missing_tweets = pd.DataFrame(missing_tweets)

In [267]:
# verify that the gathered ids, codes and messages were loaded into the DataFrame
missing_tweets.head(3)

Unnamed: 0,tweets_id,code,message
0,888202515573088257,144,No status found with that ID.
1,873697596434513921,144,No status found with that ID.
2,869988702071779329,144,No status found with that ID.


In [268]:
# check occurrences of each error code
missing_tweets.code.value_counts()

144    13
Name: code, dtype: int64

All the codes are 144. According to the Twitter developer [Response Codes](https://developer.twitter.com/en/docs/basics/response-codes.html), code 144 means the tweets were most likely deleted. Therefore, no data recovery is possible; what's in the archive is what's available.