In [1]:
# library imports
import numpy as np
import pandas as pd
# NOTE(review): this binds `plt` to the top-level matplotlib package rather than
# matplotlib.pyplot, so calls such as plt.plot() would fail — confirm intent.
import matplotlib as plt
import requests
import os
import json
from twitter_api import api # imports tweepy api with keys
from pprint import pprint

# Gather

* Create and/or set directory for data files to be stored

In [2]:
# Folder that holds every data file used by this notebook.
# exist_ok=True makes re-running the cell a no-op and removes the separate
# exists-then-create check.
folder_name = 'project_data'
os.makedirs(folder_name, exist_ok=True)

* Check for enhanced twitter archive file (provided): `twitter-archive-enhanced.csv`
    + If it doesn't exist, this file needs to be downloaded from Udacity
    + Load data into Pandas DataFrame: `tweet_archive_df` 

In [3]:
# Locate the provided archive CSV inside the data folder
tweet_archive_file_path = os.path.join(folder_name, 'twitter-archive-enhanced.csv')
# The archive cannot be fetched automatically, so stop with a hint when it is missing
missing_archive_message = "Download twitter-archive-enhanced.csv to '{}'".format(tweet_archive_file_path)
assert os.path.isfile(tweet_archive_file_path), missing_archive_message
# Read the archive into a DataFrame
tweet_archive_df = pd.read_csv(tweet_archive_file_path)

* Check for image prediction data file: `image-predictions.tsv`
    + If it doesn't exist, download programmatically from URL
    + Load data into Pandas DataFrame: `image_predictions_df`

In [4]:
# Download image-predictions.tsv into the data folder unless a copy is already there
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
predictions_file_path = os.path.join(folder_name, url.split('/')[-1])
if not os.path.isfile(predictions_file_path):
    # Fetch first and fail loudly on an HTTP error, so that an error page or an
    # empty file is never written and then mistaken for the data on the next run
    response = requests.get(url)
    response.raise_for_status()
    with open(predictions_file_path, mode = 'wb') as file:
        file.write(response.content)
# Load into DataFrame; the separator is passed by keyword because recent pandas
# releases no longer accept it as a positional argument
image_predictions_df = pd.read_csv(predictions_file_path, sep='\t')

* Check for twitter json file: `tweet_json.txt`
    + If it doesn't exist, download Tweet JSON data using the Tweepy library
    + Load data into Pandas DataFrame: `tweet_json_df`

In [53]:
# Text file inside the data folder that stores the tweet JSON, one JSON string per line
text_file_name = 'tweet_json.txt'
text_file_path = os.path.join(folder_name, text_file_name)

def write_tweets(tweepy_statuses, file_path):
    """Append tweepy Status objects to ``file_path`` as JSON, one per line.

    Args:
        tweepy_statuses: iterable of objects that expose the raw tweet payload
            through a ``_json`` attribute.
        file_path: path of the text file to append to; it is created when it
            does not exist yet.
    """
    # Write to the path that was passed in, not to the module-level
    # text_file_path, so the function honours its own argument.
    with open(file_path, 'a') as file:
        for tweepy_status in tweepy_statuses:
            file.write(json.dumps(tweepy_status._json) + '\n')
            
# Sub-list generator adapted from:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items each."""
    slice_starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in slice_starts)
        
def create_tweet_json_file(tweet_ids, file_path):
    """Look up the given tweet_ids in batches of 100 and append their JSON to file_path."""
    for id_batch in chunks(tweet_ids, 100):
        # One lookup call per batch; the returned statuses are written straight away
        write_tweets(api.statuses_lookup(list(id_batch)), file_path)
    
# Download the tweet JSON only when the text file does not exist yet
if not os.path.isfile(text_file_path):
    tweet_ids = tweet_archive_df.tweet_id
    create_tweet_json_file(tweet_ids, text_file_path) #first run contained 2343 JSON strings
    status_message = "Downloaded {} tweets to '{}' as JSON"
else:
    status_message = "{} tweets exist in '{}'"

# Count the stored tweets (one JSON string per line); the context manager
# guarantees the file handle is closed once the count is done
with open(text_file_path) as json_file:
    tweet_count = sum(1 for _ in json_file)
print(status_message.format(tweet_count, text_file_path))

# load into DataFrame
tweet_json_df = pd.read_json(text_file_path, lines = True)

Downloaded 2343 tweets to 'project_data/tweet_json.txt' as JSON


# Assess

## `tweet_json_df`

Examine JSON structure.

In [57]:
# use first tweet as test tweet:
test_tweet_id = tweet_archive_df.tweet_id[0]
# look the single id up through the API; the result is indexed like a list below
test_tweet_api = api.statuses_lookup([test_tweet_id])
# pretty-print JSON of test tweet
pprint(test_tweet_api[0]._json)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'entities': {'hashtags': [],
              'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
                         'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
                         'id': 892420639486877696,
                         'id_str': '892420639486877696',
                         'indices': [86, 109],
                         'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'sizes': {'large': {'h': 528,
                                             'resize': 'fit',
                                             'w': 540},
                                   'medium': {'h': 528,
                                              'resize': 'fit',
                                              'w': 540},
           

In [None]:
# NOTE(review): presumably the JSON fields of interest for later analysis — confirm:
# 'retweet_count', 'favorite_count'

## Twitter Archive (Enhanced)

### Assess [Twitter Archive]

In [None]:
# Summary of columns, dtypes and non-null counts for the archive
tweet_archive_df.info()

In [None]:
# Replies are the rows with a non-null in_reply_to_status_id;
# .info() says there should be 78
int(tweet_archive_df.in_reply_to_status_id.notnull().sum()) == 78

In [None]:
# Retweets are the rows with a non-null retweeted_status_id;
# .info() says there should be 181
int(tweet_archive_df.retweeted_status_id.notnull().sum()) == 181

In [None]:
# Look at the first five timestamp values to see how they are formatted
tweet_archive_df.timestamp.iloc[:5]

In [None]:
# Frequency of each distinct retweeted_status_user_id value
tweet_archive_df.retweeted_status_user_id.value_counts()

#### Issues:
    * tweets include retweets and replies
    * `timestamp` needs to be in datetime format for use in Python

### Clean [Twitter Archive]

In [None]:
# Clean a copy so the DataFrame loaded into tweet_archive_df is left as it was read
tweet_archive_clean = tweet_archive_df.copy()

In [None]:
# List the column labels available for cleaning
tweet_archive_clean.columns

---
#### Define
The `timestamp` column contains string text. To be usable for graphing and analysis, the data in this series needs to be changed to `datetime` objects.

#### Code

In [None]:
from datetime import datetime

# test strptime format with example string from index 0
# datetime.strptime('2017-08-01 16:23:56 +0000','%Y-%m-%d %H:%M:%S %z')

# use pd.to_datetime() to change series of string date & time to datetime objects
# pd.to_datetime(tweet_archive_clean.timestamp, format='%Y-%m-%d %H:%M:%S %z')

# The author found that pd.to_datetime() would not accept '%z' here, so each value
# is parsed individually with datetime.strptime and the function is mapped over the column.
# NOTE(review): newer pandas releases may handle '%z' directly — confirm before simplifying.
def string_to_datetime(string):
    """Convert a 'YYYY-MM-DD HH:MM:SS +ZZZZ' string into a timezone-aware datetime.

    Values that are already datetime instances are returned unchanged, so mapping
    this function over a column that has been converted before (for example when
    the cell is re-run) does not raise a TypeError.

    Args:
        string: timestamp text such as '2017-08-01 16:23:56 +0000', or a
            datetime instance.

    Returns:
        A datetime carrying the UTC offset found in the text.

    Raises:
        ValueError: if the text does not match the expected format.
    """
    if isinstance(string, datetime):
        return string
    return datetime.strptime(string, '%Y-%m-%d %H:%M:%S %z')
# Replace the text timestamps with the parsed datetime values
tweet_archive_clean['timestamp'] = tweet_archive_clean['timestamp'].map(string_to_datetime)

#### Test

In [None]:
# Show the first rows of the converted column to check the result of the mapping
tweet_archive_clean.timestamp.head()

---
#### Define
The archive contains retweets and replies. The replies and retweets need to be removed so that only original tweets remain.

#### Code

#### Test

## Image Predictions

### Assess

In [None]:
# Summary of columns, dtypes and non-null counts for the image predictions
image_predictions_df.info()

In [None]:
# Preview the first rows of the image predictions
image_predictions_df.head()

In [None]:
# Rows whose p1_dog flag is False
image_predictions_df.loc[image_predictions_df.p1_dog.eq(False)]

### Clean

In [None]:
# Clean a copy so the DataFrame loaded into image_predictions_df is left as it was read
image_predictions_clean = image_predictions_df.copy()