In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import json
import requests
import tweepy

### Data Gathering Section

Loading twitter archive dataset

In [3]:
# Read the enhanced Twitter archive CSV (expected in the working directory)
# into a DataFrame that the rest of the notebook assesses and cleans.
twitter_archive = pd.read_csv(filepath_or_buffer='twitter-archive-enhanced.csv')

Downloading the image predictions for the dogs from the [Udacity server](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv)

In [None]:
# Download the image-predictions TSV from the Udacity-hosted URL.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error (404, 500, ...) into an exception
# so an error page is never saved to disk as if it were the dataset.
response = requests.get(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
    timeout=30,
)
response.raise_for_status()

In [None]:
# Persist the downloaded payload to a local TSV file.
# Binary mode writes the response bytes exactly as they were received.
with open('image_predictions.tsv', mode='wb') as tsv_out:
    tsv_out.write(response.content)

Loading image_predictions.tsv file into a dataframe

In [4]:
# Parse the saved tab-separated predictions file into a DataFrame
# (read_table uses a tab as its default separator).
image_pred = pd.read_table('image_predictions.tsv')

Fetching additional information (e.g. retweet and favorite counts) from the Twitter API for each tweet ID in the Twitter archive dataframe

In [2]:
from getpass import getpass # Package to get consumer key and secret as a password

In [4]:
# Prompt for the four Twitter API credentials without echoing them, so that
# no secret is ever stored in the notebook source or its outputs.
consumer_key = getpass("Consumer key:")
consumer_secret = getpass("Consumer secret:")
access_token = getpass("Access Token:")
access_secret = getpass('Access secret:')

# Build the OAuth handler from the app credentials and attach the user tokens.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit=True makes tweepy wait for the rate-limit window to reset
# instead of failing once the request quota is used up.
api = tweepy.API(auth, wait_on_rate_limit=True)

# Fetch every tweet listed in the archive and store one JSON document per line.
with open('tweet_json.txt', 'w') as file:
    for tweetid in twitter_archive['tweet_id'].values:
        try:
            tweet = api.get_status(tweetid, tweet_mode='extended')
        except Exception as err:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit can still abort this long loop.
            # The real error is printed because a failure is not necessarily
            # a deleted tweet (it may be an auth or network problem).
            print(f'{tweetid} not fetched: {err}')
            continue
        # Writing happens outside the try block so that a disk/serialisation
        # error is raised instead of being reported as a missing tweet.
        json.dump(tweet._json, file)
        file.write('\n')  # newline-delimited JSON: one tweet per line

Loading the tweets retrieved from twitter into a dataframe

In [10]:
# Load the newline-delimited JSON file (one tweet object per line)
# into a DataFrame.
tweets_df = pd.read_json(path_or_buf='tweet_json.txt', lines=True)

In [13]:
# Keep only the tweet fields needed for the rest of the analysis.
# NOTE(review): the tweets_use.info() output captured further down lists
# 'created_at' rather than 'user' -- confirm which column is intended and
# re-run the notebook so code and outputs agree.
tweets_use = tweets_df[[
    'id',
    'retweet_count',
    'favorite_count',
    'lang',
    'user',
]]

### Data assessment

#### Assessing Twitter archive file

In [None]:
# Visual assessment: preview the first rows of the archive to inspect
# the columns and the kind of values they hold.
twitter_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [None]:
# Programmatic assessment: list each column's dtype and non-null count
# to spot missing values and wrongly typed columns.
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [None]:
# Show the distinct values of every column from "doggo" to the last column
# to see how the dog stages (and their missing values) are encoded.
stage_frame = twitter_archive.loc[:, "doggo":]
for column in stage_frame:
    distinct_values = stage_frame[column].unique()
    print(column, "column values:", distinct_values)

doggo column values: ['None' 'doggo']
floofer column values: ['None' 'floofer']
pupper column values: ['None' 'pupper']
puppo column values: ['None' 'puppo']


Data quality
- Timestamp column is stored as a string instead of a datetime object.
- Missing values in the doggo, floofer, pupper and puppo columns are represented by the string 'None' instead of NaN.
- Source column contains a full HTML anchor tag (with its href attribute) instead of the URL only.
- Invalid tweet ids (deleted tweets are not useful).
- Invalid dog names, e.g., "a", "such", "quite", "None", etc.

Tidiness issues
- Dog stages are spread across multiple columns (only one column is required).

#### Additional tweet information

In [None]:
# Programmatic assessment of the additional tweet data: list each column's
# dtype and non-null count.
tweets_use.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              2354 non-null   int64              
 1   retweet_count   2354 non-null   int64              
 2   favorite_count  2354 non-null   int64              
 3   created_at      2354 non-null   datetime64[ns, UTC]
 4   lang            2354 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(1)
memory usage: 92.1+ KB
