# Wrangle and Analyze Data


Project Description

In [61]:
import pandas as pd
import os
import io
import requests
import numpy as np
import json
from PIL import Image


## Gather

In [116]:
# WeRateDogs Twitter archive.

df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [68]:
# Tweet image predictions
urlData = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv').content

df_images = pd.read_csv(io.StringIO(urlData.decode('utf-8')), delimiter='\t')

df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [11]:
# Twitter API 

import tweepy

consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit = True, wait_on_rate_limit_notify = True)


In [5]:
# Array all Tweets ID's

tweets_id = np.asarray(tw_archive['tweet_id'])
tweets_id

array([892420643555336193, 892177421306343426, 891815181378084864, ...,
       666033412701032449, 666029285002620928, 666020888022790149])

In [40]:
# Get all JSON files from ID, store in list and dump into .txt file. 
    
with open('tweet_json.txt', 'a+', encoding='utf-8') as outfile:
    for a in tweets_id:
        try:
            tweet = api.get_status(a, tweet_mode = 'extended')
            outfile.write(json.dumps(tweet))
            outfile.write('\n')
        
        except: 
            pass
      
outfile.close()


Rate limit reached. Sleeping for: 3
Rate limit reached. Sleeping for: 30


In [41]:
# Create list from .txt

with open('tweet_json.txt') as file:
    status = []
    for line in file:
        status.append(json.loads(line))

In [46]:
# Create Dataframe from list 

df_tweets = pd.DataFrame(status, columns = ['id','retweet_count', 'favorite_count'])

df_tweets.head()


Unnamed: 0,id,retweet_count,favorite_count
0,892420643555336193,7733,36346
1,892177421306343426,5727,31311
2,891815181378084864,3789,23592
3,891689557279858688,7901,39660
4,891327558926688256,8520,37832


## Assess and Clean


### Quality 
##### `Archive` dataframe
- Erroneous datatypes(columns - timestamp and retweeted_status_timestamp)
- Data inside html tags (column source)
- Innacurate denominator, values different from 10 (column rating_denominator)]
- Innacurate numerator, has too large values (column rating_numerator)
- Retweeted tweets.
- Missing values (column expanded_urls)
- Some sources different than Twitter.


##### `Tweets` dataframe
- No issues

##### `Images` dataframe
- p1, p2 and p3 columns have underscore between words.

#### Visual Assessment

In [461]:
df_archive.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
429,821107785811234820,,,2017-01-16 21:32:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a doggo who looks like he's about to gi...,,,,https://twitter.com/dog_rates/status/821107785...,11,10,,doggo,,,
1388,700462010979500032,,,2016-02-18 23:28:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Murphy. He's a mini golden retriever. ...,,,,https://twitter.com/dog_rates/status/700462010...,6,10,Murphy,,,,
458,817908911860748288,,,2017-01-08 01:40:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Looks like he went cross-eyed trying way too h...,,,,https://twitter.com/micahgrimes/status/8179020...,12,10,,,,,
97,873337748698140672,,,2017-06-10 00:35:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Sierra. She's one preci...,8.732138e+17,4196984000.0,2017-06-09 16:22:42 +0000,https://www.gofundme.com/help-my-baby-sierra-g...,12,10,Sierra,,,pupper,
2039,671547767500775424,,,2015-12-01 04:33:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Marley. She chews shoes then feels ext...,,,,https://twitter.com/dog_rates/status/671547767...,10,10,Marley,,,,


In [462]:
df_images.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
16,666102155909144576,https://pbs.twimg.com/media/CT54YGiWUAEZnoK.jpg,1,English_setter,0.298617,True,Newfoundland,0.149842,True,borzoi,0.133649,True
1100,720775346191278080,https://pbs.twimg.com/media/CgC1WqMW4AI1_N0.jpg,1,Newfoundland,0.48997,True,groenendael,0.174497,True,giant_schnauzer,0.079067,True
582,678798276842360832,https://pbs.twimg.com/media/CWuTbAKUsAAvZHh.jpg,1,Airedale,0.583122,True,silky_terrier,0.129567,True,Lakeland_terrier,0.094727,True
1910,853299958564483072,https://pbs.twimg.com/media/C9eHyF7XgAAOxPM.jpg,1,grille,0.65228,False,beach_wagon,0.112846,False,convertible,0.086252,False
286,671151324042559489,https://pbs.twimg.com/media/CVBokRSWsAADuXx.jpg,1,Rottweiler,0.781201,True,black-and-tan_coonhound,0.061206,True,kelpie,0.048856,True


In [463]:
df_tweets.sample(5)

Unnamed: 0,id,retweet_count,favorite_count
900,755955933503782912,2910,7405
319,832757312314028032,3655,16990
85,875747767867523072,3929,23743
1666,681339448655802368,4077,9294
1314,705066031337840642,618,2181


#### Programmatic Assessment

In [454]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [455]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [456]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              2331 non-null   int64
 1   retweet_count   2331 non-null   int64
 2   favorite_count  2331 non-null   int64
dtypes: int64(3)
memory usage: 54.8 KB


In [457]:
df_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [460]:
df_images.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [459]:
df_tweets.describe()

Unnamed: 0,id,retweet_count,favorite_count
count,2331.0,2331.0,2331.0
mean,7.419079e+17,2715.41613,7592.785929
std,6.82317e+16,4592.831407,11780.997024
min,6.660209e+17,1.0,0.0
25%,6.78267e+17,548.5,1324.5
50%,7.182469e+17,1275.0,3305.0
75%,7.986692e+17,3154.0,9301.5
max,8.924206e+17,78188.0,156904.0


#### Define 
    - Convert columns timestamp and retweeted_status_timestamp to timestamp type. 
    - Remove html link tag in column source.
    - Replace denominators different than 10. 
    - Remove rows with non standard numerators. 
    - Remove tweets that are retweets.
    - Remove rows with missing expanded_urls.
    - Replace _ with space in p1, p2 and p3 columns. 
    - Remove sources different from twitter. 

##### Achive Dataframe

In [418]:
df_clean_archive = df_archive.copy()

In [419]:
# Convert columns timestamp and retweeted_status_timestamp to datetime
df_clean_archive['timestamp'] = pd.to_datetime(df_archive['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
df_clean_archive['retweeted_status_timestamp'] = pd.to_datetime(df_archive['retweeted_status_timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# Test 
df_archive.info(), df_clean_archive.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

(None, None)

In [420]:
# Remove <a> link tag in column source
df_clean_archive['source'].unique()

# Use regular expressions to use only the content of the html tag
df_clean_archive['source'] = df_clean_archive.source.str.extract(r'>(.*?)<')

# Test 
df_clean_archive['source'].unique()


array(['Twitter for iPhone', 'Twitter Web Client', 'Vine - Make a Scene',
       'TweetDeck'], dtype=object)

In [421]:
# Replace denominators different than 10. 
df_clean_archive.query('rating_denominator != 10')

# Set all rating denominators to 10. 
df_clean_archive['rating_denominator'] = 10

# Test 
df_clean_archive.query('rating_denominator != 10')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [422]:
# Remove non standard numerators
remove_id = df_clean_archive.query('rating_numerator > 20')

# Remove rows with rating_numerators higher than 20. 
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_id.index)]

# Test
df_clean_archive.query('rating_numerator > 20')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [423]:
# Remove tweets that are retweets 

# Verify possible values for column in_reply_to_status_id
df_clean_archive.retweeted_status_id.unique()

# Store retweets to be removed
remove_retweet = df_clean_archive.query('retweeted_status_id != "nan"')

# Remove retweets 
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_retweet.index)]

# Test 
df_clean_archive.query('retweeted_status_id != "nan"')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [424]:
# Remove records if expanded_urls column null

# Store missing expanded_urls
remove_miss_exp_url = df_clean_archive[df_clean_archive['expanded_urls'].isnull()]

# Remove missing expanded_urls
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_miss_exp_url.index)]

# Test
df_clean_archive[df_clean_archive['expanded_urls'].isnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [425]:
# Remove record with source Vine - Make a Scene

non_twitter_source = df_clean_archive.query('source == "Vine - Make a Scene"')

df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(non_twitter_source.index)]

# Test
df_clean_archive.source.unique()


array(['Twitter for iPhone', 'Twitter Web Client', 'TweetDeck'],
      dtype=object)

##### Images Dataframe

In [430]:
df_clean_images = df_images.copy()

In [431]:
df_clean_images.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

In [432]:
# Replace underscore with space and capitalize

df_clean_images['p1'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p1'] = df_clean_images.p1.str.capitalize()

df_clean_images['p2'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p2'] = df_clean_images.p1.str.capitalize()

df_clean_images['p3'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p3'] = df_clean_images.p1.str.capitalize()

df_clean_images.sample(5)



Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1805,832273440279240704,https://pbs.twimg.com/ext_tw_video_thumb/83227...,1,Pembroke,0.134081,True,Pembroke,0.051928,False,Pembroke,0.044311,True
1433,773547596996571136,https://pbs.twimg.com/media/Crwxb5yWgAAX5P_.jpg,1,Norwegian elkhound,0.372202,True,Norwegian elkhound,0.137187,True,Norwegian elkhound,0.071436,True
153,668655139528511488,https://pbs.twimg.com/media/CUeKTeYW4AEr_lx.jpg,1,Beagle,0.31911,True,Beagle,0.103338,True,Beagle,0.09193,True
1245,747512671126323200,https://pbs.twimg.com/media/Cl-yykwWkAAqUCE.jpg,1,Cardigan,0.111493,True,Cardigan,0.095089,True,Cardigan,0.080146,True
294,671355857343524864,https://pbs.twimg.com/media/CVEilyCUwAETbJ-.jpg,1,Miniature poodle,0.313811,True,Miniature poodle,0.165585,True,Miniature poodle,0.056094,True


### Tidiness 
##### `Archive` dataframe
- Columns doggo, floofer, pupper and puppo have values in both column and rows. 
- Since retweeted tweets will not be used, retweeted columns are useless. 
- In some cases, a dog might have two dog stages. 

##### `Tweets` dataframe

##### `Images` dataframe


#### Define 
    - Create new column dog_stage
    - Remove columns retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp.
    - Remove dogs with more than one dog stage. 
    - Merge archive data frame with images dataframe
    - Merge the new data frame with tweets dataframe
    

##### Achive Dataframe

In [433]:
df_clean_archive.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1565,688064179421470721,,,2016-01-15 18:24:18+00:00,Twitter for iPhone,This is Kilo. He's a Pouncing Brioche. Really ...,,,NaT,https://twitter.com/dog_rates/status/688064179...,11,10,Kilo,,,,
1697,681231109724700672,,,2015-12-27 21:52:07+00:00,Twitter for iPhone,I just love this pic. 11/10 this pupper is goi...,,,NaT,https://twitter.com/dog_rates/status/681231109...,11,10,,,,pupper,
2243,667902449697558528,,,2015-11-21 03:08:47+00:00,Twitter for iPhone,This is Cleopatricia. She is a northern Paperb...,,,NaT,https://twitter.com/dog_rates/status/667902449...,9,10,Cleopatricia,,,,
1897,674737130913071104,,,2015-12-09 23:47:22+00:00,Twitter for iPhone,Meet Rufio. He is unaware of the pink legless ...,,,NaT,https://twitter.com/dog_rates/status/674737130...,10,10,Rufio,,,pupper,
1163,723673163800948736,,,2016-04-23 00:41:42+00:00,Twitter for iPhone,This is Ivar. She is a badass Viking warrior. ...,,,NaT,https://twitter.com/dog_rates/status/723673163...,10,10,Ivar,,,,


In [434]:
# Remove columns related to retweets 

df_clean_archive.drop('retweeted_status_id', axis=1, inplace=True)
df_clean_archive.drop('retweeted_status_user_id', axis=1, inplace=True)
df_clean_archive.drop('retweeted_status_timestamp', axis=1, inplace=True)

# Test 

df_clean_archive.head()


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [435]:
# Verify dog stages values

print(df_clean_archive.doggo.unique())
print(df_clean_archive.floofer.unique())
print(df_clean_archive.pupper.unique())
print(df_clean_archive.puppo.unique())

['None' 'doggo']
['None' 'floofer']
['None' 'pupper']
['None' 'puppo']


In [436]:
df_clean_archive['doggo'].replace('None', '', inplace=True)
df_clean_archive['floofer'].replace('None', '', inplace=True)
df_clean_archive['pupper'].replace('None', '', inplace=True)
df_clean_archive['puppo'].replace('None', '', inplace=True)

# Verify dog stages values after replace

print(df_clean_archive.doggo.unique())
print(df_clean_archive.floofer.unique())
print(df_clean_archive.pupper.unique())
print(df_clean_archive.puppo.unique())

['' 'doggo']
['' 'floofer']
['' 'pupper']
['' 'puppo']


In [437]:
# Create new column dog_stages 

df_clean_archive['dog_stages'] = (df_clean_archive['doggo'] + 
                                  df_clean_archive['floofer'] +
                                  df_clean_archive['pupper'] +
                                  df_clean_archive['puppo'] 
                                 )

df_clean_archive.dog_stages.unique()

array(['', 'doggo', 'puppo', 'pupper', 'floofer', 'doggopuppo',
       'doggofloofer', 'doggopupper'], dtype=object)

In [438]:
# Drop columns doggo, puppo, pupper and floofer 

df_clean_archive.drop('doggo', axis=1, inplace=True)
df_clean_archive.drop('floofer', axis=1, inplace=True)
df_clean_archive.drop('pupper', axis=1, inplace=True)
df_clean_archive.drop('puppo', axis=1, inplace=True)

# Test

df_clean_archive.sample(3)


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,dog_stages
1145,727175381690781696,,,2016-05-02 16:38:15+00:00,Twitter for iPhone,This is Karll. He just wants to go kayaking. 1...,https://twitter.com/dog_rates/status/727175381...,10,10,Karll,
1029,745712589599014916,,,2016-06-22 20:18:30+00:00,Twitter for iPhone,This is Percy. He fell asleep at the wheel. Ir...,https://twitter.com/dog_rates/status/745712589...,7,10,Percy,
2343,666073100786774016,,,2015-11-16 01:59:36+00:00,Twitter for iPhone,Let's hope this flight isn't Malaysian (lol). ...,https://twitter.com/dog_rates/status/666073100...,10,10,,


In [439]:
# Dogs with more than one dog stage

print('before removing', df_clean_archive.dog_stages.unique())

# Remove doggopuppo, doggofloofer, doggopupper

remove_dogstage = df_clean_archive.query('dog_stages == "doggopuppo" or dog_stages == "doggofloofer" or dog_stages == "doggopupper"')
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_dogstage.index)] 

df_clean_archive['dog_stages'].replace('', 'NaN', inplace=True)

# Test 
print('after removing', df_clean_archive.dog_stages.unique())                                      

before removing ['' 'doggo' 'puppo' 'pupper' 'floofer' 'doggopuppo' 'doggofloofer'
 'doggopupper']
after removing ['NaN' 'doggo' 'puppo' 'pupper' 'floofer']


#### Dataframe Merge

In [442]:
# Merge image dataframe with archive dataframe. Only tweets with images 

df_new = df_clean_archive.merge(df_clean_images, left_on = 'tweet_id', right_on = 'tweet_id', suffixes=('_archive','_images'))

print(df_new.info())

df_new.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1964 entries, 0 to 1963
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   tweet_id               1964 non-null   int64              
 1   in_reply_to_status_id  21 non-null     float64            
 2   in_reply_to_user_id    21 non-null     float64            
 3   timestamp              1964 non-null   datetime64[ns, UTC]
 4   source                 1964 non-null   object             
 5   text                   1964 non-null   object             
 6   expanded_urls          1964 non-null   object             
 7   rating_numerator       1964 non-null   int64              
 8   rating_denominator     1964 non-null   int64              
 9   name                   1964 non-null   object             
 10  dog_stages             1964 non-null   object             
 11  jpg_url                1964 non-null   object           

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,...,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,892420643555336193,,,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,...,1,Orange,0.097049,False,Orange,0.085851,False,Orange,0.07611,False
1,892177421306343426,,,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,...,1,Chihuahua,0.323581,True,Chihuahua,0.090647,True,Chihuahua,0.068957,True
2,891815181378084864,,,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,...,1,Chihuahua,0.716012,True,Chihuahua,0.078253,True,Chihuahua,0.031379,True
3,891689557279858688,,,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,...,1,Paper towel,0.170278,False,Paper towel,0.168086,True,Paper towel,0.040836,False
4,891327558926688256,,,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,...,2,Basset,0.555712,True,Basset,0.22577,True,Basset,0.175219,True


#### Final Dataframe

In [447]:
# Merge df_new with df_tweets 

df = df_new.merge(df_tweets, left_on = 'tweet_id', right_on = 'id')

df.drop(['id'], axis=1, inplace=True)

print(df.info())

df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1957 entries, 0 to 1956
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   tweet_id               1957 non-null   int64              
 1   in_reply_to_status_id  21 non-null     float64            
 2   in_reply_to_user_id    21 non-null     float64            
 3   timestamp              1957 non-null   datetime64[ns, UTC]
 4   source                 1957 non-null   object             
 5   text                   1957 non-null   object             
 6   expanded_urls          1957 non-null   object             
 7   rating_numerator       1957 non-null   int64              
 8   rating_denominator     1957 non-null   int64              
 9   name                   1957 non-null   object             
 10  dog_stages             1957 non-null   object             
 11  jpg_url                1957 non-null   object           

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
0,892420643555336193,,,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,...,0.097049,False,Orange,0.085851,False,Orange,0.07611,False,7733,36346
1,892177421306343426,,,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,...,0.323581,True,Chihuahua,0.090647,True,Chihuahua,0.068957,True,5727,31311
2,891815181378084864,,,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,...,0.716012,True,Chihuahua,0.078253,True,Chihuahua,0.031379,True,3789,23592
3,891689557279858688,,,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,...,0.170278,False,Paper towel,0.168086,True,Paper towel,0.040836,False,7901,39660
4,891327558926688256,,,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,...,0.555712,True,Basset,0.22577,True,Basset,0.175219,True,8520,37832


In [450]:
# Save final dataframe to .csv

df.to_csv('twitter_archive_master.csv')

## Analyzing, and Visualizing 