# Wrangle and Analyze Data


Project Description

In [61]:
import pandas as pd
import os
import io
import requests
import numpy as np
import json
from PIL import Image


## Gather

In [116]:
# WeRateDogs Twitter archive.

df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [68]:
# Tweet image predictions
urlData = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv').content

df_images = pd.read_csv(io.StringIO(urlData.decode('utf-8')), delimiter='\t')

df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [11]:
# Twitter API 

import tweepy

consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit = True, wait_on_rate_limit_notify = True)


In [5]:
# Array all Tweets ID's

tweets_id = np.asarray(tw_archive['tweet_id'])
tweets_id

array([892420643555336193, 892177421306343426, 891815181378084864, ...,
       666033412701032449, 666029285002620928, 666020888022790149])

In [40]:
# Get all JSON files from ID, store in list and dump into .txt file. 
    
with open('tweet_json.txt', 'a+', encoding='utf-8') as outfile:
    for a in tweets_id:
        try:
            tweet = api.get_status(a, tweet_mode = 'extended')
            outfile.write(json.dumps(tweet))
            outfile.write('\n')
        
        except: 
            pass
      
outfile.close()


Rate limit reached. Sleeping for: 3
Rate limit reached. Sleeping for: 30


In [41]:
# Create list from .txt

with open('tweet_json.txt') as file:
    status = []
    for line in file:
        status.append(json.loads(line))

In [46]:
# Create Dataframe from list 

df_tweets = pd.DataFrame(status, columns = ['id','retweet_count', 'favorite_count'])

df_tweets.head()


Unnamed: 0,id,retweet_count,favorite_count
0,892420643555336193,7733,36346
1,892177421306343426,5727,31311
2,891815181378084864,3789,23592
3,891689557279858688,7901,39660
4,891327558926688256,8520,37832


## Assess and Clean


### Quality 
##### `Archive` dataframe
- Erroneous datatypes(columns - timestamp and retweeted_status_timestamp)
- Data inside html tags (column source)
- Innacurate denominator, values different from 10 (column rating_denominator)]
- Innacurate numerator, has too large values (column rating_numerator)
- Retweeted tweets.
- Missing values (column expanded_urls)
- Some sources different than Twitter.


##### `Tweets` dataframe
- No issues

##### `Images` dataframe
- p1, p2 and p3 columns have underscore between words.

In [187]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [185]:
df_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [186]:
df_archive.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
860,763167063695355904,,,2016-08-10 00:16:21 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Eve. She's a raging alcoho...,6.732953e+17,4196984000.0,2015-12-06 00:17:55 +0000,https://twitter.com/dog_rates/status/673295268...,8,10,Eve,,,pupper,
1236,712438159032893441,,,2016-03-23 00:37:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kane. He's a semi-submerged Haitian Hu...,,,,https://twitter.com/dog_rates/status/712438159...,11,10,Kane,,,,
2162,669393256313184256,,,2015-11-25 05:52:43 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Ronduh. She's a Finnish Checkered Blitzkr...,,,,https://twitter.com/dog_rates/status/669393256...,10,10,Ronduh,,,,
913,757400162377592832,,,2016-07-25 02:20:45 +0000,"<a href=""http://twitter.com/download/iphone"" r...",She walks herself up and down the train to be ...,,,,https://twitter.com/dog_rates/status/757400162...,13,10,,,,,
1550,689154315265683456,,,2016-01-18 18:36:07 +0000,"<a href=""http://twitter.com/download/iphone"" r...",We normally don't rate birds but I feel bad co...,,,,https://twitter.com/dog_rates/status/689154315...,9,10,,,,,


#### Define 
    - Convert columns timestamp and retweeted_status_timestamp to timestamp type. 
    - Remove html link tag in column source.
    - Replace denominators different than 10. 
    - Remove rows with non standard numerators. 
    - Remove tweets that are retweets.
    - Remove rows with missing expanded_urls.
    - Replace _ with space in p1, p2 and p3 columns. 

##### Achive Dataframe

In [284]:
df_clean_archive = df_archive.copy()

In [285]:
# Convert columns timestamp and retweeted_status_timestamp to datetime
df_clean_archive['timestamp'] = pd.to_datetime(df_archive['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
df_clean_archive['retweeted_status_timestamp'] = pd.to_datetime(df_archive['retweeted_status_timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

# Test 
df_archive.info(), df_clean_archive.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

(None, None)

In [286]:
# Remove <a> link tag in column source
df_clean_archive['source'].unique()

# Use regular expressions to use only the content of the html tag
df_clean_archive['source'] = df_clean_archive.source.str.extract(r'>(.*?)<')

# Test 
df_clean_archive['source'].unique()


array(['Twitter for iPhone', 'Twitter Web Client', 'Vine - Make a Scene',
       'TweetDeck'], dtype=object)

In [287]:
# Replace denominators different than 10. 
df_clean_archive.query('rating_denominator != 10')

# Set all rating denominators to 10. 
df_clean_archive['rating_denominator'] = 10

# Test 
df_clean_archive.query('rating_denominator != 10')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [288]:
# Remove non standard numerators
remove_id = df_clean_archive.query('rating_numerator > 20')

# Remove rows with rating_numerators higher than 20. 
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_id.index)]

# Test
df_clean_archive.query('rating_numerator > 20')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [289]:
# Remove tweets that are retweets 

# Verify possible values for column in_reply_to_status_id
df_clean_archive.retweeted_status_id.unique()

# Store retweets to be removed
remove_retweet = df_clean_archive.query('retweeted_status_id != "nan"')

# Remove retweets 
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_retweet.index)]

# Test 
df_clean_archive.query('retweeted_status_id != "nan"')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [290]:
# Remove records if expanded_urls column null

# Store missing expanded_urls
remove_miss_exp_url = df_clean_archive[df_clean_archive['expanded_urls'].isnull()]

# Remove missing expanded_urls
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_miss_exp_url.index)]

# Test
df_clean_archive[df_clean_archive['expanded_urls'].isnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


##### Tweets Dataframe

In [291]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              2331 non-null   int64
 1   retweet_count   2331 non-null   int64
 2   favorite_count  2331 non-null   int64
dtypes: int64(3)
memory usage: 54.8 KB


In [292]:
df_tweets.sample(5)

Unnamed: 0,id,retweet_count,favorite_count
2282,666826780179869698,88,242
1214,712097430750289920,1026,3776
1394,698355670425473025,449,1863
1506,690021994562220032,1031,2798
88,875021211251597312,4350,24072


In [293]:
df_tweets.describe()

Unnamed: 0,id,retweet_count,favorite_count
count,2331.0,2331.0,2331.0
mean,7.419079e+17,2715.41613,7592.785929
std,6.82317e+16,4592.831407,11780.997024
min,6.660209e+17,1.0,0.0
25%,6.78267e+17,548.5,1324.5
50%,7.182469e+17,1275.0,3305.0
75%,7.986692e+17,3154.0,9301.5
max,8.924206e+17,78188.0,156904.0


##### Images Dataframe

In [294]:
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [295]:
df_clean_images = df_images.copy()

In [296]:
df_clean_images.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

In [297]:
# Replace underscore with space and capitalize

df_clean_images['p1'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p1'] = df_clean_images.p1.str.capitalize()

df_clean_images['p2'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p2'] = df_clean_images.p1.str.capitalize()

df_clean_images['p3'] = df_clean_images.p1.str.replace('_', ' ')
df_clean_images['p3'] = df_clean_images.p1.str.capitalize()

df_clean_images.sample(5)



Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1356,760539183865880579,https://pbs.twimg.com/media/Co36VZfWcAEN3R3.jpg,1,Samoyed,0.988013,True,Samoyed,0.004518,True,Samoyed,0.001189,True
2045,886366144734445568,https://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg,1,French bulldog,0.999201,True,French bulldog,0.000361,True,French bulldog,7.6e-05,True
174,669015743032369152,https://pbs.twimg.com/media/CUjSRNCXAAQ6Y_8.jpg,1,Comic book,0.275927,False,Comic book,0.173516,False,Comic book,0.073911,False
1872,844979544864018432,https://pbs.twimg.com/media/C7n4aQ0VAAAohkL.jpg,3,Tennis ball,0.999281,False,Tennis ball,0.00037,False,Tennis ball,0.000132,True
1238,746818907684614144,https://pbs.twimg.com/media/Cl071YVWEAAlF7N.jpg,1,Dingo,0.175518,False,Dingo,0.133647,False,Dingo,0.101537,True


### Tidiness 
##### `Archive` dataframe
- Columns doggo, floofer, pupper and puppo have values in both column and rows. 
- Since there is no retweeted tweets, retweeted columns are useless. 
- In some cases, a dog might have two dog stages. 

##### `Tweets` dataframe

##### `Images` dataframe


#### Define 
    - Create new column dog_stage
    - Remove columns retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp.
    - Remove dogs with more than one dog stage. 

In [298]:
df_clean_archive.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1493,692752401762250755,,,2016-01-28 16:53:37+00:00,Twitter for iPhone,"""Hello yes could I get one pupper to go please...",,,NaT,https://twitter.com/dog_rates/status/692752401...,13,10,,,,pupper,
329,833479644947025920,,,2017-02-20 00:53:27+00:00,Twitter for iPhone,This is Poppy. She just arrived. 13/10 would s...,,,NaT,https://twitter.com/dog_rates/status/833479644...,13,10,Poppy,,,,
1928,674045139690631169,,,2015-12-08 01:57:39+00:00,Twitter for iPhone,Herd of wild dogs here. Not sure what they're ...,,,NaT,https://twitter.com/dog_rates/status/674045139...,3,10,,,,,
2304,666983947667116034,,,2015-11-18 14:18:59+00:00,Twitter for iPhone,This is a curly Ticonderoga named Pepe. No fee...,,,NaT,https://twitter.com/dog_rates/status/666983947...,11,10,a,,,,
1437,697242256848379904,,,2016-02-10 02:14:42+00:00,Twitter for iPhone,This is Oakley. He has a massive tumor growing...,,,NaT,https://twitter.com/dog_rates/status/697242256...,10,10,Oakley,,,,


In [299]:
# Remove columns related to retweets 

df_clean_archive.drop('retweeted_status_id', axis=1, inplace=True)
df_clean_archive.drop('retweeted_status_user_id', axis=1, inplace=True)
df_clean_archive.drop('retweeted_status_timestamp', axis=1, inplace=True)

# Test 

df_clean_archive.head()


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [349]:
# Verify dog stages values

print(df_clean_archive.doggo.unique())
print(df_clean_archive.floofer.unique())
print(df_clean_archive.pupper.unique())
print(df_clean_archive.puppo.unique())

['None' 'doggo']
['None' 'floofer']
['None' 'pupper']
['None' 'puppo']


In [375]:
df_clean_archive['doggo'].replace('None', '', inplace=True)
df_clean_archive['floofer'].replace('None', '', inplace=True)
df_clean_archive['pupper'].replace('None', '', inplace=True)
df_clean_archive['puppo'].replace('None', '', inplace=True)

# Verify dog stages values after replace

print(df_clean_archive.doggo.unique())
print(df_clean_archive.floofer.unique())
print(df_clean_archive.pupper.unique())
print(df_clean_archive.puppo.unique())

['' 'doggo']
['' 'floofer']
['' 'pupper']
['' 'puppo']


In [378]:
# Create new column dog_stages 

df_clean_archive['dog_stages'] = (df_clean_archive['doggo'] + 
                                  df_clean_archive['floofer'] +
                                  df_clean_archive['pupper'] +
                                  df_clean_archive['puppo'] 
                                 )

df_clean_archive.dog_stages.unique()

array(['', 'doggo', 'puppo', 'pupper', 'floofer', 'doggopuppo',
       'doggofloofer', 'doggopupper'], dtype=object)

In [379]:
# Drop columns doggo, puppo, pupper and floofer 

df_clean_archive.drop('doggo', axis=1, inplace=True)
df_clean_archive.drop('floofer', axis=1, inplace=True)
df_clean_archive.drop('pupper', axis=1, inplace=True)
df_clean_archive.drop('puppo', axis=1, inplace=True)

# Test

df_clean_archive.sample(3)


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,dog_stages
220,850019790995546112,,,2017-04-06 16:18:05+00:00,Twitter for iPhone,Say hello to Boomer. He's a sandy pupper. Havi...,https://twitter.com/dog_rates/status/850019790...,12,10,Boomer,pupper
191,855851453814013952,,,2017-04-22 18:31:02+00:00,Twitter for iPhone,Here's a puppo participating in the #ScienceMa...,https://twitter.com/dog_rates/status/855851453...,13,10,,doggopuppo
1637,684195085588783105,,,2016-01-05 02:09:54+00:00,Twitter for iPhone,This is Tino. He really likes corndogs. 9/10 h...,https://twitter.com/dog_rates/status/684195085...,9,10,Tino,


In [382]:
# Dogs with more than one dog stage

print('before removing', df_clean_archive.dog_stages.unique())

# Remove doggopuppo, doggofloofer, doggopupper

remove_dogstage = df_clean_archive.query('dog_stages == "doggopuppo" or dog_stages == "doggofloofer" or dog_stages == "doggopupper"')
df_clean_archive = df_clean_archive[~df_clean_archive.index.isin(remove_dogstage.index)]   

# Test 
print('after removing', df_clean_archive.dog_stages.unique())                                      

before removing ['' 'doggo' 'puppo' 'pupper' 'floofer' 'doggopuppo' 'doggofloofer'
 'doggopupper']
after removing ['' 'doggo' 'puppo' 'pupper' 'floofer']


In [384]:
df_clean_archive.sample(15)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,dog_stages
1633,684241637099323392,,,2016-01-05 05:14:53+00:00,Twitter for iPhone,This is Obi. He got camera shy. 12/10 https://...,https://twitter.com/dog_rates/status/684241637...,12,10,Obi,
129,867421006826221569,,,2017-05-24 16:44:18+00:00,Twitter for iPhone,This is Shikha. She just watched you drop a sk...,https://twitter.com/dog_rates/status/867421006...,12,10,Shikha,puppo
919,756526248105566208,,,2016-07-22 16:28:07+00:00,Twitter for iPhone,All hail sky doggo. 13/10 would jump super hig...,https://twitter.com/dog_rates/status/756526248...,13,10,,doggo
277,840370681858686976,,,2017-03-11 01:15:58+00:00,Twitter for iPhone,You have been visited by the magical sugar jar...,https://twitter.com/dog_rates/status/840370681...,13,10,,
977,750011400160841729,,,2016-07-04 17:00:26+00:00,TweetDeck,Meet Piper. She's an airport doggo. Please ret...,https://twitter.com/dog_rates/status/750011400...,11,10,Piper,doggo
1674,682259524040966145,,,2015-12-30 17:58:40+00:00,Twitter for iPhone,Meet Jax. He's an Iglesias Hufflepoof. Quite t...,https://twitter.com/dog_rates/status/682259524...,9,10,Jax,
856,764259802650378240,,,2016-08-13 00:38:30+00:00,Twitter for iPhone,This is Kota and her son Benedict. She doesn't...,https://twitter.com/dog_rates/status/764259802...,10,10,Kota,
441,819711362133872643,,,2017-01-13 01:03:12+00:00,Twitter for iPhone,This is Howie. He just bloomed. 11/10 revoluti...,https://twitter.com/dog_rates/status/819711362...,11,10,Howie,
734,781251288990355457,,,2016-09-28 21:56:36+00:00,Twitter for iPhone,This is Oakley. He just got yelled at for goin...,https://twitter.com/dog_rates/status/781251288...,11,10,Oakley,
1680,682003177596559360,,,2015-12-30 01:00:03+00:00,Twitter for iPhone,Unique dog here. Wrinkly as hell. Weird segmen...,https://twitter.com/dog_rates/status/682003177...,5,10,,
