# WeRateDogs Twitter Data Analysis

In [1]:
import pandas as pd
import tweepy
import numpy as np
import json
import os
import requests
import pandas.api.types as ptypes
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

## Gather Data

In this section we'll gather the data from multiple sources:
- WeRateDogs Twitter archive
- Tweet image predictions
- Retweet and favorite counts from Twitter API

### WeRateDogs Twitter Archive

The data will be loaded from the CSV file `twitter-archive-enhanced.csv`.

In [2]:
twitter_df = pd.read_csv('twitter-archive-enhanced.csv')
twitter_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### Tweet Image Predictions

The data will be retrieved from the following URL:  
https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv

In [3]:
if 'image_predictions.tsv' in os.listdir():
    print('"image_predictions.tsv" file already exists, retrieval will be skipped.')
else:
    response = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
    with open('image_predictions.tsv', 'wb') as file:
        file.write(response.content)

"image_predictions.tsv" file already exists, retrieval will be skipped.


In [4]:
image_pred_df = pd.read_csv('image_predictions.tsv', sep='\t')
image_pred_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### Get Retweet Counts and Like Counts from Tweepy

We'll use Tweepy to get the `retweet_count` and `favorite_count` of every tweet in `twitter_df`. Note that some of the tweets may no longer exists, and for these the `retweet_count` and `favorite_count` will not be available.

In [5]:
if 'tweet_json.txt' in os.listdir():
    print('"tweet_json.txt" file already exists, retrieval will be skipped.')
else:
    tweepy_auth_dir = 'auth/tweepy_auth.json'
    with open(tweepy_auth_dir, 'r') as file:
        tweepy_auth_json = json.load(file)

    key = tweepy_auth_json['key']
    secret = tweepy_auth_json['secret']
    auth = tweepy.OAuthHandler(key, secret)
    api = tweepy.API(auth)
    
    for tweet_id in twitter_df.tweet_id:
        try:
            status = api.get_status(tweet_id, tweet_mode='extended')
            with open('tweet_json.txt', 'a+') as out_file:
                json.dump(status._json, out_file)
                out_file.write('\n')
        except:
            pass

"tweet_json.txt" file already exists, retrieval will be skipped.


In [6]:
tweet_infos = []
with open('tweet_json.txt', 'r') as file:
    for line in file:
        tweet_json = json.loads(line)
        tweet_infos.append({'tweet_id': tweet_json['id_str'],
                           'retweet_count': tweet_json['retweet_count'],
                           'favorite_count': tweet_json['favorite_count']})
tweet_infos_df = pd.DataFrame(tweet_infos)
tweet_infos_df.to_csv('tweet_infos.csv', index=False)
tweet_infos_df.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,7417,35140
1,892177421306343426,5517,30411
2,891815181378084864,3640,22869
3,891689557279858688,7584,38400
4,891327558926688256,8166,36651


## Data Assessment and Cleaning

### Data Completeness - `twitter_df`

#### Assessment

In [7]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

As you can see above, the following columns have null values: 
- `in_reply_to_status_id`
- `in_reply_to_user_id`
- `retweeted_status_id`
- `retweeted_status_user_id`
- `retweeted_status_timestamp`
- `expanded_urls`

For the `in_reply...` and `retweeted_...` columns, they make sense to have null values, because not all statuses are replying or retweeting another status. As for `expanded_urls`, having null values in them is not an issue, because we most likely won't use this column. So in conclusion, no cleaning action to be done relating to missing data.

### Data Completeness - `tweet_infos_df`

#### Assessment

In [8]:
tweet_infos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043 entries, 0 to 2042
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2043 non-null   object
 1   retweet_count   2043 non-null   int64 
 2   favorite_count  2043 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 48.0+ KB


No missing value found, hence no cleaning needed.

### Data Completeness - `image_pred_df`
#### Assessment

In [9]:
image_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


No missing value found, hence no cleaning needed.

### Column Data Types - `twitter_df`

#### Assessment
Below we'll evaluate the data type of the columns in `twitter_df`.

In [10]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

Here are the data types issues found from the `info()` above:
- `tweet_id` is of type integer, while it should be string because it's an ID.
- `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` are of type float, while they should be string because they are IDs.
- `timestamp` and `retweeted_status_timestamp` are of type string, while they should be datetime.

#### Cleaning
Below we'll then convert the data types of the columns as described above.

In [11]:
twitter_df.tweet_id = twitter_df.tweet_id.astype(str)

# test
assert ptypes.is_string_dtype(twitter_df.tweet_id)

In [12]:
float_to_string_cols = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id']
twitter_df[float_to_string_cols] = twitter_df[float_to_string_cols].applymap(lambda x: f'{x:.0f}' if pd.notna(x) else None)

# test
for c in float_to_string_cols:
    assert ptypes.is_string_dtype(twitter_df[c])

In [13]:
str_to_datetime_cols = ['timestamp', 'retweeted_status_timestamp']
twitter_df[str_to_datetime_cols] = twitter_df[str_to_datetime_cols].applymap(pd.to_datetime)

# test
for c in str_to_datetime_cols:
    assert ptypes.is_datetime64_any_dtype(twitter_df[c])

#### Re-assessment
Below we'll reassess the new data types of the columns mentioned above, and evaluate their new values.

In [14]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     object             
 2   in_reply_to_user_id         78 non-null     object             
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    object             
 7   retweeted_status_user_id    181 non-null    object             
 8   retweeted_status_timestamp  181 non-null    datetime64[ns, UTC]
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

In [15]:
twitter_df.tweet_id.sample(5)

2001    672482722825261057
600     798673117451325440
449     819004803107983360
164     860184849394610176
808     771770456517009408
Name: tweet_id, dtype: object

In [16]:
twitter_df.in_reply_to_status_id[twitter_df.in_reply_to_status_id.notna()].sample(5)

2169    667806454573760512
1005    747648653817413632
857     763865174553964544
1663    682788441537560576
1598    686034024800862208
Name: in_reply_to_status_id, dtype: object

In [17]:
twitter_df.in_reply_to_user_id[twitter_df.in_reply_to_user_id.notna()].sample(5)

1605    4196983835
1914      16374678
186      279280991
313       26259576
55        47384430
Name: in_reply_to_user_id, dtype: object

In [18]:
twitter_df.retweeted_status_id[twitter_df.retweeted_status_id.notna()].sample(5)

742    780465709297995776
146    863062471531167744
506    680055455951884288
868    685325112850124800
720    674291837063053312
Name: retweeted_status_id, dtype: object

In [19]:
twitter_df.retweeted_status_user_id[twitter_df.retweeted_status_user_id.notna()].sample(5)

778    4196983835
411    4196983835
568      24885566
604    4196983835
230    4196983835
Name: retweeted_status_user_id, dtype: object

In [20]:
twitter_df.timestamp[twitter_df.timestamp.notna()].sample(5)

1802   2015-12-16 02:19:04+00:00
74     2017-06-23 18:17:33+00:00
1174   2016-04-13 01:22:10+00:00
1139   2016-05-05 00:16:48+00:00
928    2016-07-17 18:38:22+00:00
Name: timestamp, dtype: datetime64[ns, UTC]

In [21]:
twitter_df.retweeted_status_timestamp[twitter_df.retweeted_status_timestamp.notna()].sample(5)

860    2015-12-06 00:17:55+00:00
2260   2015-11-20 03:41:59+00:00
425    2016-10-01 19:47:08+00:00
677    2016-07-06 15:54:42+00:00
847    2016-04-29 00:21:01+00:00
Name: retweeted_status_timestamp, dtype: datetime64[ns, UTC]

By looking at the assessments above, we can confirm that now the data types are correct and the values seems reasonable.

You may notice that some of the IDs may have shorter length. This is normal, because based on [this documentation from Twitter](https://developer.twitter.com/en/docs/twitter-ids#:~:text=Today%2C%20Twitter%20IDs%20are%20unique,number%2C%20and%20a%20sequence%20number.), they represent ID as 64 bits integer number (but recommended to be stored as string to avoid losing accuracy in systems with lower bits integer representations). Hence, it is expected for the ID to have different lengths. No zero padding is required.

There will be another data type conversion for the dog state columns `doggo`, `floofer`, `pupper`, and `puppo` in latter section as well, where we'll convert the columns into boolean type.

### Column Data Types - `tweet_infos_df`

#### Assessment

In [22]:
tweet_infos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043 entries, 0 to 2042
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2043 non-null   object
 1   retweet_count   2043 non-null   int64 
 2   favorite_count  2043 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 48.0+ KB


The data type of the columns above seems to be correct, hence no cleaning needed.

### Column Data Types - `image_pred_df`

#### Assessment

In [23]:
image_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


Data type issues found:
- `tweet_id` is integer, while it should be string since it is an ID.

#### Cleaning
Below we'll convert the `tweet_id` column to be string.

In [24]:
image_pred_df.tweet_id = image_pred_df.tweet_id.astype(str)

# test
assert ptypes.is_string_dtype(image_pred_df.tweet_id)

#### Re-assessment

In [25]:
image_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


In [26]:
image_pred_df.tweet_id.sample(5)

1200    740995100998766593
655     682242692827447297
1786    829449946868879360
569     678389028614488064
876     698262614669991936
Name: tweet_id, dtype: object

The above assessments show that the data type has been converted correctly.

### Data Tidiness - `tweet_infos_df` separated from `twitter_df`

#### Assessment
An obvious data tidiness issue is the `tweet_infos_df` being separated from the `twitter_df`. Both of them should be combined together, because the columns `retweet_count` and `favorite_count` should belong to `twitter_df`.

#### Cleaning
Below we'll merge the `tweet_infos_df` into `twitter_df`.

In [27]:
original_shape = twitter_df.shape # for testing

# clean
twitter_df = twitter_df.merge(tweet_infos_df, left_on='tweet_id', right_on='tweet_id', how='left')

# test
assert {'retweet_count', 'favorite_count'}.issubset(twitter_df.columns)
assert twitter_df.shape == (original_shape[0], original_shape[1]+2)

#### Re-assessment

In [28]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     object             
 2   in_reply_to_user_id         78 non-null     object             
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    object             
 7   retweeted_status_user_id    181 non-null    object             
 8   retweeted_status_timestamp  181 non-null    datetime64[ns, UTC]
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

In [29]:
twitter_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
0,892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,NaT,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,7417.0,35140.0
1,892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,NaT,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,,5517.0,30411.0
2,891815181378084864,,,2017-07-31 00:18:03+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,NaT,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,,3640.0,22869.0
3,891689557279858688,,,2017-07-30 15:58:51+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,NaT,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,,7584.0,38400.0
4,891327558926688256,,,2017-07-29 16:00:24+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,NaT,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,,8166.0,36651.0


The above assessment confirms that the merge is successful. Note that you may notice the data type of `retweet_count` and `favorite_count` is converted to float after the merge operation. This is a well-known problem in pandas library, and it is because the column contains NaN values. NaN values cannot be represented by integer, thus this is the reason the data type is converted to float after the merge operation. This is not an issue as the float type still allows numeric operation on those columns, while allowing NaN values in those columns as well. Hence, no cleaning needs to be done for this.

### Data Quality - Dog Stage Columns "None" String Values
#### Assessment
Let's now assess the `doggo`, `floofer`, `pupper`, and `puppo` columns in `twitter_df`, which represent the various dog "stages".

In [30]:
print(twitter_df.doggo.unique())
print(twitter_df.floofer.unique())
print(twitter_df.pupper.unique())
print(twitter_df.puppo.unique())

['None' 'doggo']
['None' 'floofer']
['None' 'pupper']
['None' 'puppo']


As you can see, there are string "None" values in those columns, which are invalid and misleading. This can make it hard for programmatic analysis later on.

#### Cleaning
We'll replace the string "None" values to be actual `NaN` values to ease the analysis.

In [31]:
dog_stages_cols = ['doggo', 'floofer', 'pupper', 'puppo']
twitter_df[dog_stages_cols] = twitter_df[dog_stages_cols].replace('None', np.nan)

# test - ensure that there is no more 'None' string values
assert (twitter_df[dog_stages_cols] == 'None').sum().sum() == 0

#### Re-assessment

It is confirmed below that the "None" string values have been replaced with `NaN` values, shown by the `nan` in the list of unique values.

In [32]:
print(twitter_df.doggo.unique())
print(twitter_df.floofer.unique())
print(twitter_df.pupper.unique())
print(twitter_df.puppo.unique())

[nan 'doggo']
[nan 'floofer']
[nan 'pupper']
[nan 'puppo']


Below we'll reassess the actual number of null values for these dog stages columns.

In [33]:
twitter_df[['doggo', 'floofer', 'pupper', 'puppo']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   doggo    97 non-null     object
 1   floofer  10 non-null     object
 2   pupper   257 non-null    object
 3   puppo    30 non-null     object
dtypes: object(4)
memory usage: 92.0+ KB


As you can see now apparently there are a lot of null values for the dog stages columns. However, as we'll see in the next section, these columns actually only represents the existence of those dog stages word in the tweet status. This means it makes sense to have a lot of null values, because not all tweets will mention those dog stages words. Hence no cleaning action will be done regarding these missing dog stages column values.

### Column Data Type - Dog Stages Columns in `twitter_df`

#### Assessment
As described in previous section, apparently the dog stages columns in `twitter_df` actually represents existence of the dog stage word in the tweet status. This column currently is weirdly represented using string, and the representation of the values are also weird. For example, existence of "doggo" word in the tweet status is represented by the column `doggo` having string value of "doggo", which is very redundant. I would expect their data types to be boolean instead, with value `True` if the corresponding dog stage appears in the tweet status.

#### Cleaning
Convert the data type and values of `doggo`, `floofer`, `pupper`, `puppo` in `twitter_df` into boolean. For the values, give value `True` if the value is non-null, and `False` if the value is null. This is safe to be done since in the above section, we've checked the unique values of each dog stage column to only contain either NaN or the dog stage name itself. There won't be any case where a dog stage column contains string value that represents the other dog stage name. For clarity, below we'll reproduce the unique values again.

In [34]:
dog_stages_cols = ['doggo', 'floofer', 'pupper', 'puppo']
for c in dog_stages_cols:
    print(f'{c} unique values: {twitter_df[c].unique()}')

doggo unique values: [nan 'doggo']
floofer unique values: [nan 'floofer']
pupper unique values: [nan 'pupper']
puppo unique values: [nan 'puppo']


Below then we'll do the conversion to boolean.

In [35]:
twitter_df[dog_stages_cols] = twitter_df[dog_stages_cols].applymap(lambda x: False if pd.isna(x) else True)

# test
for c in dog_stages_cols:
    assert ptypes.is_bool_dtype(twitter_df[c])
assert twitter_df.doggo.sum() == 97
assert twitter_df.floofer.sum() == 10
assert twitter_df.pupper.sum() == 257
assert twitter_df.puppo.sum() == 30

#### Re-assessment
Below we'll check the data type of the dog stages columns.

In [36]:
twitter_df[dog_stages_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   doggo    2356 non-null   bool 
 1   floofer  2356 non-null   bool 
 2   pupper   2356 non-null   bool 
 3   puppo    2356 non-null   bool 
dtypes: bool(4)
memory usage: 27.6 KB


We'll also check the number of True values in those columns, which we expect to be the same as the previous non-null values count of those columns (because after our transformation, null should equal False and non-null should equal True).

In [37]:
twitter_df[dog_stages_cols].sum()

doggo       97
floofer     10
pupper     257
puppo       30
dtype: int64

As can be seen, the number of True values matches the number of non-null values of the columns before the transformation (refer to the values presented in previous section).

### Data Accuracy - Multiple Dog Stages in a Row (while actually there is only one dog)

#### Assessment
Next, let's see whether is it possible for a row to have multiple dog stages.

In [38]:
dog_stages_count = twitter_df[['doggo', 'floofer', 'pupper', 'puppo']].sum(axis=1)
print(f'Possible number of dog stages in a row: {dog_stages_count.unique()}')

Possible number of dog stages in a row: [0 1 2]


There are indeed suprisingly some rows that have 2 dog stages. Since this dog stage information is derived from the corresponding tweet status text, let's now see the corresponding statuses of these rows.

In [39]:
pd.options.display.max_colwidth = 150 # so we can see the whole status text
multi_dog_stages_rows = twitter_df[dog_stages_count >= 2]
statuses = multi_dog_stages_rows.text.values
for s in statuses:
    print('- ' + s)

- Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel
- At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk
- This is Dido. She's playing the lead role in "Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple." 13/10 (IG: didodoggo) https://t.co/m7isZrOBX7
- Here we have Burke (pupper) and Dexter (doggo). Pupper wants to be exactly like doggo. Both 12/10 would pet at same time https://t.co/ANBpEYHaho
- Like doggo, like pupper version 2. Both 11/10 https://t.co/9IxWAXFqze
- This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj
- This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiou

Evaluating the list of twitter statuses above (and also actually opening the pictures of the tweets), most of them have multiple dog stages because the status and the picture itself are involving more than one dogs, which totally makes sense. However, some rows actually only have one dog, but the tweet status mentions multiple dog stages in the text. For these cases, the multiple dog stages will indeed be misleading. Since the number is not much, we will fix them one by one.

#### Cleaning
From visual observation of the above statuses, below we lists down the statuses that actually only have one dog (with misleadingly multiple dog stages in the text).

In [40]:
idxs_invalid_multi_dog_stages = multi_dog_stages_rows.index[[0, 1, 2, 5, 6, 11]]
for s in twitter_df.iloc[idxs_invalid_multi_dog_stages].text.values:
    print('- ' + s)

- Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel
- At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk
- This is Dido. She's playing the lead role in "Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple." 13/10 (IG: didodoggo) https://t.co/m7isZrOBX7
- This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj
- This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiously https://t.co/f2wmLZTPHd
- Please stop sending it pictures that don't even have a doggo or pupper in them. Churlish af. 5/10 neat couch tho https://t.co/u2c9c7qSg8


Below I list down one by one the correct stage, derived from each of the text above, and then immediately do the correction to the dog stage columns values.

In [41]:
correct_stages = ['puppo', 'floofer', 'pupper', 'pupper', 'doggo', 'doggo']
dog_stages_columns = np.array(['doggo', 'floofer', 'pupper', 'puppo'])

for row, col in zip(idxs_invalid_multi_dog_stages, correct_stages):
    cols_to_false = dog_stages_columns[dog_stages_columns != col]
    twitter_df.loc[row, cols_to_false] = False

# test
assert (twitter_df.loc[idxs_invalid_multi_dog_stages, dog_stages_columns].sum(axis=1).unique() == 1).all()

#### Re-assessment
Below we'll check again whether the rows with invalid-multi-dog-stages are now having only one dog stage per row.

In [42]:
twitter_df.loc[idxs_invalid_multi_dog_stages, np.append('text', dog_stages_columns)]

Unnamed: 0,text,doggo,floofer,pupper,puppo
191,Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.c...,False,False,False,True
200,"At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https...",False,True,False,False
460,"This is Dido. She's playing the lead role in ""Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple."" 13/10 (IG: didodoggo) https...",False,False,True,False
575,This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj,False,False,True,False
705,This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiously https...,True,False,False,False
956,Please stop sending it pictures that don't even have a doggo or pupper in them. Churlish af. 5/10 neat couch tho https://t.co/u2c9c7qSg8,True,False,False,False


The above prints shows that those rows no longer have invalid multiple dog stages.

### Data Tidiness - Change Index of `twitter_df` and `image_pred_df` to `tweet_id`

#### Assessment
As seen below, the number of unique values for the `tweet_id` are the same as the total number of rows for both `twitter_df` and `image_pred_df`. This means we can set their indexes to be `tweet_id`, which makes more sense and will ease analysis.

In [43]:
print(f'Number of unique ID ({len(twitter_df.tweet_id.unique())}) '
      + f'is same as the number of rows of twitter_df ({twitter_df.shape[0]}).')

Number of unique ID (2356) is same as the number of rows of twitter_df (2356).


In [44]:
print(f'Number of unique ID ({len(image_pred_df.tweet_id.unique())}) '
      + f'is same as the number of rows of twitter_df ({image_pred_df.shape[0]}).')

Number of unique ID (2075) is same as the number of rows of twitter_df (2075).


#### Cleaning
Below we'll convert the indexes of `twitter_df` and `imaged_pred_df` into `tweet_id`.

In [45]:
twitter_df.set_index('tweet_id', inplace=True)
image_pred_df.set_index('tweet_id', inplace=True)

# test
assert twitter_df.index.name == 'tweet_id'
assert image_pred_df.index.name == 'tweet_id'

#### Re-assessment
Below we'll see that now their indexes are set to be `tweet_id`.

In [46]:
twitter_df.head(2)

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,NaT,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,False,False,False,False,7417.0,35140.0
892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https:/...",,,NaT,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,False,False,False,False,5517.0,30411.0


In [47]:
image_pred_df.head(2)

Unnamed: 0_level_0,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True


In [48]:
idxs = twitter_df.in_reply_to_status_id.notna()
twitter_df[idxs]

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
886267009285017600,886266357075128320,2281181600,2017-07-15 16:51:35+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@NonWhiteHat @MayhewMayhem omg hello tanner you are a scary good boy 12/10 would pet with extreme caution,,,NaT,,12,10,,False,False,False,False,3.0,108.0
881633300179243008,881607037314052096,47384430,2017-07-02 21:58:53+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@roushfenway These are good dogs but 17/10 is an emotional impulse rating. More like 13/10s,,,NaT,,17,10,,False,False,False,False,7.0,115.0
879674319642796034,879553827334172672,3105440746,2017-06-27 12:14:36+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@RealKentMurphy 14/10 confirmed,,,NaT,,14,10,,False,False,False,False,10.0,290.0
870726314365509632,870726202742493184,16487760,2017-06-02 19:38:25+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is reserved for dogs,,,NaT,,10,10,,False,False,False,False,3.0,112.0
863427515083354112,863425645568774144,77596200,2017-05-13 16:15:35+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","@Jack_Septic_Eye I'd need a few more pics to polish a full analysis, but based on the good boy content above I'm leaning towards 12/10",,,NaT,,12,10,,False,False,False,False,85.0,2041.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671550332464455680,671544874165002240,4196983835,2015-12-01 04:44:10+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",After 22 minutes of careful deliberation this dog is being demoted to a 1/10. The longer you look at him the more terrifying he becomes,,,NaT,,1,10,,False,False,False,False,194.0,844.0
669684865554620416,669354382627049472,4196983835,2015-11-26 01:11:28+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",After countless hours of research and hundreds of formula alterations we have concluded that Dug should be bumped to an 11/10,,,NaT,,11,10,,False,False,False,False,82.0,475.0
669353438988365824,667806454573760512,4196983835,2015-11-25 03:14:30+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Tessa. She is also very pleased after finally meeting her biological father. 10/10 https://t.co/qDS1aCqppv,,,NaT,https://twitter.com/dog_rates/status/669353438988365824/photo/1,10,10,Tessa,False,False,False,False,240.0,584.0
668967877119254528,668920717132582912,21435658,2015-11-24 01:42:25+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",12/10 good shit Bubka\n@wane15,,,NaT,,12,10,,False,False,False,False,21.0,135.0


### Data Accuracy - `rating_numerator` and `rating_denominator` in `twitter_df`

#### Assessment
Let's assess the `describe()` below.

In [49]:
twitter_df.describe()

Unnamed: 0,rating_numerator,rating_denominator,retweet_count,favorite_count
count,2356.0,2356.0,2043.0,2043.0
mean,13.126486,10.455433,2767.898189,7830.767988
std,45.876648,6.745237,4593.251203,11943.897292
min,0.0,0.0,1.0,0.0
25%,10.0,10.0,521.0,1038.0
50%,11.0,10.0,1404.0,3574.0
75%,12.0,10.0,3277.0,10130.5
max,1776.0,170.0,74834.0,151127.0


It is weird to see that the `rating_numerator` hs minimum value of 0 and maximum value of 1776. Same goes for `rating_denominator`, with minimum value of 0 and maximum value of 170. The reason this is weird for me is because it is described by the [WeRateDogs wikipedia page](https://en.wikipedia.org/wiki/WeRateDogs) that the rating numerator is in general higher than 10, and the denominator is mostly 10. Below we'll investigate further for the rows that have these weird numerator and denominator values.

In [50]:
idxs = twitter_df.rating_numerator < 10
twitter_df.loc[idxs, ['text', 'rating_numerator', 'rating_denominator']].head()

Unnamed: 0_level_0,text,rating_numerator,rating_denominator
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
883482846933004288,"This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948",5,10
848212111729840128,"This is Jerry. He's doing a distinguished tongue slip. Slightly patronizing tbh. You think you're better than us, Jerry? 6/10 hold me back https:/...",6,10
835152434251116546,When you're so blinded by your systematic plagiarism that you forget what day it is. 0/10 https://t.co/YbEJPkg4Ag,0,10
826598799820865537,"I was going to do 007/10, but the joke wasn't worth the &lt;10 rating",7,10
817502432452313088,RT @dog_rates: Meet Herschel. He's slightly bigger than ur average pupper. Looks lonely. Could probably ride 7/10 would totally pet https:/…,7,10


As you can see above, one of the rows for ID "883482846933004288" has rating that is actually decimal 13.5/10. However, the numerator is detected as 5. It seems that the original rating detection didn't take into account possibility of the rating having decimal point.

#### Cleaning
Hence below I will re-extract the rating from the text, taking into account both the possibilities of:
- the rating having decimal points
- having multiple ratings within a single tweet

I will now store the rating in a separate dataframe than `twitter_df`. This is also because there can be multiple ratings per tweet, hence this will need a separate dataframe to be able to store the data correctly. The new dataframe name will be `dog_ratings_df`

In [51]:
dog_ratings_df = twitter_df.text.str.extractall('(((\d+\.)?\d+)/((\d+\.)?\d+))')[[1,3]]
dog_ratings_df.columns = ['numerator', 'denominator']

Here we check whether the detection of rating with decimal point is successful.

In [52]:
idx_with_dec = dog_ratings_df.numerator.str.contains('\.')
ratings_with_dec = dog_ratings_df[idx_with_dec]
ids_with_dec = ratings_with_dec.index.get_level_values('tweet_id')
ratings_with_dec.join(twitter_df.loc[ids_with_dec].text) # to show together with the tweet status text

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator,text
tweet_id,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
883482846933004288,0,13.5,10,"This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948"
832215909146226688,0,9.75,10,"RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…"
786709082849828864,0,9.75,10,"This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS"
778027034220126208,0,11.27,10,This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://...
681340665377193984,0,9.5,10,I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace
680494726643068929,0,11.26,10,Here we have uncovered an entire battalion of holiday puppers. Average of 11.26/10 https://t.co/eNm2S6p9BD


You can see that the detection of rating seems to be effective for decimal numerator values. Since we've ensured that decimal detection was successful, we can now convert the data type of the rating values to float.

In [53]:
dog_ratings_df = dog_ratings_df.astype(float)
dog_ratings_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator
tweet_id,match,Unnamed: 2_level_1,Unnamed: 3_level_1
892420643555336193,0,13.0,10.0
892177421306343426,0,13.0,10.0
891815181378084864,0,12.0,10.0
891689557279858688,0,13.0,10.0
891327558926688256,0,12.0,10.0


In [54]:
dog_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2390 entries, ('892420643555336193', 0) to ('666020888022790149', 0)
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   numerator    2390 non-null   float64
 1   denominator  2390 non-null   float64
dtypes: float64(2)
memory usage: 143.0+ KB


Now we'll evaluate the detected denominator values first. At least the denominator values expectation is clearer, where we'd like for the denominator to be exactly 10. Any deviation from that should be investigated further. Below we'll then show the tweets that have denominator being not equal to 10, with their corresponding tweet status.

In [55]:
dog_ratings_df[dog_ratings_df.denominator != 10].join(twitter_df.text)

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator,text
tweet_id,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
835246439529840640,0,960.0,0.0,"@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho"
832088576586297345,0,11.0,15.0,@docmisterio account started on 11/15/15
820690176645140481,0,84.0,70.0,The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd
810984652412424192,0,24.0,7.0,Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t ...
775096608509886464,0,9.0,11.0,"RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/…"
758467244762497024,0,165.0,150.0,Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
740373189193256964,0,9.0,11.0,"After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ"
731156023742988288,0,204.0,170.0,Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
722974582966214656,0,4.0,20.0,Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a
716439118184652801,0,50.0,50.0,This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq


As can be seen above, some of them are mistakenly extracted from common phrases such as "7/11" and "24/7", while the others are actually valid rating. However, since this will affect the fairness of rating comparison, we'll simply drop all ratings whose denominator is not 10.

In [56]:
dog_ratings_df = dog_ratings_df[dog_ratings_df.denominator == 10]
dog_ratings_df.denominator.unique()

array([10.])

Now we're sure that the denominator are legit 10. Next, we'll evaluate the numerator values. We'll display the unique values of the numerator, sorted from lowest to highest.

In [57]:
unique_nums = dog_ratings_df.numerator.unique()
unique_nums.sort()
unique_nums.astype(str)

array(['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0',
       '9.0', '9.5', '9.75', '10.0', '11.0', '11.26', '11.27', '12.0',
       '13.0', '13.5', '14.0', '15.0', '17.0', '182.0', '420.0', '666.0',
       '1776.0'], dtype='<U32')

There seems to be some oddly high numerator values as seen above, i.e. for the values greater than 17. Also there are values that are lower than 10, which is unusual since the site says the numerator should be >= 10.

For now, let's evaluate the tweets that has rating numerator > 17.

In [58]:
ratings_num_big = dog_ratings_df[dog_ratings_df.numerator > 17]
ratings_num_big.join(twitter_df.text)

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator,text
tweet_id,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
855862651834028034,0,420.0,10.0,@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research
855860136149123072,0,666.0,10.0,"@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10"
838150277551247360,0,182.0,10.0,@markhoppus 182/10
749981277374128128,0,1776.0,10.0,This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh
670842764863651840,0,420.0,10.0,After so many requests... here you go.\n\nGood dogg. 420/10 https://t.co/yfAAo1gdeY


Again for fair comparison in the rating, we'll remove any numerator ratings that are greater than 17.

In [59]:
dog_ratings_df = dog_ratings_df[dog_ratings_df.numerator <= 17]
dog_ratings_df.numerator.describe()

count    2362.000000
mean       10.687989
std         2.201211
min         0.000000
25%        10.000000
50%        11.000000
75%        12.000000
max        17.000000
Name: numerator, dtype: float64

Now that the oddly high values of the numerator are settled, we'll now take care of the less than 10 numerator values.

In [60]:
dog_ratings_df.query('numerator < 10')

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator
tweet_id,match,Unnamed: 2_level_1,Unnamed: 3_level_1
848212111729840128,0,6.00,10.0
835152434251116546,0,0.00,10.0
832215909146226688,0,9.75,10.0
826598799820865537,0,7.00,10.0
817502432452313088,0,7.00,10.0
...,...,...,...
666049248165822465,0,5.00,10.0
666044226329800704,0,6.00,10.0
666033412701032449,0,9.00,10.0
666029285002620928,0,7.00,10.0


There appears to be quite a lot, and we assumed that those people that gave numerator < 10 does not know the rule that the numerator should be greater than 10. In this case then we'll assume that people who gave 0 is equivalent to giving rating of 10. Hence, we'll offset the numerator values by 10 for the rows that have their numerators < 10.

In [61]:
dog_ratings_df.loc[(dog_ratings_df.numerator < 10), 'numerator'] += 10

# test
assert dog_ratings_df.numerator.min() >= 10

Below we show that now the numerator ranges from 10 to 19.75, which are very reasonable values.

In [62]:
dog_ratings_df.numerator.describe()

count    2362.000000
mean       12.597388
std         2.622902
min        10.000000
25%        11.000000
50%        12.000000
75%        13.000000
max        19.750000
Name: numerator, dtype: float64

Last but not least, we'll rename the `match` index in `dog_ratings_df` to be `rating_no` to represent the index of multiple ratings within a single tweet status.

In [63]:
dog_ratings_df.index.rename(['tweet_id', 'rating_no'], inplace=True)

# test
assert dog_ratings_df.index.names[0] == 'tweet_id'
assert dog_ratings_df.index.names[1] == 'rating_no'

print(f'Index names: {dog_ratings_df.index.names}')

Index names: ['tweet_id', 'rating_no']


As our last step, we'll drop the rating columns from `twitter_df`.

In [64]:
twitter_df.drop(columns=['rating_numerator', 'rating_denominator'], inplace=True)

# test
assert not twitter_df.columns.isin(['rating_numerator', 'rating_denominator']).any()

#### Re-assessment

In [65]:
dog_ratings_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator
tweet_id,rating_no,Unnamed: 2_level_1,Unnamed: 3_level_1
892420643555336193,0,13.0,10.0
892177421306343426,0,13.0,10.0


In [66]:
dog_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2362 entries, ('892420643555336193', 0) to ('666020888022790149', 0)
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   numerator    2362 non-null   float64
 1   denominator  2362 non-null   float64
dtypes: float64(2)
memory usage: 142.4+ KB


In [67]:
dog_ratings_df.describe()

Unnamed: 0,numerator,denominator
count,2362.0,2362.0
mean,12.597388,10.0
std,2.622902,0.0
min,10.0,10.0
25%,11.0,10.0
50%,12.0,10.0
75%,13.0,10.0
max,19.75,10.0


The assessments above shows that `dog_ratings_df` looks reasonable.

Below we'll also see that the rating columns are no longer in `twitter_df`.

In [68]:
twitter_df.head(2)

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,NaT,https://twitter.com/dog_rates/status/892420643555336193/photo/1,Phineas,False,False,False,False,7417.0,35140.0
892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https:/...",,,NaT,https://twitter.com/dog_rates/status/892177421306343426/photo/1,Tilly,False,False,False,False,5517.0,30411.0


### Data Tidiness - Dog State and Dog Name Columns should not be in `twitter_df`

#### Assessment
The dog state columns `doggo`, `floofer`, `pupper`, and `puppo`, and the `name` column does not seem to belong in the `twitter_df`. They need their own table. We'll separate them into a new dataframe named `tweet_dog_info_df`.

#### Cleaning
Below we'll separate the columns `doggo`, `floofer`, `pupper`, `puppo`, and `name` into a new dataframe named `tweet_dog_info_df`.

In [69]:
dog_info_cols = ['name', 'doggo', 'floofer', 'pupper', 'puppo']
tweet_dog_info_df = twitter_df[dog_info_cols].copy()

We'll then drop the columns from `twitter_df`.

In [70]:
twitter_df.drop(columns=dog_info_cols, inplace=True)

# test
assert not twitter_df.columns.isin(dog_info_cols).any()

#### Re-assessment
Below we'll evaluate again the resulting dataframes.

In [71]:
tweet_dog_info_df.head(2)

Unnamed: 0_level_0,name,doggo,floofer,pupper,puppo
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892420643555336193,Phineas,False,False,False,False
892177421306343426,Tilly,False,False,False,False


In [72]:
tweet_dog_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2356 entries, 892420643555336193 to 666020888022790149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2356 non-null   object
 1   doggo    2356 non-null   bool  
 2   floofer  2356 non-null   bool  
 3   pupper   2356 non-null   bool  
 4   puppo    2356 non-null   bool  
dtypes: bool(4), object(1)
memory usage: 126.0+ KB


In [73]:
twitter_df.head(2)

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,retweet_count,favorite_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,NaT,https://twitter.com/dog_rates/status/892420643555336193/photo/1,7417.0,35140.0
892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https:/...",,,NaT,https://twitter.com/dog_rates/status/892177421306343426/photo/1,5517.0,30411.0


In [74]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2356 entries, 892420643555336193 to 666020888022790149
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   in_reply_to_status_id       78 non-null     object             
 1   in_reply_to_user_id         78 non-null     object             
 2   timestamp                   2356 non-null   datetime64[ns, UTC]
 3   source                      2356 non-null   object             
 4   text                        2356 non-null   object             
 5   retweeted_status_id         181 non-null    object             
 6   retweeted_status_user_id    181 non-null    object             
 7   retweeted_status_timestamp  181 non-null    datetime64[ns, UTC]
 8   expanded_urls               2297 non-null   object             
 9   retweet_count               2043 non-null   float64            
 10  favorite_count              2043 n

It is the concluded from assessments above that the structure of `twitter_df` is now tidy as it serves for a single function: to contain technical information for each tweet. `tweet_dog_info_df` then contains dog related information found from the tweet's status.

### Data Validity - Dog Name in `tweet_dog_info_df` Containing Invalid Names
#### Assessment

In [75]:
names = tweet_dog_info_df.name.unique()
names.sort()
names

array(['Abby', 'Ace', 'Acro', 'Adele', 'Aiden', 'Aja', 'Akumi', 'Al',
       'Albert', 'Albus', 'Aldrick', 'Alejandro', 'Alexander',
       'Alexanderson', 'Alf', 'Alfie', 'Alfy', 'Alice', 'Amber',
       'Ambrose', 'Amy', 'Amélie', 'Anakin', 'Andru', 'Andy', 'Angel',
       'Anna', 'Anthony', 'Antony', 'Apollo', 'Aqua', 'Archie', 'Arlen',
       'Arlo', 'Arnie', 'Arnold', 'Arya', 'Ash', 'Asher', 'Ashleigh',
       'Aspen', 'Astrid', 'Atlas', 'Atticus', 'Aubie', 'Augie', 'Autumn',
       'Ava', 'Axel', 'Bailey', 'Baloo', 'Balto', 'Banditt', 'Banjo',
       'Barclay', 'Barney', 'Baron', 'Barry', 'Batdog', 'Bauer', 'Baxter',
       'Bayley', 'BeBe', 'Bear', 'Beau', 'Beckham', 'Beebop', 'Beemo',
       'Bell', 'Bella', 'Belle', 'Ben', 'Benedict', 'Benji', 'Benny',
       'Bentley', 'Berb', 'Berkeley', 'Bernie', 'Bert', 'Bertson',
       'Betty', 'Beya', 'Biden', 'Bilbo', 'Billl', 'Billy', 'Binky',
       'Birf', 'Bisquick', 'Blakely', 'Blanket', 'Blipson', 'Blitz',
       'Bloo', 'Bloop',

From the above list of names, we can find several invalid names:
- 'None'
- 'a'
- 'actually'
- 'all'
- 'an'
- 'by'
- 'getting'
- 'his'
- 'incredibly'
- 'infuriating'
- 'just'
- 'life'
- 'light'
- 'mad'
- 'my'
- 'not'
- 'officially'
- 'old'
- 'one'
- 'quite'
- 'space'
- 'such'
- 'the'
- 'this'
- 'unacceptable'
- 'very'

Mostly the invalid names are the "None" and all the names that starts with lower case.

#### Cleaning
We'll now change the names that are "None" or starting with lower case to be `NaN`.

In [76]:
lower_case_names = tweet_dog_info_df.name.str.contains('^[a-z]')
tweet_dog_info_df.loc[lower_case_names, 'name'] = None

# test
assert tweet_dog_info_df.name.str.contains('^[a-z]').sum() == 0

In [77]:
idxs_none = tweet_dog_info_df.name == 'None'
tweet_dog_info_df.loc[idxs_none, 'name'] = None

# test
assert (tweet_dog_info_df.name == 'None').sum() == 0

#### Re-assessment

In [78]:
names = tweet_dog_info_df.name.astype(str).unique()
names.sort()
names

array(['Abby', 'Ace', 'Acro', 'Adele', 'Aiden', 'Aja', 'Akumi', 'Al',
       'Albert', 'Albus', 'Aldrick', 'Alejandro', 'Alexander',
       'Alexanderson', 'Alf', 'Alfie', 'Alfy', 'Alice', 'Amber',
       'Ambrose', 'Amy', 'Amélie', 'Anakin', 'Andru', 'Andy', 'Angel',
       'Anna', 'Anthony', 'Antony', 'Apollo', 'Aqua', 'Archie', 'Arlen',
       'Arlo', 'Arnie', 'Arnold', 'Arya', 'Ash', 'Asher', 'Ashleigh',
       'Aspen', 'Astrid', 'Atlas', 'Atticus', 'Aubie', 'Augie', 'Autumn',
       'Ava', 'Axel', 'Bailey', 'Baloo', 'Balto', 'Banditt', 'Banjo',
       'Barclay', 'Barney', 'Baron', 'Barry', 'Batdog', 'Bauer', 'Baxter',
       'Bayley', 'BeBe', 'Bear', 'Beau', 'Beckham', 'Beebop', 'Beemo',
       'Bell', 'Bella', 'Belle', 'Ben', 'Benedict', 'Benji', 'Benny',
       'Bentley', 'Berb', 'Berkeley', 'Bernie', 'Bert', 'Bertson',
       'Betty', 'Beya', 'Biden', 'Bilbo', 'Billl', 'Billy', 'Binky',
       'Birf', 'Bisquick', 'Blakely', 'Blanket', 'Blipson', 'Blitz',
       'Bloo', 'Bloop',

As can be seen from the names above that all the invalid names that we observed before are now removed (replaced with `NaN`).

# Conclusion
Final dataframes:
- `twitter_df`: only contains tweets technical informations.
- `dog_ratings_df`: only contains dog ratings found in each tweet. 
- `image_pred_df`: only contains dog breed classifications of the image in each tweet.
- `tweet_dog_info_df`: only contains the dog state and dog name detected in each tweet.

In [79]:
twitter_df.head()

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,retweet_count,favorite_count
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,NaT,https://twitter.com/dog_rates/status/892420643555336193/photo/1,7417.0,35140.0
892177421306343426,,,2017-08-01 00:17:27+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https:/...",,,NaT,https://twitter.com/dog_rates/status/892177421306343426/photo/1,5517.0,30411.0
891815181378084864,,,2017-07-31 00:18:03+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,NaT,https://twitter.com/dog_rates/status/891815181378084864/photo/1,3640.0,22869.0
891689557279858688,,,2017-07-30 15:58:51+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,NaT,https://twitter.com/dog_rates/status/891689557279858688/photo/1,7584.0,38400.0
891327558926688256,,,2017-07-29 16:00:24+00:00,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https:/...",,,NaT,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",8166.0,36651.0


In [80]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2356 entries, 892420643555336193 to 666020888022790149
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   in_reply_to_status_id       78 non-null     object             
 1   in_reply_to_user_id         78 non-null     object             
 2   timestamp                   2356 non-null   datetime64[ns, UTC]
 3   source                      2356 non-null   object             
 4   text                        2356 non-null   object             
 5   retweeted_status_id         181 non-null    object             
 6   retweeted_status_user_id    181 non-null    object             
 7   retweeted_status_timestamp  181 non-null    datetime64[ns, UTC]
 8   expanded_urls               2297 non-null   object             
 9   retweet_count               2043 non-null   float64            
 10  favorite_count              2043 n

In [81]:
twitter_df.describe()

Unnamed: 0,retweet_count,favorite_count
count,2043.0,2043.0
mean,2767.898189,7830.767988
std,4593.251203,11943.897292
min,1.0,0.0
25%,521.0,1038.0
50%,1404.0,3574.0
75%,3277.0,10130.5
max,74834.0,151127.0


In [82]:
dog_ratings_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,denominator
tweet_id,rating_no,Unnamed: 2_level_1,Unnamed: 3_level_1
892420643555336193,0,13.0,10.0
892177421306343426,0,13.0,10.0
891815181378084864,0,12.0,10.0
891689557279858688,0,13.0,10.0
891327558926688256,0,12.0,10.0


In [83]:
dog_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2362 entries, ('892420643555336193', 0) to ('666020888022790149', 0)
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   numerator    2362 non-null   float64
 1   denominator  2362 non-null   float64
dtypes: float64(2)
memory usage: 142.4+ KB


In [84]:
dog_ratings_df.describe()

Unnamed: 0,numerator,denominator
count,2362.0,2362.0
mean,12.597388,10.0
std,2.622902,0.0
min,10.0,10.0
25%,11.0,10.0
50%,12.0,10.0
75%,13.0,10.0
max,19.75,10.0


In [85]:
image_pred_df.head()

Unnamed: 0_level_0,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [86]:
image_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075 entries, 666020888022790149 to 892420643555336193
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   jpg_url  2075 non-null   object 
 1   img_num  2075 non-null   int64  
 2   p1       2075 non-null   object 
 3   p1_conf  2075 non-null   float64
 4   p1_dog   2075 non-null   bool   
 5   p2       2075 non-null   object 
 6   p2_conf  2075 non-null   float64
 7   p2_dog   2075 non-null   bool   
 8   p3       2075 non-null   object 
 9   p3_conf  2075 non-null   float64
 10  p3_dog   2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(4)
memory usage: 152.0+ KB


In [87]:
image_pred_df.describe()

Unnamed: 0,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0
mean,1.203855,0.594548,0.1345886,0.06032417
std,0.561875,0.271174,0.1006657,0.05090593
min,1.0,0.044333,1.0113e-08,1.74017e-10
25%,1.0,0.364412,0.05388625,0.0162224
50%,1.0,0.58823,0.118181,0.0494438
75%,1.0,0.843855,0.1955655,0.09180755
max,4.0,1.0,0.488014,0.273419


In [88]:
tweet_dog_info_df.head()

Unnamed: 0_level_0,name,doggo,floofer,pupper,puppo
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892420643555336193,Phineas,False,False,False,False
892177421306343426,Tilly,False,False,False,False
891815181378084864,Archie,False,False,False,False
891689557279858688,Darla,False,False,False,False
891327558926688256,Franklin,False,False,False,False


In [89]:
tweet_dog_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2356 entries, 892420643555336193 to 666020888022790149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     1502 non-null   object
 1   doggo    2356 non-null   bool  
 2   floofer  2356 non-null   bool  
 3   pupper   2356 non-null   bool  
 4   puppo    2356 non-null   bool  
dtypes: bool(4), object(1)
memory usage: 126.0+ KB


In [90]:
tweet_dog_info_df.describe()

Unnamed: 0,name,doggo,floofer,pupper,puppo
count,1502,2356,2356,2356,2356
unique,931,2,2,2,2
top,Charlie,False,False,False,False
freq,12,2263,2346,2101,2326


## Appendix

### Dog Breed Values Assessment in `image_pred_df`

In [91]:
dog_breeds = pd.concat((image_pred_df.p1, image_pred_df.p2))
dog_breeds = pd.concat((dog_breeds, image_pred_df.p3))
dog_breeds = dog_breeds.unique()
dog_breeds.sort()
a = dog_breeds[:-1]
b = dog_breeds[1:]
a_lens = np.array([len(x) for x in a])
b_lens = np.array([len(x) for x in b])

In [92]:
# observe only the dog breed names whose length are similar with the previous (this is a sorted list)
len_diffs = np.abs(b_lens - a_lens)
idxs = len_diffs <= 2
idxs = np.insert(idxs, 0, True)
idxs_final = idxs[:-1] | idxs[1:]
idxs_final = np.append(idxs_final, idxs[-1])

In [93]:
dog_breeds[idxs_final]

array(['Afghan_hound', 'African_chameleon', 'African_crocodile',
       'American_alligator', 'American_black_bear', 'Appenzeller',
       'Arabian_camel', 'Bedlington_terrier', 'Bernese_mountain_dog',
       'Border_collie', 'Border_terrier', 'Brabancon_griffon',
       'Brittany_spaniel', 'English_foxhound', 'English_setter',
       'English_springer', 'EntleBucher', 'Eskimo_dog', 'French_horn',
       'French_loaf', 'Gila_monster', 'Gordon_setter', 'Irish_setter',
       'Irish_terrier', 'Irish_wolfhound', 'Italian_greyhound',
       'Japanese_spaniel', 'Kerry_blue_terrier', 'Labrador_retriever',
       'Lakeland_terrier', 'Lhasa', 'Loafer', 'Pekinese', 'Pembroke',
       'Persian_cat', 'Pomeranian', 'Saluki', 'Samoyed',
       'Scottish_deerhound', 'Sealyham_terrier', 'Shetland_sheepdog',
       'Sussex_spaniel', 'Tibetan_mastiff', 'Tibetan_terrier',
       'Walker_hound', 'Weimaraner', 'acorn_squash', 'affenpinscher',
       'agama', 'alp', 'apron', 'axolotl', 'badger', 'bagel', '

Visually assessing the list of dog breed names above shows that there is no ambiguously similar dog breeds.