# Project: Wrangling and Analyze Data

In [1]:
import pandas as pd
import numpy as np
import requests
import json

## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each piece of data are different.
1. Directly download the WeRateDogs Twitter archive data (twitter-archive-enhanced.csv)

In [2]:
# Load the manually downloaded archive CSV from the working directory
# and preview its first rows.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
twitter_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [3]:
# Check the dtype of tweet_id: it must be an integer type so the IDs are
# not stored as floats.
twitter_archive.tweet_id.dtype

dtype('int64')

2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [4]:
# Location of the image-prediction file.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) so that an error page is never
# saved to disk as if it were the TSV data.
response.raise_for_status()

In [5]:
# Persist the raw bytes of the HTTP response as a local TSV file.
# Binary mode writes the payload exactly as it was received.
with open("image-predictions.tsv", mode='wb') as tsv_file:
    tsv_file.write(response.content)

In [6]:
# Read the saved file back as a DataFrame; the file is tab-separated,
# hence sep='\t'. Preview the first rows.
predictions = pd.read_csv('image-predictions.tsv',sep='\t')
predictions.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [11]:
import os

import tweepy

# Twitter API credentials are read from environment variables so that real
# secrets never have to be typed into (or saved with) the notebook.
# Each value falls back to an empty string when its variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# Build the OAuth handler from the application and user credentials.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit=True asks tweepy to wait when the rate limit is reached
# rather than raising, which matters for the bulk download below.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Smoke test of the API credentials: fetch the tweet whose ID is stored at
# index label 0 of the archive, requesting the extended (full-text) form.
tweet = api.get_status(twitter_archive['tweet_id'][0],tweet_mode='extended')
#print(tweet._json)
#print(tweet._json['full_text'])

In [None]:
# IDs successfully written to tweet_json.txt, and IDs that could not be fetched.
done = []
failed = []
# Reason for each failure (tweet_id -> repr of the exception), kept so that
# failures can be inspected afterwards instead of being silently discarded.
failed_errors = {}

# One JSON document per line, so the file can be read with
# pd.read_json(..., lines=True).
with open('tweet_json.txt', 'w') as f:
    for tweet_id in twitter_archive['tweet_id']:
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            # Serialize first and write the whole line in one call, so a
            # serialization error cannot leave a partial record in the file.
            f.write(json.dumps(tweet._json) + '\n')
            done.append(tweet_id)
        except Exception as error:
            # Best effort: a tweet that cannot be retrieved is recorded and
            # skipped. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working, so this long-running
            # loop can still be interrupted.
            failed.append(tweet_id)
            failed_errors[tweet_id] = repr(error)

print("extracted tweets: ", len(done))
print("not extracted: ", len(failed))

In [17]:
# Load the line-delimited JSON file (one tweet per line) into a DataFrame
# and preview the first rows.
# NOTE(review): in the displayed preview the id_str values differ from id in
# their last digits, which suggests id_str was parsed as a number and lost
# precision -- rely on `id`, not `id_str`, as the tweet key.
tweets = pd.read_json('tweet_json.txt', lines = True, encoding='utf-8')
tweets.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,source,in_reply_to_status_id,...,favorited,retweeted,possibly_sensitive,possibly_sensitive_appealable,lang,retweeted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status
0,2017-08-01 16:23:56+00:00,892420643555336193,892420643555336192,This is Phineas. He's a mystical boy. Only eve...,False,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892420639486877696, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,0.0,en,,,,,
1,2017-08-01 00:17:27+00:00,892177421306343426,892177421306343424,This is Tilly. She's just checking pup on you....,False,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892177413194625024, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,0.0,en,,,,,
2,2017-07-31 00:18:03+00:00,891815181378084864,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,False,"[0, 121]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 891815175371796480, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,0.0,en,,,,,
3,2017-07-30 15:58:51+00:00,891689557279858688,891689557279858688,This is Darla. She commenced a snooze mid meal...,False,"[0, 79]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 891689552724799489, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,0.0,en,,,,,
4,2017-07-29 16:00:24+00:00,891327558926688256,891327558926688256,This is Franklin. He would like you to stop ca...,False,"[0, 138]","{'hashtags': [{'text': 'BarkWeek', 'indices': ...","{'media': [{'id': 891327551943041024, 'id_str'...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,0.0,en,,,,,


## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issues**. You must use **both** visual assessment and
programmatic assessment to assess the data.

**Note:** pay attention to the following key points when you assess the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



In [14]:
# Programmatic assessment: column dtypes and non-null counts of the archive.
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [42]:
# Summary statistics of the numeric columns; the min/max of the rating
# columns expose out-of-range values.
twitter_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [23]:
# Number of fully duplicated rows in the archive.
twitter_archive.duplicated().sum()

0

In [15]:
# Programmatic assessment: column dtypes and non-null counts of the predictions.
predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [43]:
# Summary statistics of the numeric prediction columns.
predictions.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [24]:
# Number of fully duplicated rows in the predictions table.
predictions.duplicated().sum()

0

In [28]:
# Programmatic assessment: column dtypes and non-null counts of the API data.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2327 entries, 0 to 2326
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2327 non-null   datetime64[ns, UTC]
 1   id                             2327 non-null   int64              
 2   id_str                         2327 non-null   int64              
 3   full_text                      2327 non-null   object             
 4   truncated                      2327 non-null   bool               
 5   display_text_range             2327 non-null   object             
 6   entities                       2327 non-null   object             
 7   extended_entities              2057 non-null   object             
 8   source                         2327 non-null   object             
 9   in_reply_to_status_id          77 non-null     float64            
 10  in_reply_to_status_id_st

### Quality issues

`twitter_archive.csv`
	
*Visual assessment*
	
	1. The string "None" is used instead of NaN for missing values in the doggo, floofer, pupper, puppo columns
	
*Programmatic assessment*
	
	2. in_reply and retweeted rows are not useful
	3. timestamp column is not a datetime dtype
    4. invalid values in numerator and denominator columns (0)
    
`predictions.csv`
	
*Visual assessment*
	
	5. some predicted dog breeds are capitalized, others not
    6. non-explicit column names for predictions
	
*Programmatic assessment*
    
    -

`tweets.csv`

*Visual assessment*
    
    7. the id column name is not suitable for later merging (it's tweet_id in the other tables)
    
*Programmatic assessment*

    8. in_reply and is_quote rows are not useful, nor are the source, user, geo, coordinates, place, contributors, favorited, retweeted, possibly_sensitive, lang columns

### Tidiness issues

`twitter_archive.csv`
	
*Visual assessment*
	
	9. doggo, floofer, pupper, puppo are in separate columns for no particular reason
    
`twitter_archive.csv`, `predictions.csv`, `tweets.csv`

*Programmatic assessment*

    10. All the three dataframes can be merged on tweet_id, after dropping non relevant columns

## Cleaning Data
In this section, clean **all** of the issues you documented while assessing. 

**Note:** Make a copy of the original data before cleaning. Cleaning includes merging individual pieces of data according to the rules of [tidy data](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). The result should be a high-quality and tidy master pandas DataFrame (or DataFrames, if appropriate).

In [178]:
# Work on independent copies so the gathered originals stay untouched
# while cleaning.
twitter_archive_clean, predictions_clean, tweets_clean = (
    original.copy() for original in (twitter_archive, predictions, tweets)
)

### Issue #1: 

#### Define: 

The string "None" is used instead of NaN for missing values in the doggo, floofer, pupper, puppo columns in `twitter_archive.csv`

#### Code

In [179]:
# Turn the placeholder string 'None' into a real missing value.
# The replacement is applied to the whole frame, so every column holding
# the exact string 'None' is affected, not only the four dog-stage columns.
twitter_archive_clean = twitter_archive_clean.replace(to_replace='None', value=np.nan)

#### Test

In [180]:
# Test: the non-null counts should now reflect the replaced 'None' values.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        1611 non-null   object 
 13  doggo                       97 no

### Issue #2:

#### Define

in_reply and retweeted rows are not useful in `twitter_archive.csv`

#### Code

In [181]:
# Keep only original tweets: a row is a reply when in_reply_to_status_id
# is set, and a retweet when retweeted_status_id is set.
is_reply = twitter_archive_clean['in_reply_to_status_id'].notna()
is_retweet = twitter_archive_clean['retweeted_status_id'].notna()
twitter_archive_clean = twitter_archive_clean[~is_reply & ~is_retweet]

In [182]:
# The reply/retweet columns are empty once those rows are removed,
# so drop them from the frame.
reply_retweet_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
twitter_archive_clean = twitter_archive_clean.drop(columns=reply_retweet_columns)

#### Test

In [183]:
# Test: fewer rows, and the reply/retweet columns should be gone.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_id            2097 non-null   int64 
 1   timestamp           2097 non-null   object
 2   source              2097 non-null   object
 3   text                2097 non-null   object
 4   expanded_urls       2094 non-null   object
 5   rating_numerator    2097 non-null   int64 
 6   rating_denominator  2097 non-null   int64 
 7   name                1494 non-null   object
 8   doggo               83 non-null     object
 9   floofer             10 non-null     object
 10  pupper              230 non-null    object
 11  puppo               24 non-null     object
dtypes: int64(3), object(9)
memory usage: 213.0+ KB


In [184]:
# The source column is not used in this analysis; remove it.
twitter_archive_clean = twitter_archive_clean.drop(columns=['source'])

In [185]:
# Test: the source column should no longer be listed.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_id            2097 non-null   int64 
 1   timestamp           2097 non-null   object
 2   text                2097 non-null   object
 3   expanded_urls       2094 non-null   object
 4   rating_numerator    2097 non-null   int64 
 5   rating_denominator  2097 non-null   int64 
 6   name                1494 non-null   object
 7   doggo               83 non-null     object
 8   floofer             10 non-null     object
 9   pupper              230 non-null    object
 10  puppo               24 non-null     object
dtypes: int64(3), object(8)
memory usage: 196.6+ KB


### Issue #3:

#### Define

timestamp column is not a datetime dtype in `twitter_archive.csv`

#### Code

In [186]:
# Parse the timestamp strings into pandas datetime values.
parsed_timestamp = pd.to_datetime(twitter_archive_clean['timestamp'])
twitter_archive_clean = twitter_archive_clean.assign(timestamp=parsed_timestamp)

#### Test

In [187]:
# Test: timestamp should now have a datetime dtype.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2097 non-null   int64              
 1   timestamp           2097 non-null   datetime64[ns, UTC]
 2   text                2097 non-null   object             
 3   expanded_urls       2094 non-null   object             
 4   rating_numerator    2097 non-null   int64              
 5   rating_denominator  2097 non-null   int64              
 6   name                1494 non-null   object             
 7   doggo               83 non-null     object             
 8   floofer             10 non-null     object             
 9   pupper              230 non-null    object             
 10  puppo               24 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(3), object(7)
memory usage: 196.6+ KB


### Issue #4:

#### Define

invalid values in numerator and denominator columns (0) in `twitter_archive.csv`

#### Code

In [188]:
# List the rows whose rating numerator is 0.
twitter_archive_clean[twitter_archive_clean['rating_numerator'] == 0]

Unnamed: 0,tweet_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
315,835152434251116546,2017-02-24 15:40:31+00:00,When you're so blinded by your systematic plag...,https://twitter.com/dog_rates/status/835152434...,0,10,,,,,


In [189]:
# List the rows whose rating denominator is 0.
twitter_archive_clean[twitter_archive_clean['rating_denominator'] == 0]

Unnamed: 0,tweet_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [190]:
# Keep only rows where neither part of the rating is 0.
nonzero_numerator = twitter_archive_clean['rating_numerator'] != 0
nonzero_denominator = twitter_archive_clean['rating_denominator'] != 0
twitter_archive_clean = twitter_archive_clean[nonzero_numerator & nonzero_denominator]

#### Test

In [191]:
# Test: the row count should have dropped by the number of zero ratings.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2096 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2096 non-null   int64              
 1   timestamp           2096 non-null   datetime64[ns, UTC]
 2   text                2096 non-null   object             
 3   expanded_urls       2093 non-null   object             
 4   rating_numerator    2096 non-null   int64              
 5   rating_denominator  2096 non-null   int64              
 6   name                1494 non-null   object             
 7   doggo               83 non-null     object             
 8   floofer             10 non-null     object             
 9   pupper              230 non-null    object             
 10  puppo               24 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(3), object(7)
memory usage: 196.5+ KB


### Issue #5:

#### Define

some predicted dog breeds are capitalized, others not in `predictions.csv`

#### Code

In [192]:
# Normalize the three predicted labels to lowercase so that the same
# label is always written the same way.
for label_column in ['p1', 'p2', 'p3']:
    predictions_clean[label_column] = predictions_clean[label_column].str.lower()

#### Test

In [193]:
# Test: the p1/p2/p3 labels in the preview should all be lowercase.
predictions_clean.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,welsh_springer_spaniel,0.465074,True,collie,0.156665,True,shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,german_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,rottweiler,0.243682,True,doberman,0.154629,True


### Issue #6:

#### Define

non explicit column names for predictions in `predictions.csv`

#### Code

In [194]:
# Map the terse p1/p2/p3 column names to self-describing ones:
# the predicted label, its confidence, and whether the label is a dog.
explicit_prediction_names = {
    'p1': 'test1_prediction',
    'p1_conf': 'test1_confidence',
    'p1_dog': 'test1_isDog',
    'p2': 'test2_prediction',
    'p2_conf': 'test2_confidence',
    'p2_dog': 'test2_isDog',
    'p3': 'test3_prediction',
    'p3_conf': 'test3_confidence',
    'p3_dog': 'test3_isDog',
}
predictions_clean = predictions_clean.rename(columns=explicit_prediction_names)

#### Test

In [195]:
# Test: the prediction columns should carry the new explicit names.
predictions_clean.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'test1_prediction',
       'test1_confidence', 'test1_isDog', 'test2_prediction',
       'test2_confidence', 'test2_isDog', 'test3_prediction',
       'test3_confidence', 'test3_isDog'],
      dtype='object')

### Issue #7:

#### Define

the id column name is not suitable for later merging (it's tweet_id in the other tables) for `tweets.csv`

#### Code

In [196]:
# Give the key column the same name as in the other two tables so the
# frames can later be merged on it.
tweets_clean = tweets_clean.rename(columns={'id': 'tweet_id'})

#### Test

In [197]:
# Test: the column list should contain tweet_id in place of id.
tweets_clean.columns

Index(['created_at', 'tweet_id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'extended_entities', 'source',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'retweet_count', 'favorite_count',
       'favorited', 'retweeted', 'possibly_sensitive',
       'possibly_sensitive_appealable', 'lang', 'retweeted_status',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink',
       'quoted_status'],
      dtype='object')

### Issue #8:

#### Define

in_reply and is_quote rows are not useful for `tweets.csv`

#### Code

In [198]:
# Keep only tweets that are neither replies (in_reply_to_status_id set)
# nor quotes (quoted_status_id set).
is_reply_tweet = tweets_clean['in_reply_to_status_id'].notna()
is_quote_tweet = tweets_clean['quoted_status_id'].notna()
tweets_clean = tweets_clean[~is_reply_tweet & ~is_quote_tweet]

In [199]:
# The reply/quote columns are empty once those rows are removed,
# so drop them from the frame.
reply_quote_columns = [
    'in_reply_to_status_id',
    'in_reply_to_status_id_str',
    'in_reply_to_user_id',
    'in_reply_to_user_id_str',
    'in_reply_to_screen_name',
    'quoted_status_id',
    'quoted_status_id_str',
    'quoted_status_permalink',
    'quoted_status',
]
tweets_clean = tweets_clean.drop(columns=reply_quote_columns)

#### Test

In [200]:
# Test: fewer rows, and the reply/quote columns should be gone.
tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2224 entries, 0 to 2326
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2224 non-null   datetime64[ns, UTC]
 1   tweet_id                       2224 non-null   int64              
 2   id_str                         2224 non-null   int64              
 3   full_text                      2224 non-null   object             
 4   truncated                      2224 non-null   bool               
 5   display_text_range             2224 non-null   object             
 6   entities                       2224 non-null   object             
 7   extended_entities              2034 non-null   object             
 8   source                         2224 non-null   object             
 9   user                           2224 non-null   object             
 10  geo                     

In [201]:
# Remove every remaining column listed here; per the check that follows,
# only tweet_id, retweet_count and favorite_count are left afterwards.
unused_tweet_columns = [
    'created_at', 'truncated', 'display_text_range', 'entities',
    'extended_entities', 'source', 'user', 'geo', 'coordinates', 'place',
    'contributors', 'is_quote_status', 'favorited', 'retweeted',
    'possibly_sensitive', 'possibly_sensitive_appealable', 'lang',
    'retweeted_status', 'id_str', 'full_text',
]
tweets_clean = tweets_clean.drop(columns=unused_tweet_columns)

In [202]:
# Test: only tweet_id, retweet_count and favorite_count should remain.
tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2224 entries, 0 to 2326
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   tweet_id        2224 non-null   int64
 1   retweet_count   2224 non-null   int64
 2   favorite_count  2224 non-null   int64
dtypes: int64(3)
memory usage: 69.5 KB


### Issue #9:

#### Define

doggo, floofer, pupper, puppo are in separate columns for no particular reason in `twitter_archive.csv`

#### Code

In [203]:
# Collapse the four dog-stage flag columns into a single dog_class column.
# A tweet can be flagged with more than one stage; assigning the stages one
# after another would silently keep only the last one. Instead, all stages
# found in a row are joined with a comma (e.g. 'doggo,pupper'), in the order
# given by stage_columns. Rows without any stage get NaN.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
twitter_archive_clean['dog_class'] = [
    # A flag counts only when the cell holds the stage's own name, so
    # missing values and any other placeholder are ignored.
    ','.join(stage for stage, value in zip(stage_columns, row_values) if value == stage) or np.nan
    for row_values in twitter_archive_clean[stage_columns].itertuples(index=False, name=None)
]

In [204]:
# The four flag columns are redundant now that dog_class exists.
twitter_archive_clean = twitter_archive_clean.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

#### Test

In [205]:
# Test: dog_class should be present and the four stage columns gone.
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2096 entries, 0 to 2355
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2096 non-null   int64              
 1   timestamp           2096 non-null   datetime64[ns, UTC]
 2   text                2096 non-null   object             
 3   expanded_urls       2093 non-null   object             
 4   rating_numerator    2096 non-null   int64              
 5   rating_denominator  2096 non-null   int64              
 6   name                1494 non-null   object             
 7   dog_class           336 non-null    object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 147.4+ KB


### Issue #10:

#### Define

All the three dataframes can be merged on tweet_id, after dropping non relevant columns

#### Code

In [206]:
# Join the cleaned archive with the image predictions on tweet_id.
# The default join type is used, so only tweets present in both tables are kept.
all_data = twitter_archive_clean.merge(predictions_clean, on=['tweet_id'])

In [207]:
# Add the retweet/favorite counts, again matching rows on tweet_id with the
# default join type.
all_data = all_data.merge(tweets_clean, on=['tweet_id'])

#### Test

In [208]:
# Test: the master frame should contain the columns of all three tables.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1962 entries, 0 to 1961
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            1962 non-null   int64              
 1   timestamp           1962 non-null   datetime64[ns, UTC]
 2   text                1962 non-null   object             
 3   expanded_urls       1962 non-null   object             
 4   rating_numerator    1962 non-null   int64              
 5   rating_denominator  1962 non-null   int64              
 6   name                1440 non-null   object             
 7   dog_class           302 non-null    object             
 8   jpg_url             1962 non-null   object             
 9   img_num             1962 non-null   int64              
 10  test1_prediction    1962 non-null   object             
 11  test1_confidence    1962 non-null   float64            
 12  test1_isDog         1962 non-null 

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [209]:
# Save the master dataset without the index column.
# Note: the file is tab-separated (sep='\t') despite its .csv extension,
# so it must be read back with the same separator.
all_data.to_csv("twitter_archive_master.csv",sep='\t',index=False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

In [210]:
# Reload the stored master dataset with the tab separator it was written
# with, then preview the first rows.
twitter_data = pd.read_csv("twitter_archive_master.csv",sep='\t')
twitter_data.head()

Unnamed: 0,tweet_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,dog_class,jpg_url,img_num,...,test1_confidence,test1_isDog,test2_prediction,test2_confidence,test2_isDog,test3_prediction,test3_confidence,test3_isDog,retweet_count,favorite_count
0,892420643555336193,2017-08-01 16:23:56+00:00,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,...,0.097049,False,bagel,0.085851,False,banana,0.07611,False,7102,34218
1,892177421306343426,2017-08-01 00:17:27+00:00,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,...,0.323581,True,pekinese,0.090647,True,papillon,0.068957,True,5350,29677
2,891815181378084864,2017-07-31 00:18:03+00:00,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,...,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True,3520,22349
3,891689557279858688,2017-07-30 15:58:51+00:00,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,...,0.170278,False,labrador_retriever,0.168086,True,spatula,0.040836,False,7303,37392
4,891327558926688256,2017-07-29 16:00:24+00:00,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,...,0.555712,True,english_springer,0.22577,True,german_short-haired_pointer,0.175219,True,7848,35743


### Insights:


1. The highest ratings

2. Which prediction has the highest confidence

3. Most retweeted and favorited tweets

### Visualization