# Importing all required libraries

In [1]:
import os
import requests
import json
# Keep the current working directory (as reported by the OS) for later use.
cwd = os.getcwd()

In [2]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib as mpl
import seaborn as sb
import numpy as np
import pandas as pd
from pandas import Series
import zipfile

# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-DPI) figure rendering.
%config InlineBackend.figure_format = 'retina'
# Default figure size as (width, height).
rcParams['figure.figsize'] = 5, 4
# Apply seaborn's 'whitegrid' style to subsequent plots.
sb.set_style('whitegrid')


# Raise pandas' display limits so long/wide frames are not truncated.
# The legacy 'display.height' option is intentionally not set: it is no longer
# a registered pandas option (setting it raises OptionError and stops the
# notebook), and 'display.max_rows' is what controls how many rows are shown.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Assessing Data

## Assessing the Twitter Archive

In [3]:
# Read the Twitter archive CSV into a DataFrame, then print its column names,
# non-null counts and dtypes.
df = pd.read_csv('twitter-archive-enhanced.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [44]:
# Preview the first five rows of the archive.
df.head(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


## <font color='red'>Issues within the Twitter Archive DataFrame:</font>

`Tidiness Issues`
- T-1 **denominator of 10 and the numerators could all be in one column**
- T-2 **dog stages need to be in one single column (doggo, floofer, pupper, puppo)**
- T-3 **it's not clear whether the tweet is the original status or a retweet of something else, so we need to add a column for Retweet (Yes/No)**

`Quality Issues`
- Q-1 **too many NaNs in (in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp)**
- Q-2 **source column is not clear; we need to extract the exact source (e.g. iPhone)**
- Q-3 **for Dog Stage and Name, missing values are recorded as "None" rather than NaN**
- Q-4 **timestamp is stored as a plain string (object), so we should convert it to a standard datetime type**

## Assessing the Image Prediction DataFrame

In [8]:
# Load the tab-separated image-predictions file and print its column summary.
df_image = pd.read_csv('image_predictions.tsv', sep='\t')
df_image.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [10]:
# Show ten randomly chosen rows. The seed is fixed because an unseeded sample
# displays different rows on every run, so notes written about specific rows
# go stale after Restart & Run All.
df_image.sample(10, random_state=42)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
319,671768281401958400,https://pbs.twimg.com/media/CVKZsHtWwAA6gPj.jpg,2,Chihuahua,0.500373,True,French_bulldog,0.112796,True,Italian_greyhound,0.062893,True
1695,816336735214911488,https://pbs.twimg.com/media/C1Q17WdWEAAjKFO.jpg,1,Labrador_retriever,0.91933,True,kuvasz,0.04948,True,golden_retriever,0.011934,True
350,672488522314567680,https://pbs.twimg.com/media/CVUovvHWwAAD-nu.jpg,1,Doberman,0.605358,True,Rottweiler,0.108382,True,Appenzeller,0.077798,True
982,707387676719185920,https://pbs.twimg.com/media/CdElVm7XEAADP6o.jpg,1,Chihuahua,0.888468,True,Italian_greyhound,0.088635,True,toy_terrier,0.015938,True
903,700062718104104960,https://pbs.twimg.com/media/CbcfUxoUAAAlHGK.jpg,1,hummingbird,0.180998,False,peacock,0.135179,False,eel,0.075371,False
1618,802572683846291456,https://pbs.twimg.com/media/CyNPmJgXcAECPuB.jpg,1,golden_retriever,0.610171,True,Labrador_retriever,0.173252,True,cocker_spaniel,0.163257,True
65,667174963120574464,https://pbs.twimg.com/media/CUJIFoJWsAAL3Dc.jpg,1,toy_poodle,0.266437,True,Chihuahua,0.243223,True,bluetick,0.072806,True
134,668484198282485761,https://pbs.twimg.com/media/CUbu1GAWsAEH3E-.jpg,1,standard_poodle,0.587372,True,Bedlington_terrier,0.182411,True,Afghan_hound,0.040968,True
823,693155686491000832,https://pbs.twimg.com/media/CZ6VatdWwAAwHly.jpg,3,Shih-Tzu,0.69748,True,Lhasa,0.200151,True,Tibetan_terrier,0.09097,True
758,688789766343622656,https://pbs.twimg.com/media/CY8SocAWsAARuyh.jpg,1,American_Staffordshire_terrier,0.59966,True,Staffordshire_bullterrier,0.380976,True,bull_mastiff,0.003889,True


In [11]:
# Count how many times each distinct value occurs in the p1 column.
df_image['p1'].value_counts()

golden_retriever                  150
Labrador_retriever                100
Pembroke                           89
Chihuahua                          83
pug                                57
chow                               44
Samoyed                            43
toy_poodle                         39
Pomeranian                         38
cocker_spaniel                     30
malamute                           30
French_bulldog                     26
miniature_pinscher                 23
Chesapeake_Bay_retriever           23
seat_belt                          22
German_shepherd                    20
Siberian_husky                     20
Staffordshire_bullterrier          20
Cardigan                           19
web_site                           19
Eskimo_dog                         18
beagle                             18
teddy                              18
Maltese_dog                        18
Shetland_sheepdog                  18
Rottweiler                         17
Lakeland_ter

## <font color='red'>Issues within the Image Prediction DataFrame:</font>

`Tidiness Issues:`
- T-4 **The column titles are not descriptive enough (img_num, p(1-3), p(1-3)_dog)**
- T-5 **the number of records in this DataFrame is not the same as in the archive: it has 2075 records while the other has 2356 records**

`Quality Issues:`
- Q-5 **Some of the photos are not dogs (e.g. p1 says hummingbird, p2 peacock, p3 eel)**
- Q-6 **some of the names in p1-p3 are not real dog breeds (web_site, llama, tub, etc.)**





## Assessing the Tweet extra data - DataFrame

In [34]:
# Read the saved tweet data CSV into a DataFrame, then print its column names,
# non-null counts and dtypes.
df_tweet = pd.read_csv('tweet_df.csv')
df_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345 entries, 0 to 2344
Data columns (total 6 columns):
Unnamed: 0        2345 non-null int64
favorite_count    2345 non-null int64
full_text         2345 non-null object
retweet_count     2345 non-null int64
tweet_id          2345 non-null int64
tweet_status      2345 non-null object
dtypes: int64(4), object(2)
memory usage: 110.0+ KB


In [35]:
# Preview the first five rows of the tweet data.
df_tweet.head(n=5)

Unnamed: 0.1,Unnamed: 0,favorite_count,full_text,retweet_count,tweet_id,tweet_status
0,0,38994,This is Phineas. He's a mystical boy. Only eve...,8653,892420643555336193,"Status(truncated=False, _api=<tweepy.api.API o..."
1,1,33390,This is Tilly. She's just checking pup on you....,6355,892177421306343426,"Status(truncated=False, _api=<tweepy.api.API o..."
2,2,25168,This is Archie. He is a rare Norwegian Pouncin...,4220,891815181378084864,"Status(truncated=False, _api=<tweepy.api.API o..."
3,3,42347,This is Darla. She commenced a snooze mid meal...,8768,891689557279858688,"Status(truncated=False, _api=<tweepy.api.API o..."
4,4,40514,This is Franklin. He would like you to stop ca...,9531,891327558926688256,"Status(truncated=False, _api=<tweepy.api.API o..."


In [46]:
# Display the complete full_text value for row label 1 (untruncated).
df_tweet.loc[1, 'full_text']

"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV"

## <font color='red'>Issues within the Tweet DataFrame:</font>

`Tidiness Issues:`
- T-6 **Columns are not optimally arranged**
- T-7 **no need for the "Unnamed: 0" id column at the beginning**

    
`Quality Issues:`
- Q-7 **full text contains URLs and the rating, which are irrelevant and need to be removed**
- Q-8 **tweet_id is stored as an integer; it needs to be a string**

# Clean