# We Rate Dogs 

## Introduction

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
from PIL import Image
from io import BytesIO
import numpy as np
import requests
import tweepy
import json
from PIL import Image
from io import BytesIO

pd.set_option('float_format', '{:f}'.format)

### Gathering

In [2]:
archive = pd.read_csv('twitter_archive_enhanced.csv')
archive.shape

(2356, 17)

In [3]:
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
response = requests.get(url)
print(response.status_code)

with open("image_predictions.tsv", mode='wb') as file:
    file.write(response.content)

predict = pd.read_csv('image_predictions.tsv', sep='\t')
predict.shape

200


(2075, 12)

In [7]:
json = pd.read_json('tweet_json.txt', lines=True)

In [9]:
json['tweet_id']=json['id']

KeyError: 'id'

In [None]:
json

In [None]:
#Selecting columns 
twitter_api = json[['tweet_id','retweet_count', 'favorite_count']]

In [None]:
# Merging for Assesing
m1 = pd.merge(twitter_api,archive, on="tweet_id",how='left')
m2 = pd.merge(m1,predict, on="tweet_id",how='left')
df_final = m2

## Assesing 

In [None]:
pd.set_option("max_colwidth", 10)
pd.set_option('display.max_columns', 30)
df_final.head()

In [None]:
df_final.tail()

In [None]:
df_final.info()

In [None]:
df_final.describe()

In [None]:
df_final.describe(include=[object])

In [None]:
#Cheking for nulls
df_final.isnull().values.sum()

In [None]:
df_final.isnull().sum()

In [None]:
#Checking numerator rating
df_final.rating_numerator.value_counts().sort_index(ascending=True)

In [None]:
df_final.rating_numerator.describe()

In [None]:
#Checking Denominator rating
df_final.rating_denominator.value_counts().sort_index(ascending=True)

In [None]:
#Inspecting for duplicates
df_final.duplicated().sum()

In [None]:
pd.set_option("max_colwidth", 30)
#Inspecting dog breeds
df_final[['p1', 'p2', 'p3']].describe(include=['object'])

In [None]:
#Inspecting confidence levels
df_final[['p1_conf', 'p2_conf',
          'p3_conf']].describe().loc[['min', 'max', 'mean']].round(5)

In [None]:
#Inspecting dog stages
df_final[['doggo', 'floofer', 'pupper', 'puppo']].apply(pd.Series.value_counts)

In [None]:
#Inspecting Names
df_final.name.loc[50:60]

In [None]:
#Inspecting favorite and retweet count
df_final[['favorite_count', 'p1', 'p2', 'p3']].min()

In [None]:
df_final[['retweet_count', 'p1', 'p2', 'p3']].max()

 ## Observations:
 Quality Issues:
 - Completeness: 10124 missing records.
 - Incorrect datatype in columns: `"tweet_id"`,`"timestamp"` 
 - The `"name"` column has accuracy issues
 - The `"in_reply_to_status_id"`, `"in_reply_to_user_id`` columns has missing values and we are not interest in replies.
 - The `"p1"`,`"p2"`,`"p3"` columns with validity and consistency issues
 - The `"p1_dog"`,`"p2_dog"`,`"p3_dog"` columns  with validity issues
 - We can observe that there are some 'None',objects present in `"doggo"`,`"floofer"`,`"pupper"` and`"puppo"` which have to be convert into 'NaN'

- The `"retweeted_status_id"`, `"retweeted_status_user_id"`,`"retweeted_status_timestamp"` are related to retweets should be dropped
 
 
 
## Tidyness
- The `"p1_conf"`,`"p2_conf"`,`"p3_conf"` are related to dog test, shoud be converted to one column
- The `"doggo"`,`"floofer"`,`"pupper"` and`"puppo"` are related to dog stage and doesnt follow rules of tidy data.

In [None]:
#Creating copies before cleaning
# Making a copies of original pieces
twitter_api_clean = twitter_api.copy()
archive_clean = archive.copy()
predict_clean = predict.copy()

## Clean

### 1. Define
Using pd.isna() on:
`'in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id',
 'retweeted_status_user_id','retweeted_status_timestamp'`


### Code

In [None]:
archive_clean = archive_clean[archive_clean['in_reply_to_status_id'].isna()]
archive_clean = archive_clean[archive_clean['in_reply_to_user_id'].isna()]
archive_clean = archive_clean[archive_clean['retweeted_status_id'].isna()]
archive_clean = archive_clean[archive_clean['retweeted_status_user_id'].isna()]
archive_clean = archive_clean[archive_clean['retweeted_status_timestamp'].isna()]

### Test

In [None]:
archive_clean.info()

### 2. Define
 Fixing Dog Stages

In [None]:
# replacing the stage name with 1, and 'None' with 0, like a dummy variable
make_dummy = lambda x: 0 if x == 'None' else 1

archive_clean.doggo = archive_clean.doggo.apply(make_dummy)
archive_clean.floofer = archive_clean.floofer.apply(make_dummy)
archive_clean.pupper = archive_clean.pupper.apply(make_dummy)
archive_clean.puppo = archive_clean.puppo.apply(make_dummy)

# by adding the stage columns, we can see how many are 'none' and how many stages are set
archive_clean['none'] = archive_clean['doggo'] + archive_clean['floofer'] + \
                        archive_clean['pupper'] + archive_clean['puppo']

In [None]:
stage_none = lambda x: 1 if x == 0 else 0

archive_clean['none'] = archive_clean['none'].apply(stage_none)

stage = ['floofer', 'puppo', 'doggo', 'pupper', 'none']

conditions = [(archive_clean[stage[0]] == 1), (archive_clean[stage[1]] == 1),
              (archive_clean[stage[2]] == 1), (archive_clean[stage[3]] == 1),
              (archive_clean[stage[4]] == 1)]

archive_clean['stage'] = np.select(conditions, stage, default=stage[4])

archive_clean.drop(stage, axis=1, inplace=True)

archive_clean['stage'] = archive_clean.stage.astype('category')

# Source for algorithm:    https://code.i-harness.com/en/q/19c9fbcm/en/q/19c9fbc

### Test

In [None]:
archive_clean.stage.value_counts()

### 4. Define
 Change ['timestamp'] using pd.to_datetime

### Code

In [None]:
archive_clean['timestamp'] = pd.to_datetime(archive_clean['timestamp'])

### Test

In [None]:
archive_clean.timestamp.dtypes

### 5. Define
- Change tweet_id datatype in dataframe from int64 to object 

### Code

In [None]:
archive_clean['tweet_id'] = archive_clean['tweet_id'].astype(str)
predict_clean['tweet_id'] = archive_clean['tweet_id'].astype(str)
twitter_api_clean['tweet_id'] = archive_clean['tweet_id'].astype(str)

### Test

In [None]:
archive_clean.tweet_id.dtype

In [None]:
predict_clean.tweet_id.dtype

In [None]:
twitter_api_clean.tweet_id.dtype

### 6. Define
-  Creating new 'breed' and 'confidence' collumn

### Code

In [None]:
breed = []
confidence = []


def choose_breed(predict_clean):
    if predict_clean['p1_dog'] == True:
        breed.append(predict_clean['p1'])
        confidence.append(predict_clean['p1_conf'])
    else:
        breed.append('Unknown')
        confidence.append(0)


# function call
predict_clean.apply(choose_breed, axis=1)
# add the two columns
predict_clean['breed'] = breed
predict_clean['confidence'] = confidence
predict_clean.sample(1)

In [None]:
#fixing breed spelling
predict_clean['breed'] = predict_clean['breed'].str.replace('_', ' ')

In [None]:
# Converting all breeds to lower case
predict_clean['breed'] = predict_clean['breed'].str.lower()

In [None]:
predict_clean.breed.str.replace('Unknown', 'np.nan')

### Test

In [None]:
predict_clean.breed.value_counts()

### 8. Define
- Converting None to NaN



### Code 

In [None]:
archive_clean.replace(to_replace='None', value=np.nan, inplace=True)
archive_clean.replace(to_replace='Unknown', value=np.nan, inplace=True)
archive_clean.replace(to_replace='unknown', value=np.nan, inplace=True)

### Test

In [None]:
archive_clean[archive_clean == 'None']

# Final View

In [None]:
df_clean = twitter_api_clean.merge(archive_clean,on='tweet_id',how="right").merge(predict_clean,on='tweet_id',how="right")

In [None]:
df_clean.sample(3)

In [None]:
#saving to csv
df_clean.to_csv('twitter_archive_master.csv',index=False)

# Visuals  

In [None]:
def labels_and_title(x, y, title):
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)
    return labels_and_title


#Pick a palette
sns.color_palette("rocket", as_cmap=True)
# Generating a graphical plot for the data
sns.set(rc={'figure.figsize': (10, 8)})

# Question 1: What is WeRateDogs's posting trend overtime?

In [None]:
years = df_clean.timestamp.dt.year
ret_fav = df_clean.pivot_table(index=years, values=['favorite_count','retweet_count'], aggfunc='sum')
ret_fav
ret_fav.groupby(years).plot(kind='line')
plt.title('Retweeting and Favoriting trend over time')
plt.ylabel('Count')
plt.xlabel('(Year)')
plt.legend(('Retweets', 'Favorites'))
plt.savefig('ret_fav')

###  Regarding to activity overtime, we can cleary notice a drop from 2016 to 2017 with a diference of 42,09 % in `"favorite_count"`, and  a drop from 2016 to 2017 with a diference of 49,96 % in `"retweet_count"`.


## Question 2: Which breed of dogs got the highest  favorite counts?

In [None]:
color = (sns.color_palette("husl", 10))
sns.set(rc={'figure.figsize': (15, 4)})
sns.set(font_scale=1)
pivot_breed= df_clean.pivot_table(index=['breed'], values=['favorite_count'], aggfunc='sum').sort_values(by="favorite_count",ascending=False)

ax = sns.barplot(x='favorite_count', y=pivot_breed.index, data=pivot_breed, palette=color,order=df_clean.breed.value_counts().iloc[:10].index, )
plt.title('Highest favorite count per breed')
plt.ylabel('Favorite Count')
plt.xlabel('Breed')
plt.savefig('breed')
for i in ax.containers:
    ax.bar_label(i,)
    pd.set_option('float_format', '{:f}'.format)

In [None]:
pivot_stage

###   We can observe that "golden retriever" breed  with 1,257,407 likes, in second place we have the Labrador Retriever with 811848 likes and in the third place the "pembroke" with 601228. 

## Question 3: Which stage of dogs got the highest favorite counts?¶ 

In [None]:
pivot_stage = df_clean.pivot_table(index=['stage'], values=['favorite_count'], aggfunc='sum')
pivot_stage
color = (sns.color_palette("husl", 4))
sns.set(rc={'figure.figsize': (10, 5)})
sns.set(font_scale=1)
#f = df_clean.sort_values(by='favorite_count', ascending=False)
ax = sns.barplot(y=pivot_stage.index,
                 x='favorite_count',
                 data=pivot_stage,
                 palette=color,
                 order=["pupper","doggo", "floofer",  "puppo"],
                 ci=None)
plt.title('Highest favorite count per stage')
plt.ylabel('')
plt.xlabel('')
plt.savefig('stages')
for i in ax.containers:
    ax.bar_label(i,)

### Cleary the most common stage is `"pupper"` with a total of 2,052,302 of favorites, representing 84,19 % of all stages, excluding "nones".

In [None]:
#Showing Top Breed image
top_breed = df_clean[df_clean["breed"]=="golden retriever"]
url = top_breed.jpg_url.iloc[50]
response = requests.get(url)
Image.open(BytesIO(response.content))

In [None]:
import string

def substrings_in_string(big_string, substrings):
    matches = []
    for substring in substrings:
        if string.find(big_string, substring) != -1:
            matches.append(substring)
    if len(matches) == 0:
        print(big_string)
        return None
    else:
        return matches
