# Data Wrangling Template

## Gather

In [None]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import requests
import matplotlib.pyplot as plt
import warnings

In [None]:
#Loading the downloaded file in a dataframe
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [None]:
twitter_archive.sample(3)

In [None]:
#Use requests library to download tsv file from a website
url="https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply so an HTTP error page is never written to
# disk and then parsed as if it were the TSV.
response.raise_for_status()

with open('image_predictions.tsv', 'wb') as file:
    file.write(response.content)

image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')

In [None]:
image_predictions.sample(3)

In [None]:
#Token key setup for twitter API
# Credentials are read from environment variables so that no secret is stored
# in the notebook. Keys that were previously committed here should be treated
# as leaked and regenerated. A missing variable yields an empty string.
import os

ckey = os.environ.get('TWITTER_CONSUMER_KEY', '')
csecret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
atoken = os.environ.get('TWITTER_ACCESS_TOKEN', '')
asecret = os.environ.get('TWITTER_ACCESS_SECRET', '')

In [None]:
#Authentications for twitter API
# Build the OAuth handler from the consumer key/secret, attach the access
# token pair, then create the `api` client that the download cell calls.
auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
# NOTE(review): wait_on_rate_limit_notify looks like a tweepy 3.x-only
# keyword - confirm against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [None]:
#For loop to add each tweet to json file
# Tweet ids that could not be fetched, mapped to the API error raised for
# them, so gaps in the download stay visible instead of being dropped silently.
failed_tweets = {}

# Mode 'w' (not 'a'): re-running the cell rewrites the file instead of
# appending a second copy of every tweet.
with open('tweet_json.txt', 'w', encoding='utf8') as f:
    for tweet_id in twitter_archive['tweet_id']:
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        # NOTE(review): tweepy.TweepError is the tweepy 3.x error class, chosen
        # to match the 3.x-style arguments used to build `api`; confirm the
        # installed version. Catching only API errors lets KeyboardInterrupt
        # and programming errors surface.
        except tweepy.TweepError as err:
            failed_tweets[tweet_id] = err
            continue
        # One JSON document per line.
        json.dump(tweet._json, f)
        f.write('\n')

In [None]:
#For loop to append each tweet in a list
tweets_data = []

# `with` closes the file even if parsing fails part-way through.
with open('tweet_json.txt', 'r', encoding='utf8') as tweet_file:
    for line in tweet_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or truncated lines only; any other error should surface.
            continue

In [None]:
# Create tweet_info DataFrame
tweet_info = pd.DataFrame()

In [None]:
# Copy the id and the two engagement counts of every parsed tweet into
# tweet_info, one column per field.
for field in ('id', 'retweet_count', 'favorite_count'):
    tweet_info[field] = [record[field] for record in tweets_data]

In [None]:
tweet_info.sample(3)

Save each DataFrame to its own tab-separated file

In [None]:
twitter_archive.to_csv('twitter_archive.csv', sep='\t', encoding='utf-8')

In [None]:
tweet_info.to_csv('twitter_info.csv', sep='\t', encoding='utf-8')

In [None]:
image_predictions.to_csv('image_predictions.csv', sep='\t', encoding='utf-8')

## Assess

### Loading all 3 files

In [None]:
# The files were saved with the DataFrame index as their first column, so read
# it back as the index; otherwise it shows up as an extra 'Unnamed: 0' column.
twitter_archive = pd.read_csv('twitter_archive.csv', sep='\t', index_col=0)
image_predictions = pd.read_csv('image_predictions.csv', sep='\t', index_col=0)
twitter_info = pd.read_csv('twitter_info.csv', sep='\t', index_col=0)

In [None]:
twitter_archive.info()

In [None]:
twitter_archive.describe()

In [None]:
twitter_archive.sample(3)

In [None]:
twitter_archive

In [None]:
image_predictions.info()

In [None]:
image_predictions.describe()

In [None]:
image_predictions

In [None]:
twitter_info.info()

In [None]:
twitter_info.describe()

In [None]:
twitter_info

In [None]:
twitter_archive['rating_denominator'].unique()

In [None]:
twitter_archive.source.value_counts()

In [None]:
image_predictions['jpg_url'].value_counts()

In [None]:
twitter_info['id'].unique()

In [None]:
# Show the tweet ids that occur more than once.
id_counts = twitter_info['id'].value_counts()
id_counts[id_counts > 1]

In [None]:
twitter_archive[twitter_archive['name'].str.islower()]['name'].value_counts()

## Issues:

#### Tidiness

- Three separate dataframes
- Dog "stage" variable is spread across four columns: doggo, floofer, pupper, puppo
- Show entire text of twitter_archive.text

#### Quality Issues
- Name of Dog is None
- Remove any retweets
- Records with no images
- Some dog names are ordinary words such as "a", "an", "the", etc.
- Some records have 0 rating in the denominator
- Some rating in the denominator have values less than or more than 10. 
- Incorrect Numerator rating for some records.
- Variables (timestamp and retweeted_status_timestamp) have the object datatype; they should be datetime
- tweet_id is stored as int; it should be a string


# Clean

In [None]:
archive_clean = pd.read_csv('twitter_archive.csv', sep = '\t', index_col=0)
image_clean = pd.read_csv('image_predictions.csv', sep = '\t', index_col=0)
info_clean = pd.read_csv('twitter_info.csv', sep = '\t', index_col=0)

## Issue 1:
### Define
Show entire text in archive_clean

### Code

In [None]:
# None means "no truncation"; the old -1 sentinel has been deprecated since
# pandas 1.0.
pd.set_option('display.max_colwidth', None)

### Test

In [None]:
archive_clean.sample(2)

## Issue 2:
### Define
Merging all 3 dataframes into the archive_clean dataframe
### Code

In [None]:
# Inner-join the archive with the API counts: the key is `tweet_id` on the
# archive side and `id` on the counts side.
archive_clean = archive_clean.merge(
    info_clean,
    how='inner',
    left_on='tweet_id',
    right_on='id',
)

In [None]:
# Inner-join with the image predictions on the shared `tweet_id` column.
archive_clean = archive_clean.merge(image_clean, how='inner', on='tweet_id')

In [None]:
archive_clean = archive_clean.drop(['id'], axis = 1)

### Test

In [None]:
archive_clean.info()

## Issue 3:
### Define
Making 1 column to store dog stages
### Code

In [None]:
# Capture the first stage keyword found in the tweet text; rows with no
# keyword get NaN.
stage_pattern = '(puppo|pupper|floofer|doggo)'
tweet_text = archive_clean['text']
archive_clean['Stage'] = tweet_text.str.extract(stage_pattern, expand=True)

In [None]:
# dropping extra stage columns
archive_clean = archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis = 1)

### Test

In [None]:
archive_clean.info()

In [None]:
archive_clean.sample(3)

## Issue 4:
### Define
Discarding any retweets, we don't need any retweets
### Code

In [None]:
# A row is an original tweet when it carries no retweeted_status_id.
is_original = archive_clean['retweeted_status_id'].isna()
archive_clean = archive_clean[is_original]

### Test

In [None]:
archive_clean.info()

## Issue 5:
### Define
Removing unnecessary retweeted columns
### Code

In [None]:
archive_clean = archive_clean.drop(['retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], axis = 1)

### Test

In [None]:
archive_clean.info()

## Issue 6:
### Define
Remove rows with no images
### Code

In [None]:
archive_clean = archive_clean.dropna(subset=['expanded_urls'])

### Test

In [None]:
archive_clean.info()

## Issue 7:
### Define
Changing the name of dogs from None to NAN
### Code

In [None]:
# Turn the literal string 'None' into a real missing value.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
archive_clean['name'] = archive_clean['name'].replace('None', np.nan)

### Test

In [None]:
archive_clean[archive_clean['name'].isnull()]['name']

## Issue 8:
### Define
Fixing dog names with 'a' , 'an', 'my', 'his'
### Code

In [None]:
#Replacing an
# .loc writes to the frame itself; the chained ['name'][label] form may assign
# to a temporary copy and leave archive_clean unchanged.
archive_clean.loc[1917, 'name'] = 'Berta'

In [None]:
#Replacing my
# .loc writes to the frame itself; the chained ['name'][label] form may assign
# to a temporary copy and leave archive_clean unchanged.
archive_clean.loc[686, 'name'] = 'Zoey'

In [None]:
#replacing a
# Row label -> corrected dog name for rows whose name was recorded as 'a'.
name_fixes = {
    1752: 'Jacob',
    1781: 'Rufus',
    1831: 'Spork',
    1840: 'Cherokee',
    1843: 'Hemry',
    1861: 'Alfred',
    1875: 'Alfredo',
    1905: 'Leorio',
    1930: 'Chuk',
    1947: 'Alfonso',
    1961: 'Cheryl',
    1967: 'Jessiga',
    1976: 'Klint',
    1985: 'Kohl',
    1999: 'Daryl',
    2015: 'Pepe',
    2022: 'Octaviath',
    2025: 'Johm',
}

# A single .loc assignment writes to the frame itself (the chained
# ['name'][label] form may assign to a temporary copy) and raises KeyError if
# any listed row label is missing, rather than silently doing nothing.
archive_clean.loc[list(name_fixes), 'name'] = list(name_fixes.values())

In [None]:
# Replacing his
# .loc writes to the frame itself; the chained ['name'][label] form may assign
# to a temporary copy and leave archive_clean unchanged.
archive_clean.loc[810, 'name'] = 'Quizno'

### Test

In [None]:
archive_clean[archive_clean['name'].notnull()]['name'].sort_values()

## Issue 9:
### Define
Inconsistent numerator and denominator
### Code

In [None]:
archive_clean.rating_denominator.value_counts()

In [None]:
archive_clean[archive_clean['rating_denominator'] == 2]['text']

In [None]:
# Set both rating parts for row 2046 in one .loc assignment; chained
# ['col'][label] assignment may write to a temporary copy.
archive_clean.loc[2046, ['rating_numerator', 'rating_denominator']] = [9, 10]

In [None]:
archive_clean[archive_clean['rating_denominator'] == 11]['text']

In [None]:
# Corrected (numerator, denominator) pairs, written with .loc so the change
# reaches the frame itself; chained ['col'][label] assignment may write to a
# temporary copy.
rating_columns = ['rating_numerator', 'rating_denominator']
archive_clean.loc[872, rating_columns] = [14, 10]
archive_clean.loc[1401, rating_columns] = [10, 10]

### Issue 10:
### Define
Convert timestamp from object to datetime and tweet_id from int to str (retweeted_status_timestamp was already dropped in Issue 5)
### Code

In [None]:
# Parse the timestamp strings into datetimes and store tweet ids as strings.
archive_clean['timestamp'] = pd.to_datetime(archive_clean['timestamp'])
archive_clean['tweet_id'] = archive_clean['tweet_id'].astype(str)

### Test

In [None]:
archive_clean.info()

### Issue 11:
### Define
Converting Stage variable from object to categorical datatype
### Code

In [None]:
# Store the dog stage as a categorical column.
archive_clean['Stage'] = archive_clean['Stage'].astype('category')

### Test

In [None]:
archive_clean.info()

## Issue 12:
### Define
Replace all meaningless names with NaN
### Code

In [None]:
# Display-only: shows the names lower-cased. The result is not assigned, so
# archive_clean itself is left unchanged.
archive_clean['name'].str.lower()

In [None]:
# Ordinary words that were recorded as dog names.
meaningless_names = ['a', 'an', 'the', 'just', 'one', 'very', 'quite', 'not', 'actually', 
             'mad', 'space', 'infuriating', 'all', 'officially', '0', 'old', 'life',
             'unacceptable', 'my', 'incredibly', 'by', 'his', 'such']

# One vectorised .loc assignment replaces the per-row loop. The old chained
# form (archive_clean.name[mask] = ...) may assign to a temporary copy, and
# np.NaN was removed in NumPy 2.0 in favour of np.nan.
is_meaningless = archive_clean['name'].isin(meaningless_names)
archive_clean.loc[is_meaningless, 'name'] = np.nan

### Test

In [None]:
# Print every name that is still one of the ordinary words; no output means
# the cleanup worked.
meaningless_names = ['a', 'an', 'the', 'just', 'one', 'very', 'quite', 'not', 'actually', 
             'mad', 'space', 'infuriating', 'all', 'officially', '0', 'old', 'life',
             'unacceptable', 'my', 'incredibly', 'by', 'his', 'such']

all_names = archive_clean['name']
leftovers = all_names[all_names.isin(meaningless_names)]
for leftover in leftovers:
    print(leftover)

In [None]:
archive_clean.info()

## Issue 13:
### Define
Creating a new variable "Rating" by taking the ratio of the numerator and denominator ratings
### Code

In [None]:
# Rating is the numerator divided by the denominator, row by row.
numerators = archive_clean['rating_numerator']
denominators = archive_clean['rating_denominator']
archive_clean['Rating'] = numerators / denominators

### Test

In [None]:
archive_clean.info()

## Issue 14:
### Define
Combine the three breed predictions (p1, p2, p3) into one column, keeping the first prediction that is flagged as a dog
### Code

In [None]:
# For each row take the first prediction, in the order p1, p2, p3, whose
# *_dog flag is true; rows where no prediction is a dog get NaN.
Dog_breeds = []
prediction_columns = zip(
    archive_clean['p1'], archive_clean['p1_dog'],
    archive_clean['p2'], archive_clean['p2_dog'],
    archive_clean['p3'], archive_clean['p3_dog'],
)
for n1, d1, n2, d2, n3, d3 in prediction_columns:
    if d1:
        Dog_breeds.append(n1)
    elif d2:
        Dog_breeds.append(n2)
    elif d3:
        Dog_breeds.append(n3)
    else:
        # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
        Dog_breeds.append(np.nan)

# Attach the result to the frame. The list used to be built and then dropped,
# so the breed was lost once the p1/p2/p3 columns were removed further down.
archive_clean['Dog_breed'] = Dog_breeds

### Test

In [None]:
# Drop the raw prediction columns (name, confidence and is-dog flag for each
# of the three predictions).
prediction_columns = [
    'p1', 'p1_conf', 'p1_dog',
    'p2', 'p2_conf', 'p2_dog',
    'p3', 'p3_conf', 'p3_dog',
]
archive_clean = archive_clean.drop(columns=prediction_columns)

In [None]:
archive_clean.info()

# Store

In [107]:
archive_clean.to_csv('twitter_archive_master.csv', sep='\t', encoding='utf-8')

# Analyze & Visualize

In [108]:
np.set_printoptions(suppress=True)
np.sort(archive_clean.Rating.unique())

array([  0.        ,   0.1       ,   0.2       ,   0.3       ,
         0.4       ,   0.5       ,   0.6       ,   0.7       ,
         0.8       ,   0.9       ,   1.        ,   1.1       ,
         1.2       ,   1.3       ,   1.4       ,   2.6       ,
         2.7       ,   3.42857143,   7.5       ,  42.        ,
       177.6       ])

# I have shown  3 visualizations and Analysis via a Tableau Story