### Connecting Pandas to a Database

This notebook walks through how to use pandas to read from a database, query an API, and then add any new values returned by the API to that database.

Running this code on a regular basis will allow us to programmatically build up a unique dataset.

In [121]:
import os
from urllib.parse import quote

import pandas as pd
import numpy as np
# you will need to import this -- pip install SQLAlchemy
from sqlalchemy import create_engine, types
# you will need to import this -- pip install mysql-connector-python
import mysql.connector
import requests
# you will need to install this  -- pip install requests-oauthlib
from requests_oauthlib import OAuth1

### Step 1:  Fetching the Data From the Database

In [122]:
# we'll create a dictionary to store all of our database information.
# Connection details can be overridden with environment variables; the password
# has no in-notebook default so the secret is never stored in the file itself.
df_dict = {
    'connector': 'mysql+mysqlconnector',
    'username' : os.environ.get('DB_USERNAME', 'dat1019'),
    'password' : os.environ.get('DB_PASSWORD', ''),
    'server'   : os.environ.get('DB_SERVER', 'dat-10-19.cfvn8ddij95j.us-east-1.rds.amazonaws.com'),
    'port'     : os.environ.get('DB_PORT', '3306'),
    'database' : os.environ.get('DB_NAME', 'dat1019')
}

# point out the missing secret here instead of leaving it to surface at connect time
if not df_dict['password']:
    print("DB_PASSWORD is not set -- export it before connecting to the database")

In [123]:
# this string contains all the information needed to reach the database.
# The username and password are percent-encoded so that characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters when the string is parsed.
connection_string = (
    f"{df_dict['connector']}://"
    f"{quote(df_dict['username'], safe='')}:{quote(df_dict['password'], safe='')}"
    f"@{df_dict['server']}:{df_dict['port']}/{df_dict['database']}"
)

In [124]:
# display the assembled connection string to check it
# NOTE: the rendered output contains the database password in plain text --
# clear this cell's output before sharing the notebook
connection_string

'mysql+mysqlconnector://dat1019:dat1019password@dat-10-19.cfvn8ddij95j.us-east-1.rds.amazonaws.com:3306/dat1019'

In [125]:
# build the SQLAlchemy engine from the connection string;
# the cells below call engine.connect() on it whenever they need a database connection
engine = create_engine(connection_string)

In [126]:
# open a connection just long enough to load the whole `tweets` table into a dataframe
with engine.connect() as db_conn:
    tweets = pd.read_sql_query(sql="SELECT * FROM tweets", con=db_conn)

In [127]:
# preview the first five rows that came back from the database
tweets.head(n=5)

Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,117124,0,1323692146147340299
1,Donald J. Trump,realDonaldTrump,https://t.co/zX4bqgtWqH,10186,47177,1323692020880297986
2,Donald J. Trump,realDonaldTrump,https://t.co/SMaOF79kPV,10806,44040,1323691984989622272
3,Donald J. Trump,realDonaldTrump,RT @BarstoolNewsN: The Amish are not playing a...,17887,0,1323684790894297089
4,Donald J. Trump,realDonaldTrump,"A parade for me in Nigeria, a great honor! htt...",39398,199959,1323680963310866435


In [128]:
# store the tweet ids as strings -- they are compared against the API's string ids later on
tweets['id'] = tweets['id'].astype('str')

### Step 2:  Getting API Data

In [133]:
# authorization information for the Twitter API.
# The four secrets are read from environment variables so they are not stored in the notebook;
# a missing variable raises KeyError naming the variable that still needs to be set.
tokens = OAuth1(
    client_key=os.environ['TWITTER_CONSUMER_KEY'],
    client_secret=os.environ['TWITTER_CONSUMER_SECRET'],
    resource_owner_key=os.environ['TWITTER_ACCESS_TOKEN'],
    resource_owner_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'],
)

In [134]:
# timeline endpoint plus its query string: which account to read and how many tweets to request
base_url = (
    'https://api.twitter.com/1.1/statuses/user_timeline.json'
    '?screen_name=realDonaldTrump&count=200'
)

In [135]:
# request the timeline; a successful call returns a list of dictionaries, one per tweet.
# `timeout` keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error payload
# flow into the dataframe-building step, where it would fail in a confusing way.
response = requests.get(base_url, auth=tokens, timeout=30)
response.raise_for_status()
tweet_results = response.json()

In [136]:
# here's the parsed response -- a list with one dictionary per tweet
# (the display is long: each tweet carries nested metadata such as 'entities')
tweet_results

[{'created_at': 'Tue Nov 03 19:51:40 +0000 2020',
  'id': 1323714481353068546,
  'id_str': '1323714481353068546',
  'text': 'RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps://t.co/85ySh1KYkh',
  'truncated': False,
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [{'screen_name': 'realDonaldTrump',
     'name': 'Donald J. Trump',
     'id': 25073877,
     'id_str': '25073877',
     'indices': [3, 19]}],
   'urls': [],
   'media': [{'id': 1323298431088107520,
     'id_str': '1323298431088107520',
     'indices': [39, 62],
     'media_url': 'http://pbs.twimg.com/amplify_video_thumb/1323298431088107520/img/XSsqukn6q0bMx8Dj.jpg',
     'media_url_https': 'https://pbs.twimg.com/amplify_video_thumb/1323298431088107520/img/XSsqukn6q0bMx8Dj.jpg',
     'url': 'https://t.co/85ySh1KYkh',
     'display_url': 'pic.twitter.com/85ySh1KYkh',
     'expanded_url': 'https://twitter.com/som3thingwicked/status/1323300764853350401/video/1',
     'type': 'photo',
     'sizes': {'thumb': {'w'

In [138]:
# flatten the raw API payload into one row per tweet, keeping the same columns as the `tweets` table
tweet_columns = ['name', 'screen_name', 'text', 'retweets', 'favorites', 'id']
tweet_rows = [
    (
        tweet['user']['name'],
        tweet['user']['screen_name'],
        tweet['text'],
        tweet['retweet_count'],
        tweet['favorite_count'],
        tweet['id_str'],
    )
    for tweet in tweet_results
]
results = pd.DataFrame(tweet_rows, columns=tweet_columns)

print(f"API call brought in {results.shape[0]} new tweets")
results.head()

API call brought in 60 new tweets


Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,121165,0,1323714481353068546
1,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,14850,0,1323714393184653323
2,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSgh2KPc h...,11699,0,1323714331922583556
3,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,11580,0,1323714155967373312
4,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/voxNnIYMpe,8901,0,1323714095225384962


### Step 3:  Checking For New Values

In [139]:
# check which API tweets are already stored in the database
# a left merge keeps every row of `results` (the left table) and attaches the matching `tweets` row, if any
# indicator=True adds a `_merge` column: 'left_only' = only in `results`, 'both' = present in both tables
pd.merge(results, tweets, on='id', how='left', indicator=True)

Unnamed: 0,name_x,screen_name_x,text_x,retweets_x,favorites_x,id,name_y,screen_name_y,text_y,retweets_y,favorites_y,_merge
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,121165,0,1323714481353068546,,,,,,left_only
1,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,14850,0,1323714393184653323,,,,,,left_only
2,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSgh2KPc h...,11699,0,1323714331922583556,,,,,,left_only
3,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,11580,0,1323714155967373312,,,,,,left_only
4,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/voxNnIYMpe,8901,0,1323714095225384962,,,,,,left_only
5,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,13069,0,1323713956280754182,,,,,,left_only
6,Donald J. Trump,realDonaldTrump,https://t.co/zX4bqgtWqH,12858,60294,1323692020880297986,Donald J. Trump,realDonaldTrump,https://t.co/zX4bqgtWqH,10186.0,47177.0,both
7,Donald J. Trump,realDonaldTrump,https://t.co/SMaOF79kPV,14128,58980,1323691984989622272,Donald J. Trump,realDonaldTrump,https://t.co/SMaOF79kPV,10806.0,44040.0,both
8,Donald J. Trump,realDonaldTrump,RT @BarstoolNewsN: The Amish are not playing a...,19878,0,1323684790894297089,Donald J. Trump,realDonaldTrump,RT @BarstoolNewsN: The Amish are not playing a...,17887.0,0.0,both
9,Donald J. Trump,realDonaldTrump,"A parade for me in Nigeria, a great honor! htt...",46197,237545,1323680963310866435,Donald J. Trump,realDonaldTrump,"A parade for me in Nigeria, a great honor! htt...",39398.0,199959.0,both


In [140]:
# keep the merged frame around so the following cells can filter on its `_merge` column
merged_df = pd.merge(results, tweets, on='id', how='left', indicator=True)

In [141]:
# rows flagged 'left_only' came from the API but have no match in the database -- these are new
new_rows = merged_df.loc[merged_df['_merge'] == 'left_only']
print(f"Found {new_rows.shape[0]} new tweets not currently in the database")
new_rows

Found 6 new tweets not currently in the database


Unnamed: 0,name_x,screen_name_x,text_x,retweets_x,favorites_x,id,name_y,screen_name_y,text_y,retweets_y,favorites_y,_merge
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,121165,0,1323714481353068546,,,,,,left_only
1,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,14850,0,1323714393184653323,,,,,,left_only
2,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSgh2KPc h...,11699,0,1323714331922583556,,,,,,left_only
3,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,11580,0,1323714155967373312,,,,,,left_only
4,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/voxNnIYMpe,8901,0,1323714095225384962,,,,,,left_only
5,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,13069,0,1323713956280754182,,,,,,left_only


In [142]:
# index labels of the merged rows that have no match in the database
merged_df.loc[merged_df['_merge'].eq('left_only')].index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [143]:
# row labels of `results`, for comparison with the index values selected above
results.index

RangeIndex(start=0, stop=60, step=1)

In [144]:
# row positions (within `results`) of the tweets whose id is not yet stored in the database.
# These are computed from `results` itself rather than taken from merged_df's index: a merge
# emits one row per matching database row, so duplicate ids in `tweets` would shift merged_df's
# labels away from the row positions of `results` and select the wrong tweets.
idx = np.flatnonzero(~results['id'].isin(tweets['id']))
# and these are our new tweets
results.iloc[idx]

Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,121165,0,1323714481353068546
1,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,14850,0,1323714393184653323
2,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSgh2KPc h...,11699,0,1323714331922583556
3,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,11580,0,1323714155967373312
4,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/voxNnIYMpe,8901,0,1323714095225384962
5,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: https://t.co/gsFSghkmdM h...,13069,0,1323713956280754182


In [145]:
# column types handed to to_sql -- not necessary, but a useful step if you are doing this the first time
text_columns = ('name', 'screen_name', 'text', 'id')
dtypes = {column: types.String(length=65535) for column in text_columns}
dtypes.update(retweets=types.Integer(), favorites=types.Integer())

# connect to the database and append the new tweets to the `tweets` table
with engine.connect() as db_conn:
    print("Adding values to database")
    try:
        new_tweets = results.iloc[idx]
        new_tweets.to_sql(name='tweets', con=db_conn, if_exists='append', index=False, dtype=dtypes)
    except Exception as err:
        # report the failure without stopping the notebook
        print("Could not add results to the database.", err)
    else:
        print("Successful")

Adding values to database
Successful
