### Connecting Pandas to a Database

This notebook walks through how to use pandas to read a table from a database, query an API for fresh records, and append any records the database does not already contain.

Running this notebook on a recurring schedule lets us programmatically build up a unique data set over time.

In [19]:
import os
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
# you will need to import this -- pip install SQLAlchemy
from sqlalchemy import create_engine, types
# you will need to import this -- pip install mysql-connector-python
import mysql.connector
import requests
# you will need to install this  -- pip install requests-oauthlib
from requests_oauthlib import OAuth1


### Step 1:  Fetching the Data From the Database

In [11]:
# Database connection settings, stored in one dictionary.
# Every value can be overridden with an environment variable so that secrets
# never have to be written into the notebook. The password deliberately has no
# in-notebook default: set DAT_DB_PASSWORD before starting the kernel
# (without it the password is an empty string and the login will presumably be rejected).
df_dict = {
    'connector': os.environ.get('DAT_DB_CONNECTOR', 'mysql+mysqlconnector'),
    'username' : os.environ.get('DAT_DB_USERNAME', 'dat1019'),
    'password' : os.environ.get('DAT_DB_PASSWORD', ''),
    'server'   : os.environ.get('DAT_DB_SERVER', 'dat-10-19.cfvn8ddij95j.us-east-1.rds.amazonaws.com'),
    'port'     : os.environ.get('DAT_DB_PORT', '3306'),
    'database' : os.environ.get('DAT_DB_DATABASE', 'dat1019'),
}

In [6]:
# Assemble the SQLAlchemy database URL: connector://username:password@server:port/database
# The username and password are percent-encoded so that characters such as
# '@', ':' or '/' inside a credential cannot be mistaken for URL delimiters.
connection_string = (
    f"{df_dict['connector']}://"
    f"{quote_plus(df_dict['username'])}:{quote_plus(df_dict['password'])}"
    f"@{df_dict['server']}:{df_dict['port']}/{df_dict['database']}"
)

In [7]:
# inspect the assembled URL -- note that it embeds the database password,
# so clear this cell's output before sharing or committing the notebook
connection_string

'mysql+mysqlconnector://dat1019:dat1019password@dat-10-19.cfvn8ddij95j.us-east-1.rds.amazonaws.com:3306/dat1019'

In [20]:
# build a SQLAlchemy engine from the connection string; the cells that read from
# and write to the database open their connections through this object
engine = create_engine(connection_string)

In [22]:
# open a connection just long enough to load the whole `tweets` table into a DataFrame
select_all_tweets = "SELECT * FROM tweets"
with engine.connect() as connection:
    tweets = pd.read_sql_query(select_all_tweets, con=connection)

In [23]:
# preview the first rows of what was loaded from the database
tweets.head()

Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @realDonaldTrump: VOTE! VOTE! VOTE!\nhttps:...,117124,0,1323692146147340299
1,Donald J. Trump,realDonaldTrump,https://t.co/zX4bqgtWqH,10186,47177,1323692020880297986
2,Donald J. Trump,realDonaldTrump,https://t.co/SMaOF79kPV,10806,44040,1323691984989622272
3,Donald J. Trump,realDonaldTrump,RT @BarstoolNewsN: The Amish are not playing a...,17887,0,1323684790894297089
4,Donald J. Trump,realDonaldTrump,"A parade for me in Nigeria, a great honor! htt...",39398,199959,1323680963310866435


In [24]:
# cast the id column to str so it has the same type as the `id_str` values
# the API results use for `id` -- the two tables are merged on that column later
tweets['id'] = tweets['id'].astype(str)

### Step 2:  Getting API Data

In [25]:
# OAuth1 authorization for the Twitter API.
# The secrets are read from the environment instead of being hard-coded in the
# notebook. Positional order: consumer key, consumer secret, access token,
# access token secret.
# A missing variable raises KeyError naming that variable, which is easier to
# diagnose than a rejected request further down.
tokens = OAuth1(
    os.environ['TWITTER_API_KEY'],
    os.environ['TWITTER_API_SECRET'],
    os.environ['TWITTER_ACCESS_TOKEN'],
    os.environ['TWITTER_ACCESS_TOKEN_SECRET'],
)

In [26]:
# user-timeline endpoint we'll call; the query string selects the
# realDonaldTrump account and asks for count=200
base_url = (
    'https://api.twitter.com/1.1/statuses/user_timeline.json'
    '?screen_name=realDonaldTrump&count=200'
)

In [27]:
# call the endpoint and decode the JSON body -- a list with one dictionary per tweet.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error response instead of letting an
# error payload flow into the cells below, where it would fail far less clearly.
response = requests.get(base_url, auth=tokens, timeout=30)
response.raise_for_status()
tweet_results = response.json()

In [28]:
# here's the resulting list -- one dictionary per tweet
tweet_results

[{'created_at': 'Wed Nov 04 23:27:55 +0000 2020',
  'id': 1324131291135168512,
  'id_str': '1324131291135168512',
  'text': 'RT @GOPLeader: The Republican coalition is bigger, more diverse, and more energetic than ever before—thanks to President @realDonaldTrump.…',
  'truncated': False,
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [{'screen_name': 'GOPLeader',
     'name': 'Kevin McCarthy',
     'id': 19739126,
     'id_str': '19739126',
     'indices': [3, 13]},
    {'screen_name': 'realDonaldTrump',
     'name': 'Donald J. Trump',
     'id': 25073877,
     'id_str': '25073877',
     'indices': [121, 137]}],
   'urls': []},
  'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 25073877,
   'id_str': '25073877',
   'name': 'Donald J.

In [29]:
# map each output column to a function that pulls its value out of one tweet dictionary
extractors = {
    'name': lambda tweet: tweet['user']['name'],
    'screen_name': lambda tweet: tweet['user']['screen_name'],
    'text': lambda tweet: tweet['text'],
    'retweets': lambda tweet: tweet['retweet_count'],
    'favorites': lambda tweet: tweet['favorite_count'],
    'id': lambda tweet: tweet['id_str'],
}

# build one list per column, then assemble the DataFrame from those lists
results = pd.DataFrame({
    column: [extract(tweet) for tweet in tweet_results]
    for column, extract in extractors.items()
})

print(f"API call brought in {results.shape[0]} new tweets")
results.head()

API call brought in 21 new tweets


Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @GOPLeader: The Republican coalition is big...,3250,0,1324131291135168512
1,Donald J. Trump,realDonaldTrump,Our lawyers have asked for “meaningful access”...,24048,114338,1324130551234727936
2,Donald J. Trump,realDonaldTrump,.....there was a large number of secretly dump...,0,0,1324108206801563650
3,Donald J. Trump,realDonaldTrump,"We have claimed, for Electoral Vote purposes, ...",49889,207707,1324108200141082624
4,Donald J. Trump,realDonaldTrump,"We are winning Pennsylvania big, but the PA Se...",52149,292330,1324061986779504642


### Step 3:  Checking For New Values

In [30]:
# this does a check for new values
# how='left' keeps every row of `results` (the left table) and attaches the matching `tweets` rows by id
# indicator=True adds a `_merge` column telling whether a row was found in the left table only or in both
results.merge(tweets, on='id', how='left', indicator=True)

Unnamed: 0,name_x,screen_name_x,text_x,retweets_x,favorites_x,id,name_y,screen_name_y,text_y,retweets_y,favorites_y,_merge
0,Donald J. Trump,realDonaldTrump,RT @GOPLeader: The Republican coalition is big...,3250,0,1324131291135168512,,,,,,left_only
1,Donald J. Trump,realDonaldTrump,Our lawyers have asked for “meaningful access”...,24048,114338,1324130551234727936,,,,,,left_only
2,Donald J. Trump,realDonaldTrump,.....there was a large number of secretly dump...,0,0,1324108206801563650,Donald J. Trump,realDonaldTrump,.....there was a large number of secretly dump...,0.0,0.0,both
3,Donald J. Trump,realDonaldTrump,.....there was a large number of secretly dump...,0,0,1324108206801563650,Donald J. Trump,realDonaldTrump,.....there was a large number of secretly dump...,0.0,0.0,both
4,Donald J. Trump,realDonaldTrump,"We have claimed, for Electoral Vote purposes, ...",49889,207707,1324108200141082624,Donald J. Trump,realDonaldTrump,"We have claimed, for Electoral Vote purposes, ...",46023.0,187590.0,both
5,Donald J. Trump,realDonaldTrump,"We have claimed, for Electoral Vote purposes, ...",49889,207707,1324108200141082624,Donald J. Trump,realDonaldTrump,"We have claimed, for Electoral Vote purposes, ...",46019.0,187548.0,both
6,Donald J. Trump,realDonaldTrump,"We are winning Pennsylvania big, but the PA Se...",52149,292330,1324061986779504642,Donald J. Trump,realDonaldTrump,"We are winning Pennsylvania big, but the PA Se...",51333.0,286551.0,both
7,Donald J. Trump,realDonaldTrump,"We are winning Pennsylvania big, but the PA Se...",52149,292330,1324061986779504642,Donald J. Trump,realDonaldTrump,"We are winning Pennsylvania big, but the PA Se...",51336.0,286565.0,both
8,Donald J. Trump,realDonaldTrump,Wow! It looks like Michigan has now found the ...,62856,316627,1324059767581671425,Donald J. Trump,realDonaldTrump,Wow! It looks like Michigan has now found the ...,61614.0,309044.0,both
9,Donald J. Trump,realDonaldTrump,Wow! It looks like Michigan has now found the ...,62856,316627,1324059767581671425,Donald J. Trump,realDonaldTrump,Wow! It looks like Michigan has now found the ...,61619.0,309059.0,both


In [31]:
# keep the merge result in a variable so the following cells can filter it;
# an id stored more than once in `tweets` produces one merged row per stored copy,
# so this frame can have more rows than `results`
merged_df = results.merge(tweets, on='id', how='left', indicator=True)

In [32]:
# rows flagged left_only came back from the API but matched nothing in the database -- these are new
is_new = merged_df['_merge'] == 'left_only'
new_rows = merged_df[is_new]
print(f"Found {new_rows.shape[0]} new tweets not currently in the database")
new_rows

Found 2 new tweets not currently in the database


Unnamed: 0,name_x,screen_name_x,text_x,retweets_x,favorites_x,id,name_y,screen_name_y,text_y,retweets_y,favorites_y,_merge
0,Donald J. Trump,realDonaldTrump,RT @GOPLeader: The Republican coalition is big...,3250,0,1324131291135168512,,,,,,left_only
1,Donald J. Trump,realDonaldTrump,Our lawyers have asked for “meaningful access”...,24048,114338,1324130551234727936,,,,,,left_only


In [33]:
# row labels of the left_only rows -- these identify rows of merged_df
merged_df[merged_df._merge == 'left_only'].index

Int64Index([0, 1], dtype='int64')

In [34]:
# for comparison: the index of the original API results
results.index

RangeIndex(start=0, stop=21, step=1)

In [35]:
# ids that came back from the API but matched nothing in the database
new_ids = merged_df.loc[merged_df['_merge'] == 'left_only', 'id']
# Locate those tweets by id inside `results` and keep their row positions.
# merged_df's own row numbers are deliberately not reused: the left merge emits
# one row per matching database row, so as soon as an id is stored twice in
# `tweets` the merged rows no longer line up with the rows of `results`.
idx = np.flatnonzero(results['id'].isin(new_ids).to_numpy())
# and these are our new tweets
results.iloc[idx]

Unnamed: 0,name,screen_name,text,retweets,favorites,id
0,Donald J. Trump,realDonaldTrump,RT @GOPLeader: The Republican coalition is big...,3250,0,1324131291135168512
1,Donald J. Trump,realDonaldTrump,Our lawyers have asked for “meaningful access”...,24048,114338,1324130551234727936


In [36]:
# and now we'll add the new rows to the database

# declare data types for the db -- not necessary but a useful step if you are doing this the first time
dtypes = {
    'name': types.String(length=65535),
    'screen_name': types.String(length=65535),
    'text': types.String(length=65535),
    'retweets': types.Integer(),
    'favorites': types.Integer(),
    'id': types.String(length=65535)
}
print("Adding values to database")
try:
    # engine.begin() runs the insert inside a transaction: it is committed when
    # the block finishes cleanly and rolled back if to_sql raises, so the append
    # is never left half-written or uncommitted.
    with engine.begin() as connection:
        results.iloc[idx].to_sql('tweets', con=connection, index=False, if_exists='append', dtype=dtypes)
    # reported only after the transaction block has exited without an error
    print("Successful")
except Exception as e:
    # best effort: report the failure (including a failed connection) and let the notebook continue
    print("Could not add results to the database.", e)

Adding values to database
Successful
