# Data Sourcing With Pandas

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib
import numpy as np
import pandas as pd

## Reading CSVs - The Easy Way

In [3]:
!tree

[01;34m.[00m
├── Data\ Sourcing\ PT.ipynb
└── [01;34mdata[00m
    ├── soccer.sqlite
    └── spotify_2017.csv

1 directory, 3 files


In [4]:
# Load the 2017 Spotify tracks CSV from the local data/ folder into a DataFrame
tracks_df = pd.read_csv('data/spotify_2017.csv')

In [5]:
# Preview the first rows to sanity-check what was loaded
tracks_df.head()

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7qiZfU4dY1lWllzX7mPBI,Shape of You,Ed Sheeran,0.825,0.652,1.0,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0,4.0
1,5CtI0qwDJkDQGwXD1H1cL,Despacito - Remix,Luis Fonsi,0.694,0.815,2.0,-4.328,1.0,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0,4.0
2,4aWmUDTfIPGksMNLV2rQP,Despacito (Featuring Daddy Yankee),Luis Fonsi,0.66,0.786,2.0,-4.757,1.0,0.17,0.209,0.0,0.112,0.846,177.833,228200.0,4.0
3,6RUKPb4LETWmmr3iAEQkt,Something Just Like This,The Chainsmokers,0.617,0.635,11.0,-6.769,0.0,0.0317,0.0498,1.4e-05,0.164,0.446,103.019,247160.0,4.0
4,3DXncPQOG4VBw3QHh3S81,I'm the One,DJ Khaled,0.609,0.668,7.0,-4.284,1.0,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0,4.0


In [6]:
# Saving a csv is just as easy.
# index=False keeps the row index out of the file; by default it is written as
# an extra unnamed first column that comes back as "Unnamed: 0" on the next read_csv.
tracks_df.to_csv('data/new_spotify.csv', index=False)

In [7]:
!tree

[01;34m.[00m
├── Data\ Sourcing\ PT.ipynb
└── [01;34mdata[00m
    ├── new_spotify.csv
    ├── soccer.sqlite
    └── spotify_2017.csv

1 directory, 4 files


## APIs - again

In [8]:
import requests

In [9]:
# Ask the lyrics API for one song. Passing the search terms through `params`
# lets requests build and URL-encode the query string for us.
artist = 'Ed Sheeran'
title = 'Shape of You'
url = 'https://lyrics.lewagon.ai/search?'
params_dict = {'artist': artist, 'title': title}
# Always set a timeout: without one, requests waits indefinitely on a stalled server
# and the cell never finishes.
response = requests.get(url, params=params_dict, timeout=10)

In [13]:
# What is response?
# A Response object: its repr shows the HTTP status code of the reply.
response

<Response [200]>

In [14]:
# How can I check my response in the browser?
# response.url holds the full request URL, including the encoded query parameters.
response.url

'https://lyrics.lewagon.ai/search?artist=Ed+Sheeran&title=Shape+of+You'

In [18]:
# What are some ways to protect against a bad response?
# Option 1 - compare the status code explicitly:
# if response.status_code == 200:
#     print(response)

# Option 2 - use the response's own `ok` flag:
if response.ok:
    print(response)
    

<Response [200]>


In [23]:
# How do we convert our response object into usable data?
# .json() parses the JSON body into Python objects (here a dict with a 'lyrics' key).
response.json()

{'lyrics': "The club isn't the best place to find a lover\nSo the bar is where I go (mmmm)\nMe and my friends at the table doing shots\nDrinking fast and then we talk slow (mmmm)\nAnd you come over and start up a conversation with just me\nAnd trust me I'll give it a chance now (mmmm)\nTake my hand, stop, put Van The Man on the jukebox\nAnd then we start to dance\nAnd now I'm singing like\n\nGirl, you know I want your love\nYour love was handmade for somebody like me\nCome on now, follow my lead\nI may be crazy, don't mind me\nSay, boy, let's not talk too much\nGrab on my waist and put that body on me\nCome on now, follow my lead\nCome, come on now, follow my lead (mmmm)\n\nI'm in love with the shape of you\nWe push and pull like a magnet do\nAlthough my heart is falling too\nI'm in love with your body\nLast night you were in my room\nAnd now my bedsheets smell like you\nEvery day discovering something brand new\nI'm in love with your body\n\nOh I oh I oh I oh I\nI'm in love with your 

## SQL again 

In [24]:
import pandas as pd
import sqlite3
# Open a connection to the local SQLite file; the query cells below reuse `conn`.
conn = sqlite3.connect("data/soccer.sqlite")

In [25]:
# One row per league: its id and name, plus the name of its country
# (looked up by joining League.country_id to Country.id).
query =  '''
    SELECT l.id, l.name, c.name as country_name
    FROM League l
    JOIN Country c ON c.id = l.country_id
    '''

In [26]:
# The plain sqlite3 route: get a cursor, run the query, pull every row.
# What comes back is a bare list of tuples with no column names attached.
cursor = conn.cursor()
result = cursor.execute(query).fetchall()
print(result)

[(1, 'Belgium Jupiler League', 'Belgium'), (1729, 'England Premier League', 'England'), (4769, 'France Ligue 1', 'France'), (7809, 'Germany 1. Bundesliga', 'Germany'), (10257, 'Italy Serie A', 'Italy'), (13274, 'Netherlands Eredivisie', 'Netherlands'), (15722, 'Poland Ekstraklasa', 'Poland'), (17642, 'Portugal Liga ZON Sagres', 'Portugal'), (19694, 'Scotland Premier League', 'Scotland'), (21518, 'Spain LIGA BBVA', 'Spain'), (24558, 'Switzerland Super League', 'Switzerland')]


In [27]:
# SHORT Pandas with beautiful result
# read_sql runs the query on the open connection and hands back a DataFrame
# whose columns are named after the selected fields.
league_df = pd.read_sql(query, conn)
league_df.head()

Unnamed: 0,id,name,country_name
0,1,Belgium Jupiler League,Belgium
1,1729,England Premier League,England
2,4769,France Ligue 1,France
3,7809,Germany 1. Bundesliga,Germany
4,10257,Italy Serie A,Italy


## Scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Download the IMDb list page whose HTML we are going to scrape.
url = 'https://www.imdb.com/list/ls055386972/'
# Always set a timeout: without one, requests waits indefinitely on a stalled server.
response = requests.get(url, timeout=10)

In [3]:
# what is our response?
# A Response object again; the status code in its repr tells us how the request went.
response

<Response [200]>

In [41]:
# how can we see the raw results from the response?
# LONG printout
# response.content

In [4]:
# how do we turn this into something we can query?
# Hand the raw response body to BeautifulSoup, using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# soup

In [7]:
# find_all gives back a list of matching tags, and each one can be queried further.
# Here: every <div class="lister-item-content">, then the second match (index 1).
soup.find_all('div', class_='lister-item-content')[1]

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">2.</span>
<a href="/title/tt0108052/">Schindler's List</a>
<span class="lister-item-year text-muted unbold">(1993)</span>
</h3>
<p class="text-muted text-small">
<span class="certificate">15</span>
<span class="ghost">|</span>
<span class="runtime">195 min</span>
<span class="ghost">|</span>
<span class="genre">
Biography, Drama, History            </span>
</p>
<div class="ipl-rating-widget">
<div class="ipl-rating-star small">
<span class="ipl-rating-star__star">
<svg class="ipl-icon ipl-star-icon" fill="#000000" height="24" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg">
<path d="M0 0h24v24H0z" fill="none"></path>
<path d="M12 17.27L18.18 21l-1.64-7.03L22 9.24l-7.19-.61L12 2 9.19 8.63 2 9.24l5.46 4.73L5.82 21z"></path>
<path d="M0 0h24v24H0z" fill="none"></path>
</svg>
</span>
<span class="ipl-rating-star__rating">8.9</span>
</div>
<div class="ipl-r

## Regex

https://rubular.com/

https://regex101.com/

https://jex.im/regulex/#!flags=&re=.*%40.*%5B.%5D%5Cw*%7B2%2C%7D