# Getting Data (APIs, scraping, etc.)

In [None]:
from IPython.display import Image

# Embed the remote photo by URL (its source is credited right below the cell).
# Being the last expression of the cell, the Image object is what Jupyter renders.
Image(
    url=(
        'https://cdn.sanity.io/images/rzso0e8h/production/'
        'da8c3c8105c221ea003ea67057b5a75de819d36d-1920x1280.jpg'
        '?w=1920&h=1280&auto=format'
    )
)

source: https://labor.org.mx/en/exhibitions/printing-out-the-internet

In [None]:
# https://christopherbaker.net/projects/murmur-study/

In [None]:
# https://twitter.com/everylotla

### Data sets

Need a collection of hugs and kisses, a huge collection of New York Times articles?  
https://academictorrents.com

Text corpora  
https://github.com/dariusk/corpora/tree/master/data

Data from around the world  
http://datacatalogs.org/

Machine learning datasets  
https://www.kaggle.com/datasets


### APIs

#### Wikipedia

In [None]:
!pip install wikipedia

In [None]:
import wikipedia
from IPython.display import Image

# Install the package first:
#   pip install wikipedia
#
# Wikipedia API docs:
#   https://wikipedia.readthedocs.io/en/latest/code.html#api

# Search for something
print(wikipedia.search("wtf"))

# Get a summary of a page
print(wikipedia.summary("random"))

# Get a number of random topics
print(wikipedia.random(pages=2))

# Load the page a single time and reuse the object for both image lookups
# below, so the same page is not requested from Wikipedia twice.
random_page = wikipedia.page("random")

# Get a list of image urls of the page
print(random_page.images)

# Or just the first image
image_url = random_page.images[0]

# Last expression of the cell, so Jupyter renders the picture.
Image(url=image_url)

https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy

### Reddit API wrapper (PRAW)

In [None]:
!pip install praw

In [None]:
import praw

In [None]:
print("INFO: Connecting to reddit...")

# For this api to work you need to first register an account
# and fill in the data here.
# NOTE(review): these look like real app credentials committed to the
# notebook. Consider revoking them and reading the values from environment
# variables (or getpass) instead of hardcoding them.
r = praw.Reddit(
    client_id="XWqof3jNyi70G1kOhxfVQg",
    client_secret="DmjiBIDvPDHf7vIMsVZDZC_LVJt_tw",
    user_agent="Zealousideal-Iron724",
)

# Only an app id/secret is passed (no user login), so this cell expects a
# read-only client; anything else is treated as a failed setup.
if not r.read_only:
    raise Exception("ERROR: Can't connect to reddit")

print("INFO: Connected")

# Print the titles of the ten hottest posts in r/worldnews.
# `r` is the client created above; it is the only Reddit object in this cell.
for submission in r.subreddit("worldnews").hot(limit=10):
    print(submission.title)

### YouTube downloader (yt-dlp)

In [None]:
!pip install yt-dlp

In [None]:
!mkdir yt_vids

In [None]:
import yt_dlp

downloads = 'yt_vids'

ydl_opts = {
    # Files are named after the video id, which is how later cells find them.
    'outtmpl': downloads + '/%(id)s.%(ext)s',
    'quiet': False,
    # Format preference passed to yt-dlp's sorter (resolution around 1080,
    # mp4/m4a containers) - see the yt-dlp "Sorting formats" documentation.
    'format_sort': ['res:1080', 'ext:mp4:m4a'],
    'ignoreerrors': False
}

URL = 'https://www.youtube.com/watch?v=ysU9hh4pBjc'

with yt_dlp.YoutubeDL(ydl_opts) as ydl:

    # First pass only collects metadata; nothing is written to disk yet.
    info = ydl.extract_info(URL, download=False)

    if not info:
        # Without metadata none of the fields below would exist, so stop with
        # a clear message instead of failing later on an undefined name.
        raise RuntimeError("no metadata returned for {}".format(URL))

    width = info.get('width')
    height = info.get('height')
    duration = info.get('duration')
    title = info.get('title')
    # NOTE: `id` shadows the builtin, but the following cells read the video
    # id under exactly this name, so it is kept.
    id = info.get('id')

    print("width: {}, height: {}, duration: {}, title: {}, id: {}".format(width, height, duration, title, id))

    # download() is documented to take a list of URLs.
    ydl.download([URL])

In [None]:
# Show the video id stored by the download cell; the saved file is named after it.
print(id)

In [None]:
from IPython.display import Video

# Path of the file written by the download cell.
# NOTE(review): assumes the download ended up as an .mp4; the real extension
# is chosen by yt-dlp via the %(ext)s part of the output template.
video_path = "yt_vids/{}.mp4".format(id)

# Last expression of the cell, so Jupyter embeds the player.
Video(video_path)

You can find some documentation here: https://github.com/yt-dlp/yt-dlp#embedding-yt-dlp

### Downloading unsecured IP cameras

In [None]:
# http://insecam.org/en/view/921591/

In [None]:
!pip install ffmpeg-python

In [None]:
import ffmpeg 
import os
import time

In [None]:
# `.mjpg` video stream URLs of the cameras to record; each entry is handed to
# download_from_ip by the loop further down.
cam_urls = [
    "http://128.101.85.194/mjpg/video.mjpg",
    "http://96.91.239.26:1024/mjpg/video.mjpg",
    "http://84.82.29.229:8080/mjpg/video.mjpg"
]

In [None]:
def download_from_ip(url, dir, idx, duration=10):
    """Record a short clip from a network stream into `dir`.

    ffmpeg is started in the background, copying the stream as-is
    (``codec="copy"``) into ``stream_<idx>.mkv``, and is stopped again after
    `duration` seconds. An existing file with that name is overwritten.

    Args:
        url: Address of the stream to record.
        dir: Directory that receives the recording; it is not created here.
            (The name shadows the builtin ``dir`` but is kept for callers.)
        idx: Integer used to number the output file (zero-padded to 3 digits).
        duration: Seconds to record before ffmpeg is stopped. Defaults to 10.

    Returns:
        None.
    """
    import subprocess  # local import: only needed for TimeoutExpired below

    out_path = os.path.join(dir, "stream_{:03d}".format(idx))

    process = (
        ffmpeg
        .input(url)
        .output('{}.mkv'.format(out_path), codec="copy")
        .overwrite_output()
        .run_async(pipe_stdout=True)
    )

    print("starting capture")
    try:
        time.sleep(duration)
    finally:
        # Runs even when the wait is interrupted, so ffmpeg is never left
        # recording in the background.
        print("ending capture")
        process.stdout.close()
        # Ask ffmpeg to stop first so it gets a chance to finish the file;
        # force-kill only if it does not exit in time. Waiting also reaps the
        # child process instead of leaving a zombie behind.
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

In [None]:
out_dir = "insecam_streams"

# Create the output folder if needed; no error when it already exists, so the
# cell can be re-run safely.
os.makedirs(out_dir, exist_ok=True)

# Record each camera in turn; its position in the list numbers the output file.
for idx, url in enumerate(cam_urls):
    download_from_ip(url, out_dir, idx)

### Download livestreams with streamlink

In [None]:
!pip install streamlink

In [None]:
import subprocess

import streamlink

url = "https://www.bloomberg.com/live/europe"

out_dir = "streamlink"

# Create the output folder if needed; no error when it already exists.
os.makedirs(out_dir, exist_ok=True)

try:
    # Resolve the page to its available streams and take the one labelled 'best'.
    stream_url = streamlink.streams(url)['best'].url
except Exception as err:
    # Stop with a clear message: continuing would only fail a line later on
    # the undefined `stream_url`. Catching Exception (not a bare except) keeps
    # Ctrl-C / kernel interrupts working.
    raise RuntimeError("couldn't find stream at {}".format(url)) from err

print("found stream at {}".format(stream_url))

# Last path segment of the page URL names the output file; trailing slashes
# are ignored so the name is never empty.
streamer = url.rstrip("/").split("/")[-1]

out_path = os.path.join(out_dir, streamer)

# Start ffmpeg in the background, copying the stream as-is into <streamer>.mkv.
process = (
    ffmpeg
    .input(stream_url)
    .output('{}.mkv'.format(out_path), codec="copy")
    .overwrite_output()
    .run_async(pipe_stdout=True)
)

print("starting capture")
try:
    time.sleep(10)
finally:
    # Runs even when the wait is interrupted, so ffmpeg is never left running.
    print("ending capture")
    process.stdout.close()
    # Ask ffmpeg to stop first so it gets a chance to finish the file; kill it
    # only if it does not exit in time. Waiting reaps the child process.
    process.terminate()
    try:
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()

You can check all streamlink plugins here: https://streamlink.github.io/plugins.html#plugins

In [None]:
!pip install opencv-python

In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
# Build a Series from a plain list; as the cell's only expression, Jupyter displays it.
pd.Series(['San Francisco', 'San Jose', 'Sacramento'])

`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. Example:

In [None]:
# Two Series of equal length; each one becomes a column of the table below.
city_names = pd.Series(data=['San Francisco', 'San Jose', 'Sacramento'])
population = pd.Series(data=[852469, 1015785, 485199])

# The dict keys become the column labels. Being the last expression of the
# cell, the resulting DataFrame is what Jupyter renders.
pd.DataFrame(
    data={
        'City name': city_names,
        'Population': population,
    }
)

In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od

# Fetch the "World Happiness" dataset from Kaggle. A later cell reads
# world-happiness/2015.csv, so the files are expected in ./world-happiness.
# NOTE(review): Kaggle downloads normally need API credentials - confirm how
# they are supplied when this cell runs.
od.download("https://www.kaggle.com/datasets/unsdsn/world-happiness")

The example below uses `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:

In [None]:
# Parse the comma-separated 2015 table into a DataFrame.
happy_dataframe = pd.read_csv(
    "world-happiness/2015.csv",
    sep=",",
)

# Summary statistics of the table; shown because it is the cell's last expression.
happy_dataframe.describe()

In [None]:
# Peek at the first few records of the table.
happy_dataframe.head()

Another powerful feature of *pandas* is graphing. For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:

In [None]:
# Plot the distribution of the values in the 'Freedom' column.
happy_dataframe.hist('Freedom')

#### PPRINT

In [None]:
import pprint

# Nested sample data, written out one level per line so its shape is visible.
json_dict = {
    "hyperspace": {
        "constraints": [],
        "design": [
            ["windFarm.windparkSize.k", "continuous", [0, 0, 5]],
            ["hydroPlant.primaryControlMax", "continuous", [100, 300]],
        ],
        "kpis": ["frequency.y", "city.load.p[2]"],
    },
    "lhc_size": 10,
    "number_of_runs": 10,
}

# pformat returns the pretty-printed layout as a string, which is then printed...
formatted_json_str = pprint.pformat(json_dict)
print(formatted_json_str)

# ...while pprint.pprint writes the same structure straight to stdout.
pprint.pprint(json_dict)