# Unconventional Data Sources
###### Cole Plum

### PDF Extractor

In [8]:
import fitz
import pandas as pd
from collections import Counter

# Read the plain text of every page. Opening the document in a `with` block
# closes the file handle as soon as the text has been extracted.
with fitz.open('dissertation.pdf') as doc:
    # Join pages with a newline so the last word of one page can never fuse
    # with the first word of the next before the whitespace split below.
    text = "\n".join(page.get_text("text") for page in doc)

# Tokens are whitespace-delimited; counting is case-sensitive and stand-alone
# punctuation counts as a token.
words = pd.Series(text.split())
words.value_counts().head(30)

the         2640
.           2323
of          1580
to          1384
and         1174
a            977
in           694
is           612
for          524
that         507
be           431
data         407
The          361
are          349
with         337
as           307
students     259
can          257
this         247
course       233
it           222
or           221
on           210
an           190
their        182
was          176
Data         171
by           171
not          168
I            155
dtype: int64

In [14]:
import camelot
# Let camelot look for tables in the PDF using its default options.
tables = camelot.read_pdf('calendar.pdf')

# Fail with a readable message instead of a bare IndexError on `tables[0]`
# when camelot detects nothing.
assert len(tables) > 0, "camelot found no tables in calendar.pdf"

# `.df` exposes the first detected table's cells as a pandas DataFrame.
df = tables[0].df
df

Unnamed: 0,0,1,2,3,4
0,Fall,2020 Fall Term,,2021 Fall Term,
1,First Day of Classes,Tue,September 1,Tue,August 31
2,Labor Day - Classes Will Meet,Mon,September 7,,
3,Labor Day - Classes Suspended,,,Mon,September 6
4,Last day to add or drop courses,Tue,September 15,Tue,September 14
...,...,...,...,...,...
70,Summer Session - 10 week - classes begin,Mon,June 7,Mon,June 6
71,Last day to add or drop courses,Wed,June 16,Wed,June 15
72,Last day to change registration or withdraw fr...,Thur,July 15,Thur,July 14
73,Final Exams,Fri,August 13,Fri,August 12


In [19]:
# Read the plain text of every page of the flowchart PDF. The `with` block
# closes the document once the text has been pulled out.
with fitz.open('Final Algorithm Flowchart.pdf') as doc:
    # A newline between pages keeps words on either side of a page break
    # separate when the text is split on whitespace.
    text = "\n".join(page.get_text("text") for page in doc)

# Whitespace-delimited, case-sensitive token counts; show the 30 most common.
words = pd.Series(text.split())
words.value_counts().head(30)

the           18
a             10
Yes           10
No            10
of             9
              ..
enough?        1
much           1
does           1
part           1
algorithm!     1
Length: 229, dtype: int64

### Reddit Image Transcriber

In [82]:
import requests
import pprint
from PIL import Image
import io
import pytesseract
from textblob import TextBlob

# URL suffixes that mark a post as a direct link to an image file.
IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".jfif")
# Seconds to wait on each HTTP call; without a timeout requests can hang forever.
REQUEST_TIMEOUT = 10

# Set a User Agent to avoid being blocked
listing = requests.get(
    "https://www.reddit.com/r/comics/.json",
    headers={'User-agent': 'your bot 0.1'},
    timeout=REQUEST_TIMEOUT,
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError later.
listing.raise_for_status()
data = listing.json()

# Post title -> image URL for every listed post that links straight to an image.
# (Named `image_urls` rather than `map` so the builtin `map` is not shadowed.)
image_urls = {
    child['data']['title']: child['data']['url']
    for child in data['data']['children']
    if child['data']['url'].endswith(IMAGE_SUFFIXES)
}

# Download each image, OCR it, and report the sentiment of its text.
image_texts = []
for url in image_urls.values():
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    img = Image.open(io.BytesIO(response.content))
    text = pytesseract.image_to_string(img)
    image_texts.append(text)

    # Polarity ranges from -1 (negative) to 1 (positive) for this image's words.
    print(TextBlob(text).sentiment.polarity)

# Concatenate without a separator, matching how the texts were accumulated before.
all_text = "".join(image_texts)

# Whitespace-delimited, case-sensitive token counts over all transcribed text.
words = pd.Series(all_text.split())
print('\n', words.value_counts(), '\n')

blob = TextBlob(all_text)
print(blob.noun_phrases, '\n')

# Sorted with pandas
print(pd.Series(blob.word_counts).sort_values().tail(30), '\n')

0.98828125
0.0
0.5
0.0
-0.4
0.2725
-0.7
0.0
0.0
0.390625
0.22421875
0.35
0.25
0.05
-0.5
0.0
0.03285714285714285
0.51
0.5
0.0
0.41818181818181815
0.5
0.0

 |           18
THE          8
TO           7
A            7
to           7
            ..
19!          1
practice     1
Time’s       1
>            1
girls.       1
Length: 415, dtype: int64 

['dude', 'love rock', 'hell yeah', 'like rock', 'rock', 'the best', 'mrlovenstein.com', "\\ 'm", 'dying', 'call', 'you have to call', 'it', '’ s', 'okay', 'understand', 'remi_lascault', '[ ngewe', 'cc', 'drowning in the burdens of my lifes man', "you're", 'fish', 'vve got fish burdens', 'poorly drawn lines', 'the', 'slime', 'beginners blade', 'welcome to your first dungeon', 'don t think were ready for that just yet', "let's come back later with some ice magi— h—-hey", 'don t just leave', 't feel ugly', '“ m', 'going to dress vp', 'felissabumblehead', '@ © ersychosuzanne', 'pet-proof', 'wow to eat swake comics thanks', 'meatball sub', 'sure', "

##### Dogecoin

In [88]:
# URL suffixes that mark a post as a direct link to an image file.
IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".jfif")
# Seconds to wait on each HTTP call; without a timeout requests can hang forever.
REQUEST_TIMEOUT = 10

# Set a User Agent to avoid being blocked
listing = requests.get(
    "https://www.reddit.com/r/dogecoin/.json",
    headers={'User-agent': 'your bot 0.1'},
    timeout=REQUEST_TIMEOUT,
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError later.
listing.raise_for_status()
data = listing.json()

# Post title -> image URL for every listed post that links straight to an image.
# (Named `image_urls` rather than `map` so the builtin `map` is not shadowed.)
image_urls = {
    child['data']['title']: child['data']['url']
    for child in data['data']['children']
    if child['data']['url'].endswith(IMAGE_SUFFIXES)
}

# Download each image and OCR it, collecting the transcribed text.
image_texts = []
for url in image_urls.values():
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    img = Image.open(io.BytesIO(response.content))
    text = pytesseract.image_to_string(img)
    image_texts.append(text)

# Concatenate without a separator, matching how the texts were accumulated before.
all_text = "".join(image_texts)

blob = TextBlob(all_text)
print(blob.noun_phrases, '\n')

# Sorted with pandas
print(pd.Series(blob.word_counts).sort_values().tail(30))

['reddit', 'reddit premium = > oo', 'ba', 'c ew', '—| ry-|', 'oo [ -t', '— ] ‘', 'a. “', 'hole el', 'ba se ss g54', 'dooloe—', 'gg-t-', 'son non', 'veare', 'scoccccoce ccea be der', 'li lrn', 'ees cs re ny', 'sy o', 'paakareaaiiidgisi ddd did dos', 'tesla', 'doge', 'millionaires', 'billionaires', '= |', 'reddit', '—— pay', 'reddit', 'elon musk', '@ ky @ elonmusk', 'dyomv', '{ ol6', 'mm', '\\\\c- ] alam', 'l=ss', 'mcok- [ erer-', 'dlole', 'wa', 'xe', 'may', 'twitter', 'search', 'lte', '% 0_', 'rq xora', 'ce', 'gc s6.26m', 'balance', 't e', 'bybit', 'bonus bash', 'win', 'bonus', '@ plete } = $ 6.25m $', '% ——_ cane oo', '— | \\ id', 'nal aici', 'win', 'bonus', 'bybit', 'bonus bash vo mire', 'portfolio elon musk', 'tony stark', 'game', 'level sacrifice', 'entire stock', 'tesla', 'support dogecoin', 'absolute legend', 'elon', 'november', 'een hits', 'blunt .. ‘', 'people ”', 'new wealth |', 'travel space .. ’', 'elon musk', '@ @ elonmusk', 'tesla', 'doge', 'twitter', 'iphone 15k', 'retweet

### OpenCV

In [4]:
import numpy as np
import cv2

images = ['cakeboss', 'gordon', 'simon']

face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier('haarcascade_eye.xml')
# A classifier whose XML file could not be loaded is left empty, so check
# explicitly instead of failing later inside detectMultiScale.
if face_cascade.empty() or eye_cascade.empty():
    raise FileNotFoundError("could not load the face/eye Haar cascade XML files")

for image in images:
    source_path = "images/" + image + '.jpg'
    img = cv2.imread(source_path)
    # imread signals a missing or unreadable file by returning None.
    if img is None:
        raise FileNotFoundError("could not read image: " + source_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    for (x, y, w, h) in faces:
        # Outline the face on the colour image (colour tuples are in BGR order).
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        # Search for eyes only inside the face region. `roi_color` is a view
        # into `img`, so rectangles drawn on it show up in the saved image.
        roi_gray = gray[y:y + h, x:x + w]
        roi_color = img[y:y + h, x:x + w]
        eyes = eye_cascade.detectMultiScale(roi_gray)
        for (ex, ey, ew, eh) in eyes:
            # Eye coordinates are relative to the face region.
            cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh), (0, 255, 0), 2)

    cv2.imwrite("images/" + image + "_detected.jpg", img)

##### With Smile Detection

In [5]:
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
smile_cascade = cv2.CascadeClassifier('haarcascade_smile.xml')
# A classifier whose XML file could not be loaded is left empty, so check
# explicitly instead of failing later inside detectMultiScale.
if face_cascade.empty() or smile_cascade.empty():
    raise FileNotFoundError("could not load the face/smile Haar cascade XML files")

# Process images/smile1.jpg through images/smile3.jpg.
for num in range(1, 4):
    source_path = "images/smile" + str(num) + '.jpg'
    img = cv2.imread(source_path)
    # imread signals a missing or unreadable file by returning None.
    if img is None:
        raise FileNotFoundError("could not read image: " + source_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    for (x, y, w, h) in faces:
        # Outline the face on the colour image (colour tuples are in BGR order).
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        # Search for smiles only inside the face region. `roi_color` is a view
        # into `img`, so rectangles drawn on it show up in the saved image.
        roi_gray = gray[y:y + h, x:x + w]
        roi_color = img[y:y + h, x:x + w]
        smiles = smile_cascade.detectMultiScale(roi_gray)
        for (ex, ey, ew, eh) in smiles:
            # Smile coordinates are relative to the face region.
            cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh), (0, 255, 0), 2)

    cv2.imwrite("images/smile" + str(num) + "_detected.jpg", img)

The smile cla