Skip to content

Commit

Permalink
Major update. Replaced search_terms.py with a configparser and 'confi…
Browse files Browse the repository at this point in the history
…g.ini'. The config file is populated with the maximum page of search results for each term in the PAGE_LIMS array when configure() is called if any of the elements are '-1'. When tweet_pic.py is run, a random tag is selected from config.ini to be searched for. The SEARCH_TAGS and PAGE_LIMS parameters have to match up with this implementation. Also added a __repr__ function to the Image class.
  • Loading branch information
geoffreyn committed Sep 28, 2015
1 parent 3cff357 commit 950f324
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 17 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
.DS_Store
*.pyc
*.log
*jpg
credentials.py
recent_tweets.dat
4 changes: 4 additions & 0 deletions config.ini
@@ -0,0 +1,4 @@
[PixivSearch]
SEARCH_TAGS = ['Key','Kanon','CLANNAD','LittleBusters!','AngelBeats!','AB!','Rewrite','Charlotte']
PAGE_LIMS = [-1]*len(SEARCH_TAGS)
DEFAULT_LIM = 200
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
beautifulsoup4>=4.4.0
requests>=2.6.0
tweepy>=3.3.0
datetime>=4.0.0
datetime>=4.0.0
configparser>=3.3.0
2 changes: 0 additions & 2 deletions search_terms.py

This file was deleted.

101 changes: 87 additions & 14 deletions tweet_pic.py
@@ -1,6 +1,7 @@
from bs4 import BeautifulSoup
import requests
import tweepy
import configparser

import datetime
import os
Expand All @@ -9,12 +10,13 @@
import time

from credentials import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET, BITLY_TOKEN
from search_terms import SEARCH_DICT

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
IMAGE_PATH = SCRIPT_DIR + '/Downloaded_Images'
CONFIG_FILE = SCRIPT_DIR + '/config.ini'
RECENT_TWEETS_FILE = SCRIPT_DIR + '/recent_tweets.dat'
NUM_RECENT_TWEETS = 200 # Number of recent posts to track
NUM_RECENT_TWEETS = 200 # Number of recent posts to track


class TwitterAPI(object):

Expand All @@ -35,6 +37,9 @@ class Image(object):
def __init__(self):
self.site = 'http://www.pixiv.net'
self.header = {'referer':self.site}

self.searchTerm = None
self.searchPage = None

self.title = None
self.imageId = None
Expand All @@ -56,10 +61,29 @@ def __str__(self):
photo_str = 'Photo URL: {0}\n'.format(self.photoURL)
uploadTime_str = 'Uploaded on: {0}\n'.format(self.uploadTime.ctime())
accessTime_str = 'First accessed on: {0}\n'.format(self.accessTime.ctime())
search_str = 'Searched tag/page: {0},{1}\n'.format(self.searchTerm,self.searchPage)

full_str = title_str + name_str + id_str + illustURL_str + short_illustURL_str + photo_str + uploadTime_str + accessTime_str
full_str = title_str + name_str + id_str + illustURL_str + short_illustURL_str + photo_str + uploadTime_str + accessTime_str + search_str
return full_str


def __repr__(self):
repr_dict = {'site': self.site,
'header': self.header,
'title': self.title,
'imageId': self.imageId,
'illustName': self.illustName,
'illustId': self.illustId,
'illustURL': self.illustURL,
'uploadTime': self.uploadTime,
'photoURL': self.photoURL,
'accessTime': self.accessTime,
'shortURL': self.shortURL,
'searchTerm': self.searchTerm,
'searchPage': self.searchPage
}

return str(repr_dict)

def get_image(self):
'''Download image, write to local file'''
if not os.path.exists(IMAGE_PATH):
Expand Down Expand Up @@ -121,7 +145,7 @@ def update_recent_tweets(self):
for item in recent_tweets:
f.write('{0}\n'.format(item))


def get_recent_tweets():
'''Return list of URLs to recently tweeted images'''
# Get recent tweets from local file if they exist
Expand All @@ -133,14 +157,14 @@ def get_recent_tweets():

return recent_tweets

def fetch_imagelist(page_num):

def fetch_imagelist(page_term, page_num):
'''Find webpage on Pixiv. Return HTML content.'''
SITE_URL = 'http://www.pixiv.net/search.php'
NUM_REQUEST_ATTEMPTS = 5 # Number of times to attempt to query search page



payload = {
'word': SEARCH_DICT['tags'],
'word': page_term,
's_mode': 's_tag_full',
'order': 'date_d',
'p': page_num
Expand All @@ -158,7 +182,7 @@ def fetch_imagelist(page_num):
return page


def parse_images(page):
def parse_images(page, page_term, page_num):
'''Extract content from Pixiv page, return a list of Image objects'''
image_list = []

Expand Down Expand Up @@ -198,26 +222,74 @@ def parse_images(page):
# Set image attributes
newImage.title = title
newImage.illustName = illustName
newImage.imageId = imageId
newImage.uploadTime = uploadTime
newImage.photoURL = photoURL
newImage.illustId = illustId
newImage.illustURL = illustURL
newImage.accessTime = accessTime
newImage.imageId = imageId
newImage.searchTerm = page_term
newImage.searchPage = page_num

# Add image to list
image_list.append(newImage)

return image_list



def configure():

## Process config.ini file
cf = configparser.RawConfigParser()
cf.read(CONFIG_FILE)

# The config file contains the raw string form of the list of search tags
# and corresponding number of pages, so evaluate them
SEARCH_TAGS = eval(cf.get('PixivSearch','SEARCH_TAGS'))
PAGE_LIMS = eval(cf.get('PixivSearch','PAGE_LIMS'))
DEFAULT_LIM = cf.getint('PixivSearch','DEFAULT_LIM')

# Check integrity of search tags and page limits
for index,tag in enumerate(SEARCH_TAGS):
# If PAGE_LIMS is filled with default values (-1), determine maximum page number
if PAGE_LIMS[index] <= 0:
page_lim = DEFAULT_LIM
image_page = fetch_imagelist(tag,str(page_lim))
while image_page.find('div',{'class':'_no-item'}) is not None:
# decrement page if last one was empty
page_lim -= 1

# See if page contains images
image_page = fetch_imagelist(tag,page_lim)

if page_lim <= 1:
print 'Failed to determine page limit for tag: {0}'.format(tag)
break

if page_lim > 1:
print 'New limit for \'{0}\': {1}'.format(tag,str(page_lim))
PAGE_LIMS[index] = page_lim

cf.set('PixivSearch','PAGE_LIMS', str(PAGE_LIMS))
with open(CONFIG_FILE, 'wb') as configOut:
cf.write(configOut)
return SEARCH_TAGS, PAGE_LIMS


def main():
SEARCH_TAGS, PAGE_LIMS = configure()

## Process and Tweet Images
image_tweeted = False

while not image_tweeted:
# Randomly select a tag and a page from the configuration file
selected_index = random.randint(0,len(SEARCH_TAGS))
selected_tag = SEARCH_TAGS[selected_index]

# Get a page of images
image_page = fetch_imagelist(random.randint(0,SEARCH_DICT['page_lim']))
image_list = parse_images(image_page)
image_page = fetch_imagelist(selected_tag,str(random.randint(0,PAGE_LIMS[selected_index])))
image_list = parse_images(image_page,selected_tag,selected_index)

# Tweet a random image from list
random.shuffle(image_list)
Expand All @@ -236,6 +308,7 @@ def main():

print image


if __name__ == '__main__':
main()

0 comments on commit 950f324

Please sign in to comment.