Major update. Replaced search_terms.py with a configparser and 'confi…

…g.ini'. The config file is populated with the maximum page of search results for each term in the PAGE_LIMS array when configure() is called if any of the elements are '-1'. When tweet_pic.py is run, a random tag is selected from config.ini to be searched for. The SEARCH_TAGS and PAGE_LIMS parameters have to match up with this implementation. Also added a __repr__ function to the Image class.
geoffreyn · Sep 28, 2015 · 950f324 · 950f324
1 parent 3cff357
commit 950f324
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+.DS_Store
+*.pyc
+*.log
+*jpg
+credentials.py
+recent_tweets.dat
diff --git a/config.ini b/config.ini
@@ -0,0 +1,4 @@
+[PixivSearch]
+SEARCH_TAGS = ['Key','Kanon','CLANNAD','LittleBusters!','AngelBeats!','AB!','Rewrite','Charlotte']
+PAGE_LIMS = [-1]*len(SEARCH_TAGS)
+DEFAULT_LIM = 200
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 beautifulsoup4>=4.4.0
 requests>=2.6.0
 tweepy>=3.3.0
-datetime>=4.0.0
+datetime>=4.0.0
+configparser>=3.3.0
diff --git a/search_terms.py b/search_terms.py
diff --git a/tweet_pic.py b/tweet_pic.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
 import tweepy
+import configparser
 
 import datetime
 import os
@@ -9,12 +10,13 @@
 import time
 
 from credentials import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET, BITLY_TOKEN
-from search_terms import SEARCH_DICT
 
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 IMAGE_PATH = SCRIPT_DIR + '/Downloaded_Images'
+CONFIG_FILE = SCRIPT_DIR + '/config.ini'
 RECENT_TWEETS_FILE = SCRIPT_DIR + '/recent_tweets.dat'
-NUM_RECENT_TWEETS = 200 # Number of recent posts to track
+NUM_RECENT_TWEETS = 200   # Number of recent posts to track
+
 
 class TwitterAPI(object):
 
@@ -35,6 +37,9 @@ class Image(object):
     def __init__(self):
         self.site = 'http://www.pixiv.net'
         self.header = {'referer':self.site}
+
+        self.searchTerm = None
+        self.searchPage = None
 
         self.title = None
         self.imageId = None
@@ -56,10 +61,29 @@ def __str__(self):
         photo_str = 'Photo URL: {0}\n'.format(self.photoURL)
         uploadTime_str = 'Uploaded on: {0}\n'.format(self.uploadTime.ctime())
         accessTime_str = 'First accessed on: {0}\n'.format(self.accessTime.ctime())
+        search_str = 'Searched tag/page: {0},{1}\n'.format(self.searchTerm,self.searchPage)
 
-        full_str = title_str + name_str + id_str + illustURL_str + short_illustURL_str + photo_str + uploadTime_str + accessTime_str
+        full_str = title_str + name_str + id_str + illustURL_str + short_illustURL_str + photo_str + uploadTime_str + accessTime_str + search_str
         return full_str
-
+
+    def __repr__(self):
+        repr_dict = {'site': self.site, 
+            'header': self.header, 
+            'title': self.title, 
+            'imageId': self.imageId, 
+            'illustName': self.illustName, 
+            'illustId': self.illustId,
+            'illustURL': self.illustURL, 
+            'uploadTime': self.uploadTime, 
+            'photoURL': self.photoURL,
+            'accessTime': self.accessTime, 
+            'shortURL': self.shortURL,
+            'searchTerm': self.searchTerm, 
+            'searchPage': self.searchPage
+        }
+
+        return str(repr_dict)
+
     def get_image(self):
         '''Download image, write to local file'''
         if not os.path.exists(IMAGE_PATH):
@@ -121,7 +145,7 @@ def update_recent_tweets(self):
             for item in recent_tweets:
                 f.write('{0}\n'.format(item))
 
-
+                
 def get_recent_tweets():
     '''Return list of URLs to recently tweeted images'''
     # Get recent tweets from local file if they exist
@@ -133,14 +157,14 @@ def get_recent_tweets():
 
     return recent_tweets
 
-def fetch_imagelist(page_num):
+
+def fetch_imagelist(page_term, page_num):
     '''Find webpage on Pixiv.  Return HTML content.'''
     SITE_URL = 'http://www.pixiv.net/search.php'
     NUM_REQUEST_ATTEMPTS = 5 # Number of times to attempt to query search page
-
-
+
     payload = {
-        'word': SEARCH_DICT['tags'],
+        'word': page_term,
         's_mode': 's_tag_full',
         'order': 'date_d', 
         'p': page_num
@@ -158,7 +182,7 @@ def fetch_imagelist(page_num):
     return page
 
 
-def parse_images(page):
+def parse_images(page, page_term, page_num):
     '''Extract content from Pixiv page, return a list of Image objects'''
     image_list = []
 
@@ -198,26 +222,74 @@ def parse_images(page):
         # Set image attributes
         newImage.title = title
         newImage.illustName = illustName
+        newImage.imageId = imageId
         newImage.uploadTime = uploadTime
         newImage.photoURL = photoURL
         newImage.illustId = illustId
         newImage.illustURL = illustURL
         newImage.accessTime = accessTime
-        newImage.imageId = imageId
+        newImage.searchTerm = page_term
+        newImage.searchPage = page_num
 
         # Add image to list
         image_list.append(newImage)
 
     return image_list
 
-
+
+def configure():
+
+    ## Process config.ini file
+    cf = configparser.RawConfigParser()
+    cf.read(CONFIG_FILE)
+
+    # The config file contains the raw string form of the list of search tags 
+    #  and corresponding number of pages, so evaluate them
+    SEARCH_TAGS = eval(cf.get('PixivSearch','SEARCH_TAGS'))
+    PAGE_LIMS = eval(cf.get('PixivSearch','PAGE_LIMS'))
+    DEFAULT_LIM = cf.getint('PixivSearch','DEFAULT_LIM')
+
+    # Check integrity of search tags and page limits
+    for index,tag in enumerate(SEARCH_TAGS):
+        # If PAGE_LIMS is filled with default values (-1), determine maximum page number
+        if PAGE_LIMS[index] <= 0:
+            page_lim = DEFAULT_LIM
+            image_page = fetch_imagelist(tag,str(page_lim))
+            while image_page.find('div',{'class':'_no-item'}) is not None:
+                # decrement page if last one was empty
+                page_lim -= 1
+
+                # See if page contains images
+                image_page = fetch_imagelist(tag,page_lim)
+
+                if page_lim <= 1:
+                    print 'Failed to determine page limit for tag: {0}'.format(tag)
+                    break
+
+            if page_lim > 1:
+                print 'New limit for \'{0}\': {1}'.format(tag,str(page_lim))
+                PAGE_LIMS[index] = page_lim
+
+    cf.set('PixivSearch','PAGE_LIMS', str(PAGE_LIMS))
+    with open(CONFIG_FILE, 'wb') as configOut:
+        cf.write(configOut)
+    return SEARCH_TAGS, PAGE_LIMS
+
+
 def main():
+    SEARCH_TAGS, PAGE_LIMS = configure()
+
+    ## Process and Tweet Images
     image_tweeted = False
 
     while not image_tweeted:
+        # Randomly select a tag and a page from the configuration file
+        selected_index = random.randint(0,len(SEARCH_TAGS))
+        selected_tag = SEARCH_TAGS[selected_index]
+
         # Get a page of images
-        image_page = fetch_imagelist(random.randint(0,SEARCH_DICT['page_lim']))
-        image_list = parse_images(image_page)
+        image_page = fetch_imagelist(selected_tag,str(random.randint(0,PAGE_LIMS[selected_index])))
+        image_list = parse_images(image_page,selected_tag,selected_index)
 
         # Tweet a random image from list
         random.shuffle(image_list)
@@ -236,6 +308,7 @@ def main():
 
     print image
 
+
 if __name__ == '__main__':
     main()