diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 1c2d3af..c98e8dc 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -7,8 +7,6 @@ class ApplicationController < ActionController::Base protect_from_forgery # See ActionController::RequestForgeryProtection for details Rubaidh::GoogleAnalytics.tracker_id = APP_CONFIG['tracker_id'] - - # Scrub sensitive parameters from your log # filter_parameter_logging :password diff --git a/app/controllers/daily_timelines_controller.rb b/app/controllers/daily_timelines_controller.rb index 4dccb21..171289b 100644 --- a/app/controllers/daily_timelines_controller.rb +++ b/app/controllers/daily_timelines_controller.rb @@ -23,65 +23,4 @@ def show end end - # # GET /daily_timelines/new - # # GET /daily_timelines/new.xml - # def new - # @daily_timeline = DailyTimeline.new - # - # respond_to do |format| - # format.html # new.html.erb - # format.xml { render :xml => @daily_timeline } - # end - # end - # - # # GET /daily_timelines/1/edit - # def edit - # @daily_timeline = DailyTimeline.find(params[:id]) - # end - # - # # POST /daily_timelines - # # POST /daily_timelines.xml - # def create - # @daily_timeline = DailyTimeline.new(params[:daily_timeline]) - # - # respond_to do |format| - # if @daily_timeline.save - # flash[:notice] = 'DailyTimeline was successfully created.' 
- # format.html { redirect_to(@daily_timeline) } - # format.xml { render :xml => @daily_timeline, :status => :created, :location => @daily_timeline } - # else - # format.html { render :action => "new" } - # format.xml { render :xml => @daily_timeline.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # PUT /daily_timelines/1 - # # PUT /daily_timelines/1.xml - # def update - # @daily_timeline = DailyTimeline.find(params[:id]) - # - # respond_to do |format| - # if @daily_timeline.update_attributes(params[:daily_timeline]) - # flash[:notice] = 'DailyTimeline was successfully updated.' - # format.html { redirect_to(@daily_timeline) } - # format.xml { head :ok } - # else - # format.html { render :action => "edit" } - # format.xml { render :xml => @daily_timeline.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # DELETE /daily_timelines/1 - # # DELETE /daily_timelines/1.xml - # def destroy - # @daily_timeline = DailyTimeline.find(params[:id]) - # @daily_timeline.destroy - # - # respond_to do |format| - # format.html { redirect_to(daily_timelines_url) } - # format.xml { head :ok } - # end - # end end diff --git a/app/controllers/daily_trends_controller.rb b/app/controllers/daily_trends_controller.rb index 30d7456..65beffd 100644 --- a/app/controllers/daily_trends_controller.rb +++ b/app/controllers/daily_trends_controller.rb @@ -30,65 +30,4 @@ def show end end - # # GET /daily_trends/new - # # GET /daily_trends/new.xml - # def new - # @daily_trend = DailyTrend.new - # - # respond_to do |format| - # format.html # new.html.erb - # format.xml { render :xml => @daily_trend } - # end - # end - # - # # GET /daily_trends/1/edit - # def edit - # @daily_trend = DailyTrend.find(params[:id]) - # end - # - # # POST /daily_trends - # # POST /daily_trends.xml - # def create - # @daily_trend = DailyTrend.new(params[:daily_trend]) - # - # respond_to do |format| - # if @daily_trend.save - # flash[:notice] = 'DailyTrend was successfully 
created.' - # format.html { redirect_to(@daily_trend) } - # format.xml { render :xml => @daily_trend, :status => :created, :location => @daily_trend } - # else - # format.html { render :action => "new" } - # format.xml { render :xml => @daily_trend.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # PUT /daily_trends/1 - # # PUT /daily_trends/1.xml - # def update - # @daily_trend = DailyTrend.find(params[:id]) - # - # respond_to do |format| - # if @daily_trend.update_attributes(params[:daily_trend]) - # flash[:notice] = 'DailyTrend was successfully updated.' - # format.html { redirect_to(@daily_trend) } - # format.xml { head :ok } - # else - # format.html { render :action => "edit" } - # format.xml { render :xml => @daily_trend.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # DELETE /daily_trends/1 - # # DELETE /daily_trends/1.xml - # def destroy - # @daily_trend = DailyTrend.find(params[:id]) - # @daily_trend.destroy - # - # respond_to do |format| - # format.html { redirect_to(daily_trends_url) } - # format.xml { head :ok } - # end - # end end diff --git a/app/controllers/pages_controller.rb b/app/controllers/pages_controller.rb index 18de11d..eec68e0 100755 --- a/app/controllers/pages_controller.rb +++ b/app/controllers/pages_controller.rb @@ -7,16 +7,6 @@ class PagesController < ApplicationController caches_page :show caches_page :csv - - # def to_param - # "#{self.id}-#{self.title.parameterize}" - # end - - # def to_param - # require 'unicode' - # "#{id}"+Unicode::normalize_KD("-"+title+"-").downcase.gsub(/[^a-z0-9\s_-]+/,'').gsub(/[\s_-]+/,'-')[0..-2] - # end - def auto_complete_for_search_query # look for autosuggest results in memcached @@ -30,27 +20,12 @@ def index if params[:search] @pages = Page.title_like(params["search"]["query"]).paginate(:page => params[:page], :order => 'monthly_trend DESC', :per_page => APP_CONFIG['articles_per_page']) else - @pages = Page.paginate(:page => params[:page], 
:conditions => ["pages.id NOT IN (?)", APP_CONFIG['blacklist']], :order => 'monthly_trend DESC', :per_page => APP_CONFIG['articles_per_page']) + @pages = Page.paginate(:page => params[:page], :conditions => ["pages.id NOT IN (?) and pages.id NOT IN (select page_id from featured_pages)", APP_CONFIG['blacklist']], :order => 'monthly_trend DESC', :per_page => APP_CONFIG['articles_per_page']) end - - # TODO: News @page needs to tie to this if selection random article... - # # random mover - # @page = Page.find(:all,:limit => 20, :order => 'monthly_trend DESC', :conditions => ["id NOT IN (?)", APP_CONFIG['blacklist']] ).rand - - # # Top Mover - # @page = Page.find(:first, :order => 'monthly_trend DESC', :conditions => ["id NOT IN (?)", APP_CONFIG['blacklist']] ) # random rising, rotates @page = DailyTrend.find(:all, :limit => 20 , :order => 'trend DESC', :conditions => ["page_id NOT IN (?)", APP_CONFIG['blacklist']] ).rand.page - - # # fastest rising - # @page = DailyTrend.find(:first, :order => 'trend DESC', :conditions => ["page_id NOT IN (?)", APP_CONFIG['blacklist']] ).page - - # @rising = DailyTrend.find(:all, :limit => 20, :order => 'trend DESC') - # @dropping = DailyTrend.find(:all, :limit => 6, :order => 'trend ASC') - - - + unless params[:page] params[:page]='1' end @@ -60,9 +35,6 @@ def index format.xml { render :xml => @pages } format.atom { render :layout => false} end - - - end # GET /pages/1 @@ -88,68 +60,5 @@ def csv :disposition => 'attachment' end - - - # # GET /pages/new - # # GET /pages/new.xml - # def new - # @page = Page.new - # - # respond_to do |format| - # format.html # new.html.erb - # format.xml { render :xml => @page } - # end - # end - # - # # GET /pages/1/edit - # def edit - # @page = Page.find(params[:id]) - # end - # - # # POST /pages - # # POST /pages.xml - # def create - # @page = Page.new(params[:page]) - # - # respond_to do |format| - # if @page.save - # flash[:notice] = 'Page was successfully created.' 
- # format.html { redirect_to(@page) } - # format.xml { render :xml => @page, :status => :created, :location => @page } - # else - # format.html { render :action => "new" } - # format.xml { render :xml => @page.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # PUT /pages/1 - # # PUT /pages/1.xml - # def update - # @page = Page.find(params[:id]) - # - # respond_to do |format| - # if @page.update_attributes(params[:page]) - # flash[:notice] = 'Page was successfully updated.' - # format.html { redirect_to(@page) } - # format.xml { head :ok } - # else - # format.html { render :action => "edit" } - # format.xml { render :xml => @page.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # DELETE /pages/1 - # # DELETE /pages/1.xml - # def destroy - # @page = Page.find(params[:id]) - # @page.destroy - # - # respond_to do |format| - # format.html { redirect_to(pages_url) } - # format.xml { head :ok } - # end - # end end diff --git a/app/controllers/weekly_trends_controller.rb b/app/controllers/weekly_trends_controller.rb index 3b4683e..cebf846 100644 --- a/app/controllers/weekly_trends_controller.rb +++ b/app/controllers/weekly_trends_controller.rb @@ -21,65 +21,4 @@ def show end end - # # GET /weekly_trends/new - # # GET /weekly_trends/new.xml - # def new - # @weekly_trend = WeeklyTrend.new - # - # respond_to do |format| - # format.html # new.html.erb - # format.xml { render :xml => @weekly_trend } - # end - # end - # - # # GET /weekly_trends/1/edit - # def edit - # @weekly_trend = WeeklyTrend.find(params[:id]) - # end - # - # # POST /weekly_trends - # # POST /weekly_trends.xml - # def create - # @weekly_trend = WeeklyTrend.new(params[:weekly_trend]) - # - # respond_to do |format| - # if @weekly_trend.save - # flash[:notice] = 'WeeklyTrend was successfully created.' 
- # format.html { redirect_to(@weekly_trend) } - # format.xml { render :xml => @weekly_trend, :status => :created, :location => @weekly_trend } - # else - # format.html { render :action => "new" } - # format.xml { render :xml => @weekly_trend.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # PUT /weekly_trends/1 - # # PUT /weekly_trends/1.xml - # def update - # @weekly_trend = WeeklyTrend.find(params[:id]) - # - # respond_to do |format| - # if @weekly_trend.update_attributes(params[:weekly_trend]) - # flash[:notice] = 'WeeklyTrend was successfully updated.' - # format.html { redirect_to(@weekly_trend) } - # format.xml { head :ok } - # else - # format.html { render :action => "edit" } - # format.xml { render :xml => @weekly_trend.errors, :status => :unprocessable_entity } - # end - # end - # end - # - # # DELETE /weekly_trends/1 - # # DELETE /weekly_trends/1.xml - # def destroy - # @weekly_trend = WeeklyTrend.find(params[:id]) - # @weekly_trend.destroy - # - # respond_to do |format| - # format.html { redirect_to(weekly_trends_url) } - # format.xml { head :ok } - # end - # end end diff --git a/app/models/featured_page.rb b/app/models/featured_page.rb new file mode 100644 index 0000000..f47b3a1 --- /dev/null +++ b/app/models/featured_page.rb @@ -0,0 +1,3 @@ +class FeaturedPage < ActiveRecord::Base + belongs_to :page +end diff --git a/app/models/page.rb b/app/models/page.rb index 9e9ccb0..62e1fe0 100755 --- a/app/models/page.rb +++ b/app/models/page.rb @@ -4,7 +4,7 @@ class Page < ActiveRecord::Base has_one :person has_one :company has_one :weekly_trend - named_scope :title_like, lambda { |query| { :conditions => ['title like ? and id NOT IN (?)', "#{query}%", APP_CONFIG['blacklist']], :order => '`monthly_trend` DESC', :limit => 14 } } + named_scope :title_like, lambda { |query| { :conditions => ['title like ? and id NOT IN (?) 
and id NOT IN (select page_id from featured_pages)', "#{query}%", APP_CONFIG['blacklist']], :order => '`monthly_trend` DESC', :limit => 14 } } named_scope :full_title_like, lambda { |query| { :conditions => ['title like ? and id NOT IN (?)', "%#{query}%", APP_CONFIG['blacklist']], :order => '`monthly_trend` DESC', :limit => 14 } } diff --git a/app/views/pages/index.html.erb b/app/views/pages/index.html.erb index d613155..d58e58d 100755 --- a/app/views/pages/index.html.erb +++ b/app/views/pages/index.html.erb @@ -71,7 +71,7 @@ Trend Article - <% DailyTrend.find(:all, :limit => 20,:conditions => ["page_id NOT IN (?)", APP_CONFIG['blacklist']], :order => 'trend DESC').each_with_index do |rising, index| %> + <% DailyTrend.find(:all, :limit => 20,:conditions => ["page_id NOT IN (?) and page_id NOT IN (select page_id from featured_pages)", APP_CONFIG['blacklist']], :order => 'trend DESC').each_with_index do |rising, index| %> <%= link_to image_tag(rising.page.sparkline('888888', 30).to_url, :alt => 'Rising Trend data for ' + rising.page.title), rising.page %> <%= link_to truncate(rising.page.title,20), rising.page, :class => "title" %> diff --git a/config/deploy.rb.sample b/config/deploy.rb.sample index cfca4e6..a81ac4c 100644 --- a/config/deploy.rb.sample +++ b/config/deploy.rb.sample @@ -56,7 +56,7 @@ set :ec2onrails_config, { # Any extra Ubuntu packages to install if desired # If you don't want to install extra packages then remove this. 
- :packages => ["logwatch", "imagemagick"], + :packages => ["logwatch", "imagemagick", "s3cmd", "python-beautifulsoup", "python-mysqldb"], # Any extra RubyGems to install if desired: can be "gemname" or if a # particular version is desired "gemname -v 1.0.1" diff --git a/db/migrate/20090621034525_create_featured_pages.rb b/db/migrate/20090621034525_create_featured_pages.rb new file mode 100644 index 0000000..06edfc8 --- /dev/null +++ b/db/migrate/20090621034525_create_featured_pages.rb @@ -0,0 +1,13 @@ +class CreateFeaturedPages < ActiveRecord::Migration + def self.up + create_table :featured_pages do |t| + t.references :page + + t.timestamps + end + end + + def self.down + drop_table :featured_pages + end +end diff --git a/db/schema.rb b/db/schema.rb index 2a16584..c60573f 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -9,7 +9,7 @@ # # It's strongly recommended to check this file into your version control system. -ActiveRecord::Schema.define(:version => 20090619212730) do +ActiveRecord::Schema.define(:version => 20090621034525) do create_table "companies", :force => true do |t| t.integer "page_id" @@ -38,6 +38,12 @@ add_index "daily_trends", ["page_id", "trend"], :name => "daily_trends_index" + create_table "featured_pages", :force => true do |t| + t.integer "page_id" + t.datetime "created_at" + t.datetime "updated_at" + end + create_table "new_daily_timelines", :force => true do |t| t.integer "page_id" t.text "dates" diff --git a/lib/scripts/daily_load.sh b/lib/scripts/daily_load.sh index 92030a4..e5d22d6 100644 --- a/lib/scripts/daily_load.sh +++ b/lib/scripts/daily_load.sh @@ -15,9 +15,11 @@ echo MAILTO is $MAILTO cd /mnt && tar -xzvf trendsdb.tar.gz RESULTSET=`mysql -u root trendingtopics_production -e "select count(*) from information_schema.TABLES where Table_Name='new_pages' and TABLE_SCHEMA='trendingtopics_production';"` - NEWCOUNT=`echo $RESULTSET | awk '{print $2}'` +RESULTSET=`mysql -u root trendingtopics_production -e "select LEFT(RIGHT(dates,9),8) 
from daily_timelines where page_id=29812;"` +LASTDATE=`echo $RESULTSET | awk '{print $2}'` + # rename backup if staging tables don't exist: if [ $NEWCOUNT -eq 0 ]; then echo renaming backup tables to staging tables @@ -55,6 +57,11 @@ MAXDATE=`echo $RESULTSET | awk '{print $2}'` # echo $LASTDATE # 20090612 +echo loading featured pages +cd /mnt +python /mnt/app/current/lib/scripts/generate_featured_pages.py -d $MAXDATE > /mnt/featured_pages.txt +time mysql -u root trendingtopics_production < /mnt/app/current/lib/sql/load_featured_pages.sql + echo archiving the data to S3 # back up the trendsdb data, this copy will be pulled by the next daily job time s3cmd --config=/root/.s3cfg put trendsdb.tar.gz s3://$MYBUCKET/archive/trendsdb.tar.gz
diff --git a/lib/scripts/generate_featured_pages.py b/lib/scripts/generate_featured_pages.py
new file mode 100644
index 0000000..04f8369
--- /dev/null
+++ b/lib/scripts/generate_featured_pages.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+generate_featured_pages.py
+
+Created by Peter Skomoroch on 2009-06-20.
+Copyright (c) 2009 Data Wrangling LLC. All rights reserved.
+"""
+
+import sys
+import getopt
+import urllib
+import urllib2
+from BeautifulSoup import BeautifulSoup
+import datetime
+import MySQLdb
+
+# TODO pass as parameters
+MYSERVER = 'trendingtopics.org'
+DBNAME = 'trendingtopics_production'
+USER = 'root'
+PASSWD = ''
+
+help_message = '''
+Dynamically creates a blacklist of pageids based on given date by removing
+wikipedia featured articles and "on this day" references from the main page.
+
+Usage:
+
+$ python generate_featured_pages.py -d 20090618 > featured_pages.txt
+
+'''
+
+class Usage(Exception):
+    """Command line error; main() prints msg to stderr and returns 2."""
+    def __init__(self, msg):
+        self.msg = msg
+
+def pageid(title):
+    """
+    Look up pages.id for an article title.
+
+    Best effort: returns 1 when the title is not found or the lookup fails
+    (quick hack, the rails app might not be running yet).
+    """
+    result = 1
+    conn = None
+    try:
+        try:
+            conn = MySQLdb.connect(db=DBNAME, user=USER, passwd=PASSWD)
+            cursor = conn.cursor()
+            # Titles are scraped from the web and may contain quotes, so bind
+            # the value as a parameter instead of formatting it into the SQL.
+            cursor.execute("SELECT id FROM pages WHERE title = %s;", (title,))
+            row = cursor.fetchone()
+            if row is not None:
+                result = row[0]
+            cursor.close()
+        finally:
+            # close the connection even when the query raised
+            if conn is not None:
+                conn.close()
+    except Exception:
+        result = 1
+    return result
+
+def get_titles(soup):
+    """
+    Extract wikipedia links from soup instance
+
+    Keeps /wiki links whose href contains no ':' and returns them unquoted,
+    with underscores turned into spaces.
+    """
+    # href=True skips <a> tags without an href, which would raise KeyError
+    anchors = soup.findAll('a', href=True)
+    links = [x['href'] for x in anchors if x['href'][0:5]=='/wiki']
+    ns_zero_urls = [x.replace('/wiki/','') for x in links if x.find(':') == -1]
+    titles = [urllib.unquote_plus(x.replace('_', ' ')) for x in ns_zero_urls]
+    return titles
+
+def soupify_url(url):
+    """Fetch url with the TrendingTopics user agent and parse the response."""
+    opener = urllib2.build_opener()
+    opener.addheaders = [('User-agent', 'TrendingTopics/0.1')]
+    page = opener.open( url ).read()
+    soup = BeautifulSoup(page)
+    return soup
+
+def _month_day(date, sep=' '):
+    """
+    Format date as full month name + sep + day, e.g. 'June 5'.
+
+    The day is not zero padded; strftime("%d") would give 'June 05'.
+    """
+    return '%s%s%d' % (date.strftime("%B"), sep, date.day)
+
+def featured_pages(date):
+    """Titles linked from the featured article page for the given date."""
+    base = 'http://en.wikipedia.org/wiki/Wikipedia:Today%27s_featured_article/'
+    # wikipedia names these pages like June_5,_2009 (no zero padded day)
+    url = base + '%s,_%d' % (_month_day(date, '_'), date.year)
+    soup = soupify_url(url)
+    div = soup.findAll(id="bodyContent")
+    titles = get_titles(div[0])
+    return titles
+
+def featured_pictures(date):
+    """Titles linked from the picture of the day template for the given date."""
+    base = 'http://en.wikipedia.org/wiki/Template:POTD/'
+    url = base + date.strftime("%Y-%m-%d")
+    soup = soupify_url(url)
+    table = soup.findAll(cellspacing="5")
+    titles = get_titles(table[0])
+    return titles
+
+def date_pages(date):
+    """Title of the calendar day article itself, e.g. ['June 5']."""
+    return [_month_day(date)]
+
+def anniversaries(date):
+    """Titles linked from the selected anniversaries page for the given day."""
+    base = 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/'
+    url = base + _month_day(date, '_')
+    soup = soupify_url(url)
+    div = soup.findAll(id="bodyContent")
+    titles = get_titles(div[0])
+    return titles
+
+def titles_for_date(date):
+    """All titles collected from the four sources above for one date."""
+    titles = featured_pages(date)
+    titles.extend(featured_pictures(date))
+    titles.extend(date_pages(date))
+    titles.extend(anniversaries(date))
+    return titles
+
+def _parse_date(datestr):
+    """Turn a YYYYMMDD string into a datetime.date, or raise Usage."""
+    try:
+        return datetime.date(int(datestr[0:4]), int(datestr[4:6]), int(datestr[6:8]))
+    except ValueError:
+        raise Usage("invalid date %r, expected YYYYMMDD" % datestr)
+
+def main(argv=None):
+    """
+    Write one page id per line to stdout for every title featured on the
+    given date and on the two days before it.
+
+    Returns 2 after printing a message to stderr on a usage error.
+    """
+    if argv is None:
+        argv = sys.argv
+    try:
+        try:
+            opts, args = getopt.getopt(argv[1:], "hd:v", ["help", "date="])
+        except getopt.error, msg:
+            raise Usage(msg)
+
+        # option processing (-v is accepted but has no effect)
+        maxdate = None
+        for option, value in opts:
+            if option in ("-h", "--help"):
+                raise Usage(help_message)
+            if option in ("-d", "--date"):
+                maxdate = _parse_date(value)
+        if maxdate is None:
+            # without -d there is no date to build the urls from
+            raise Usage("missing required option -d YYYYMMDD")
+
+        # find urls recently featured on main page of wikipedia
+        titles = titles_for_date(maxdate)
+        titles.extend(titles_for_date(maxdate - datetime.timedelta(1)))
+        titles.extend(titles_for_date(maxdate - datetime.timedelta(2)))
+
+        # generate blacklist of page_ids:
+        pageids = [pageid(x) for x in set(titles)]
+        for x in pageids:
+            try:
+                sys.stdout.write('%s\n' % x)
+            except:
+                pass
+
+    except Usage, err:
+        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
+        print >> sys.stderr, "\t for help use --help"
+        return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git
a/lib/sql/load_featured_pages.sql b/lib/sql/load_featured_pages.sql
new file mode 100644
index 0000000..aaa6f36
--- /dev/null
+++ b/lib/sql/load_featured_pages.sql
@@ -0,0 +1,9 @@
+TRUNCATE TABLE new_featured_pages;
+
+-- generate_featured_pages.py writes one page id per line, and featured_pages
+-- has no trend or error column, so page_id is the only field to load.
+LOAD DATA LOCAL INFILE '/mnt/featured_pages.txt'
+INTO TABLE new_featured_pages
+FIELDS TERMINATED BY '\t'
+LINES TERMINATED BY '\n'
+(page_id);
\ No newline at end of file
diff --git a/lib/sql/load_history.sql b/lib/sql/load_history.sql index 24421ae..5337a54 100644 --- a/lib/sql/load_history.sql +++ b/lib/sql/load_history.sql @@ -66,6 +66,7 @@ CALL dropindex('new_pages', 'pages_autocomp_index'); CALL dropindex('new_pages', 'pages_trend_index'); CALL dropindex('new_daily_timelines', 'timeline_pageid_index'); + set foreign_key_checks=0; set sql_log_bin=0; set unique_checks=0; @@ -103,6 +104,7 @@ create index pages_autocomp_index on new_pages (title(64), total_pageviews); -- Query OK, 2783939 rows affected (6 min 20.95 sec) -- Records: 2783939 Duplicates: 0 Warnings: 0 + -- for main pagination create index pages_trend_index on new_pages (monthly_trend); -- Query OK, 2783939 rows affected (1 min 25.65 sec) diff --git a/lib/sql/rename_backup_to_new.sql b/lib/sql/rename_backup_to_new.sql index 90e8f53..0f3d593 100644 --- a/lib/sql/rename_backup_to_new.sql +++ b/lib/sql/rename_backup_to_new.sql @@ -1,3 +1,4 @@ RENAME TABLE backup_pages TO new_pages; RENAME TABLE backup_daily_timelines TO new_daily_timelines; -RENAME TABLE backup_daily_trends TO new_daily_trends; \ No newline at end of file +RENAME TABLE backup_daily_trends TO new_daily_trends; +RENAME TABLE backup_featured_pages TO new_featured_pages; \ No newline at end of file diff --git a/lib/sql/rename_new_to_live.sql b/lib/sql/rename_new_to_live.sql index 2fce608..d99a3c5 100644 --- a/lib/sql/rename_new_to_live.sql +++ b/lib/sql/rename_new_to_live.sql @@ -1,3 +1,4 @@ RENAME TABLE pages TO backup_pages, new_pages TO pages; RENAME TABLE daily_timelines TO backup_daily_timelines, 
new_daily_timelines TO daily_timelines; RENAME TABLE daily_trends TO backup_daily_trends, new_daily_trends TO daily_trends; +RENAME TABLE featured_pages TO backup_featured_pages, new_featured_pages TO featured_pages; \ No newline at end of file diff --git a/test/fixtures/featured_pages.yml b/test/fixtures/featured_pages.yml new file mode 100644 index 0000000..bcfa029 --- /dev/null +++ b/test/fixtures/featured_pages.yml @@ -0,0 +1,7 @@ +# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html + +one: + page: + +two: + page: diff --git a/test/unit/featured_page_test.rb b/test/unit/featured_page_test.rb new file mode 100644 index 0000000..ed1e559 --- /dev/null +++ b/test/unit/featured_page_test.rb @@ -0,0 +1,8 @@ +require 'test_helper' + +class FeaturedPageTest < ActiveSupport::TestCase + # Replace this with your real tests. + test "the truth" do + assert true + end +end