diff --git a/TESTPLAN.md b/TESTPLAN.md deleted file mode 100644 index ce6ffd0d..00000000 --- a/TESTPLAN.md +++ /dev/null @@ -1,421 +0,0 @@ -# TCAT Test Plan - -**DRAFT** - -## Introduction - -This is a test plan for TCAT. - -### Purpose - -This document describes how to test a TCAT installation to behave *as -expected* by the authors and users. It exists as a procedures to be -manually followed, but could be automated in the future. - -### Scope - -This documentation describes testing and (detailed) verification of -TCAT functionality. It does not describe unit testing yet. Unit -testing is a more micro-approach where we verify PHP function input -and output. We could also opt for a hybrid testing model, with some -important unit tests combined with tests of functionality as it is -described here. - -### Conformance - -The TCAT software is defined to have passed this test plan if it -passes every test in it and every alternative test. - -A test is defined to have **failed** if any of the expected items did -not occur, otherwise it is defined to have **passed**. That is, all -of the expected items must occur for the test to pass. - -### Overview - -The tests in this test plan have been grouped into these sections: - -- Install tests -- Capture tests -- Analysis tests -- Controlling capture tests -- URL expansion tests -- Rate limit tests -- Geographic search tests - -## Install tests - -### Install track mode - -Pre-requsites: - -- Host machine where TCAT has not been installed. -- A copy of the _helpers/tcat-install-linux.sh_ script. -- Configuration file with valid Twitter API credentials. It should - only contain these parameters and no others: - - CONSUMERKEY - - CONSUMERSECRET - - USERTOKEN - - USERSECRET - -Procedure: - -0. 
Run the TCAT install script in batch mode (-b), saving the TCAT - logins to a file (-l) and using the configuration file: - - sudo ./tcat-install-linux.sh -b -l -c myTCAT.conf - - Note: this will configure TCAT with the default Tweet capture mode - of "track phrases and keywords". - -0. Wait for the install script to finish running (a few minutes). - -0. Check the exit status by running `echo $?` - -0. Expect exit status to be zero (i.e. TCAT installed successfully). - -Note: the following tests ("Cron configured", "Database created" and -"Apache configured") are probably redundant, because other tests -(e.g. "Create a query bin") will fail if these tests failed. - -### Cron configured - -Pre-requesites: - -- Install track mode - -Procedure: - -0. Run `ls -l /etc/cron.d/tcat`. -0. Expect file exists. -0. Expect file owner and group to be both _root_. -0. Expect file permissions to be "-rw-r--r--". -0. Examine the contents by running `cat /etc/cron.d/tcat`. -0. Expect /var/www/dmi-tcat/capture/stream/controller.php to be run - very regularly (e.g. every minute). -0. Expect /var/www/dmi-tcat/helpers/urlexpand.sh to be run - regularly (e.g. every hour), if TCAT was installed with URL expansion. - -### Database created - -Pre-requsites: - -- Install track mode - -Procedure: - -0. Run `sudo mysql --defaults-file=/etc/mysql/conf.d/tcat-root.cnf` - (_sudo_ is required because the config file is only readable by the - _mysql_ user) -0. Expect the "mysql>" prompt to appear. -0. Enter the command `show databases;` -0. Expect a database called "twittercapture" to exist. -0. Enter the command `use twittercapture;` -0. Enter the command `show tables;` -0. Expect there to be no tables in the database (i.e. "Empty set" to be printed out). -0. Enter the command `quit`. - -Note: the controller.php does not work at this stage, because -there are no tables in the database. - -### Apache configured - -Pre-requsites: - -- Install track mode - -Procedure: - -0. 
Run `curl -v http://localhost/capture/`. -0. Expect HTTP status to be "401 Unauthorized". -0. Run `curl -v http://localhost/analysis/`. -0. Expect HTTP status to be "401 Unauthorized". - -Alternatively, run these commands: - -```sh -curl --silent --output /dev/null --write-out '%{http_code}\n' http://localhost/capture/ -curl --silent --output /dev/null --write-out '%{http_code}\n' http://localhost/analysis/ -``` - -## Capture tests - -In the following, replace localhost in the URLs with the hostname or -IP address of the host machine. The correct URLs should have been -printed out when the install script finished running. - -### Login as admin - -0. Visit http://localhost/capture/ in a Web browser. -0. Expect to be prompted by the browser to login to DMI-TCAT. -0. Enter "admin" and its password. The password is the one supplied to - the installer or was generated and printed out at the end of the - installation process (and can be found in - _/etc/apache2/tcat-login-admin.txt_ if the `-l` option was used - with the installer). -0. Expect the DMI-TCAT query manager page to appear. - -### Create a query bin - -Pre-requesites: - -- Login as admin -- A query bin called "test1" has not been created. - -Procedure: - -0. Expect the "New query bin" form is shown. - -0. Expect there is no query bin called "test1" listed in the table of - query bins. - -0. Fill in the "New query bin" form: - - Bin type: keyword track - - Bin name: test1 - - Phrases to track: apple - - Optional notes: (leave blank) - -0. Press the "add query bin" button. - -0. Expect a dialog box to appear, asking if you are sure you want to create - the query bin. - -0. Press the "OK" button. - -0. Expect a dialog box to appear, saying the new query bin has been created. - -0. Press the "Close" button to close the dialog box. - -0. Expect the new query to be listed in the table of query bins. - - 0. Expect the name to be "test1". - 0. Expect the "active" to be "1". - 0. Expect the "type" to be "track". - 0. 
Expect the "queries" to be "apple". - 0. Expect the "no. tweets" to be "0" for the new query bin. - 0. Expect the "Periods in which the query bin was active" to be - the time the "add query bin" button was pressed till "now". - -0. Wait at least one minute (the interval of the _controller.php_ - cron job). - -0. Refresh the capture page. - -0. Expect the "no. tweets" to have increased to a non-zero value. - -### Query bin database tables created - -0. Run - `sudo mysql --defaults-file=/etc/mysql/conf.d/tcat-root.cnf twittercapture` - (_sudo_ is required because the config file is only readable by the - _mysql_ user) -0. Expect the "mysql>" prompt to appear. -0. Enter the command `show tables;` -0. Expect 9 tables with names starting with "tcat_". -0. Expect 7 tables with names starting with the query bin name followed - by an underscore (i.e. "test1_"). -0. Enter the "quit" command. - -### Query bin log files - -0. Run `ls -l /var/www/dmi-tcat/logs`. -0. Expect directory to contain a file called controller.log. -0. Expect controller.log to have an owner and group of "tcat". -0. Expect controller.log to be non-empty -0. Expect directory to contain a file called track.error.log. -0. Expect track.error.log to have an owner and group of "tcat". - -_TODO: Is it an issue that track.error.log is non-empty? As an error -file, it should be empty unless something goes wrong, otherwise -important error messages get lost in the noise. Is there supposed to -be a (non-error) track log file?_ - -## Analysis tests - -### Select tweets - -Pre-requesites: - -- Query bin created -- Already logged in as the "admin" or "tcat" user. - -Procedure: - -0. Visit http://localhost/analysis/ in a Web browser. -0. Expect the analysis page to appear. -0. Expect the pop-up menu to contain the "test1" dataset. -0. Expect the pop-up menu to indicate there a non-zero number of tweets. -0. Change the start and end dates to include the tweet collection period. - Leave all other fields blank. -0. 
Press the "update overview" button. -0. Expect a pie chart to appear showing tweets with links and those without - links. -0. Expect lines to appear on the time-based graph. - -### Graph resolution - -0. In the "graph resolution" select the "hours" radio button. -0. Press the "update graph" button. -0. Expect a more detailed graph to appear. -0. In the "graph resolution" select the "minutes" radio button. -0. Press the "update graph" button. -0. Expect an even more detailed graph to appear. - -Alternatives: - -- If not logged in (as admin or tcat) the browser will not prompt - for the user and password. Either "admin" or "tcat" user - accounts can be used. - -### Export statistics - -Pre-requesites: - -- Select tweets - -Procedure: - -0. In the "Export selected data" section, select the output format - of "CSV (comma-separated)". -0. In the "Tweet statistics and activity metrics" subsection, - select the "overall" radio button. -0. In the "Tweet stats" sub-subsection, click on the "launch" link. -0. Expect a tweet stats page to appear. -0. Download the file from the link on the page. -0. Expect... - -Alternatives: - -Repeat with all the combinations of: - -- Other sub-subsections (i.e. the other "launch" links). -- Other statistics groupings. -- Output format of "TSV (tab-separated)". - -### Export tweets - -Pre-requesites: - -- Select tweets - -Procedure: - -0. In the "Export selected data" section, select the output format - of "CSV (comma-separated)". -0. In the "Tweet exports" subsection, select none of the - additional column check boxes. -0. In the "Random set of tweets from selection" sub-subsection, - click on the "launch" link. -0. Expect an export tweets page to appear. -0. Download the file from the link on the page. -0. Expect... - -Alternatives: - -Repeat with all the combinations of: - -- Other sub-subsections (i.e. the other "launch"/"export" links). -- Other additional columns selected. -- Output format of "TSV (tab-separated)". 
- -### Networks - -TBD. - -### Experimental - -TBD. - -## Controlling capture tests - -### Stop capture - -Pre-requsites: - -- Capture - -Procedure: - -0. Visit http://localhost/capture/ in a Web browser. -0. Click on the "stop" link for "test1". -0. Expect a dialog box to appear, to confirm stopping the capture. -0. Press the "yes" button. -0. Expect a dialog box to appear, saying the query bin has been stopped. -0. Press the "Close" button to close the dialog box. -0. Expect the "stop" link has been replaced by a "start" link. -0. Show the time when the controller.php was excuted by running - `ls -l /var/www/dmi-tcat/logs/controller.log` -0. Take note of the value of the "no. tweets" for "test1". -0. Take note of the finish time for the "periods in which the query bin was - active" for "test1". -0. Wait at least one minute (the interval of the _controller.php_ - cron job). -0. Refresh the capture page. -0. Expect the "no. tweets" to not have changed. -0. Expect the finish time to not have changed. -0. Show the time when the controller.php was excuted by running - `ls -l /var/www/dmi-tcat/logs/controller.log` -0. Expect the time is newer than when previously checked. - That is, the controller.php is still being executed, but it is only not - running that particular query bin. - -Alternative: - -Create another query bin that is not stopped. It should be updated -with new tweets while the stopped query bin is not. - -### Restarting capture - -Pre-requsites: - -- Stop capture - -0. Click on the "start" link for "test1". -0. Expect a dialog box to appear, to confirm the start of capturing. -0. Press the "yes" button. -0. Expect a dialog box to appear, saying the query bin has been started. -0. Press the "close" button. -0. Expect the "start" link has been replaced by a "stop" link. -0. Take note of the value of the "no. tweets" for "test1". -0. Take note of the finish time for the "periods in which the query bin was - active" for "test1". -0. 
Wait at least one minute (the interval of the _controller.php_ - cron job). -0. Refresh the capture page. -0. Expect the "no. tweets" to have increased. -0. Expect the finish time to have advanced. - -Alternative: - -Check the graph. - - -## URL expansion tests - -TBD. - -## Rate limit tests - -TBD. - -## Geographic search tests - -TBD. - -## To do - -- Tests for other Tweet capture modes (i.e. following users and one - percent capture). - - -From dentoir's draft from 7 April 2016: -https://github.com/digitalmethodsinitiative/dmi-tcat/issues/170#issuecomment-206898863 - -> - execute the command line script capture/search/search.php with specially crafted parameters to search a unique, long hashkey, which should return a single, real tweet, with a known ID and known content (or alternatively, multiple IDs with known content) -> - curl the TCAT analysis URL with special parameters: do we get a correct overview, with the tweet(s) found? -> - curl the TCAT analysis URL with special *search* parameters: do we get a correct overview, with the tweet(s) found? -> - curl several or all of the TCAT analysis scripts with special parameters, downloading the .tsv or .csv files and comparing them to output we know is correct -> - -Note: Currently _url_ alone might not be sufficient, since tweet -selection currently depends on JavaScript on the page. Will need a -REST-ful API for _curl_ to work. \ No newline at end of file diff --git a/analysis/common/config.php b/analysis/common/config.php index d179c01e..ada5bb97 100644 --- a/analysis/common/config.php +++ b/analysis/common/config.php @@ -1,6 +1,7 @@ 0) { + $sql = "SELECT value FROM tcat_status WHERE variable = '" . mysql_real_escape_string($variable) . 
"'"; + $sqlresults = mysql_query($sql); + if ($res = mysql_fetch_assoc($sqlresults)) { + return $res['value']; + } + } + return null; +} + // Output format: {dataset}-{startdate}-{enddate}-{query}-{exclude}-{from_user_name}-{from_user_lang}-{url_query}-{module_name}-{module_settings}-{hash}.{filetype} function get_filename_for_export($module, $settings = "", $filetype = "csv") { global $resultsdir, $esc; @@ -784,7 +807,7 @@ function get_hash_tags($msg) { function get_all_datasets() { global $dataset; $dbh = pdo_connect(); - $rec = $dbh->prepare("SELECT id, querybin, type, active, comments FROM tcat_query_bins WHERE visible = TRUE ORDER BY LOWER(querybin)"); + $rec = $dbh->prepare("SELECT id, querybin, type, active, comments FROM tcat_query_bins WHERE access = " . TCAT_QUERYBIN_ACCESS_OK . " OR access = " . TCAT_QUERYBIN_ACCESS_READONLY . " ORDER BY LOWER(querybin)"); $datasets = array(); if ($rec->execute() && $rec->rowCount() > 0) { while ($res = $rec->fetch()) { diff --git a/analysis/index.php b/analysis/index.php index 55ae9275..336b9bf1 100644 --- a/analysis/index.php +++ b/analysis/index.php @@ -251,11 +251,11 @@ function getExportSettings() { - Startdate: (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS) + Startdate (UTC): (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS) - Enddate: (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS) + Enddate (UTC): (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS) @@ -326,6 +326,8 @@ function updatestatus() { $show_url_export = true; } } + // see whether database is up-to-date to export ratelimit and gap tables + $show_ratelimit_and_gap_export = get_status('ratelimit_database_rebuild') == 2 ? true : false; // see whether the lang table exists $show_lang_export = FALSE; $sql = "SHOW TABLES LIKE '" . $esc['mysql']['dataset'] . "_lang'"; @@ -721,9 +723,24 @@ function updatestatus() {
Use: get a grasp of the most popular media.
+ +
- +

Export an estimation of the number of rate limited tweets in your data

+
Exports a spreadsheet with an estimation of the amount of non-captured tweets in your query due to ratelimit occurrences.
+
Use: gain insight into possible missing data due to hitting the Twitter API rate limits.
 + + 
+ + +
+ +

Export table with potential gaps in your data

+
Exports a spreadsheet with all known data gaps in your current query, during which TCAT was not running or capturing data for this bin.
+
Use: Gain insight into possible missing data due to outages
 + + 

 + + 

+ + + +

Tweet exports

@@ -814,6 +831,8 @@ function updatestatus() { +
+

Networks

diff --git a/analysis/mod.gaps.php b/analysis/mod.gaps.php new file mode 100644 index 00000000..144233b7 --- /dev/null +++ b/analysis/mod.gaps.php @@ -0,0 +1,83 @@ + + + + + + + TCAT :: Export gap data + + + + + + + + + + + +

TCAT :: Export gap data

+ + writeheader(explode(',', $header)); + + // make query + $sql = "SELECT * FROM tcat_error_gap WHERE type = '" . mysql_real_escape_string($bin_type) . "' and + start >= '" . mysql_real_escape_string($_GET['startdate']) . "' and end <= '" . mysql_real_escape_string($_GET['enddate']) . "'"; + // loop over results and write to file + $sqlresults = mysql_query($sql); + if ($sqlresults) { + while ($data = mysql_fetch_assoc($sqlresults)) { + // the query bin must have been active during the gap period, if we want to report it as a possible gap + $sql2 = "SELECT count(*) as cnt FROM tcat_query_bins_phrases WHERE querybin_id = $bin_id and + starttime <= '" . $data["end"] . "' and (endtime >= '" . $data["start"] . "' or endtime is null or endtime = '0000-00-00 00:00:00')"; + $sqlresults2 = mysql_query($sql2); + if ($sqlresults2) { + if ($data2 = mysql_fetch_assoc($sqlresults2)) { + if ($data2['cnt'] > 0) { + $csv->newrow(); + $csv->addfield($data["start"]); + $csv->addfield($data["end"]); + $csv->writerow(); + } + } + } + } + } + $csv->close(); + + echo '
'; + echo 'Your File'; + echo '

' . $filename . '

'; + echo '
'; + ?> + + + diff --git a/analysis/mod.ratelimits.php b/analysis/mod.ratelimits.php new file mode 100644 index 00000000..d246a3b5 --- /dev/null +++ b/analysis/mod.ratelimits.php @@ -0,0 +1,210 @@ + + + + + + + TCAT :: Export ratelimit data + + + + + + + + + + + +

TCAT :: Export ratelimit data

+ + Notice: You have requested rate limit data for a query bin with is not of type "track", or "geotrack". There currently is no export module for ratelimit data of other types.
'; + echo ''; + die(); + } + if ($bin_type == "geotrack") { + // Lookup the earliest entry in the tcat_captured_phrases table for any geotrack bin. Geotrack bins historic rate limit is not reconstructed, + // therefore we decide to not allow this export function for earlier timeframes. + $accept = false; + $sql = "select min(created_at) as earliest from tcat_captured_phrases tcp inner join tcat_query_bins_phrases bp on tcp.phrase_id=bp.phrase_id inner join tcat_query_bins tqb on bp.querybin_id = tqb.id where tqb.type = 'geotrack' and earliest > '" . $esc['datetime']['startdate'] . "'"; + $sqlresults2 = mysql_query($sql); + if ($res2 = mysql_fetch_assoc($sqlresults)) { + if (array_key_exists('earliest', $res2) && is_string($res2['earliest'])) { + $accept = true; + } + } + if ($accept == false) { + echo 'Notice: You have requested rate limit data for a query bin which is of type "geotrack", but for a time period for which we posses no historical data. We cannot handle your request.
'; + echo ''; + die(); + } + } + // TODO: Support these. This shouldn't be difficult, but requires a little different logic. + if ($esc['date']['interval'] == "custom" || $esc['date']['interval'] == "overall") { + echo 'Notice: You have selected an interval type which is not yet supported by this export module.
'; + echo ''; + die(); + } + + // make filename and open file for write + if ($bin_type == "geotrack") { + $module = "rateLimitDataGeo"; + } else { + $module = "ratelimitData"; + } + $module .= "-" . $esc['date']['interval']; + $filename = get_filename_for_export($module); + $csv = new CSV($filename, $outputformat); + + // write header + $header = "querybin,datetime,tweets ratelimited (estimate)"; + $csv->writeheader(explode(',', $header)); + + $sqlInterval = sqlInterval(); $sqlSubset = sqlSubset(); + $sqlGroup = " GROUP BY datepart ASC"; + + // Use native MySQL to create a temporary table with all dateparts. They should be identical to the dateparts we will use in the GROUP BY statement. + // Prepare the string mysql needs in date_add() + $mysqlNativeInterval = "day"; // default $interval = daily + switch ($esc['date']['interval']) { + case "hourly": { $mysqlNativeInterval = "hour"; break; } + case "daily": { $mysqlNativeInterval = "day"; break; } + case "weekly": { $mysqlNativeInterval = "week"; break; } + case "monthly": { $mysqlNativeInterval = "month"; break; } + case "yearly": { $mysqlNativeInterval = "year"; break; } + } + $query = "CREATE TEMPORARY TABLE temp_dates ( date DATETIME )"; + mysql_query($query); + $query = "SET @date = '" . $esc['datetime']['startdate'] . "'"; + mysql_query($query); + for (;;) { + $query = "INSERT INTO temp_dates SELECT @date := date_add(@date, interval 1 $mysqlNativeInterval)"; + mysql_query($query); + // Are we finished? + $query = "SELECT @date > '" . $esc['datetime']['enddate'] . 
"' as finished"; + $rec = mysql_query($query); + if ($res = mysql_fetch_assoc($rec)) { + if ($res['finished'] == '1') { + break; + } + } + } + $dateparts = array(); + $sqlIntervalForDateparts = str_replace("t.created_at", "date", $sqlInterval); + $query = "SELECT $sqlIntervalForDateparts FROM temp_dates"; + $rec = mysql_query($query); + while ($res = mysql_fetch_assoc($rec)) { + $dateparts[] = $res['datepart']; + } + + /* + * measured phrase matches for bin (C) + * Formula for estimates = (A) ratelimited * -------------------------------- + * total unique tweets with matches (B) + */ + + $sqlIntervalForRL = str_replace("t.created_at", "start", $sqlInterval); + $sql_query_a = "SELECT SUM(tweets) as ratelimited, $sqlIntervalForRL FROM tcat_error_ratelimit WHERE start >= '" . $esc['datetime']['startdate'] . "' AND end <= '" . $esc['datetime']['enddate'] . "' $sqlGroup"; + + // This query retrieves the total unique tweets captured, grouped by the requested interval (hourly, daily, ...) + $sql_query_b = "SELECT COUNT(distinct(t.tweet_id)) AS cnt, $sqlInterval FROM tcat_captured_phrases t $sqlSubset $sqlGroup"; + + // Notice: we need to do a INNER JOIN on the querybin table here (to match phrase_id to querybin_id) + $sql_query_c = "SELECT COUNT(distinct(t.tweet_id)) AS cnt, $sqlInterval FROM tcat_captured_phrases t INNER JOIN tcat_query_bins_phrases qbp ON t.phrase_id = qbp.phrase_id $sqlSubset AND qbp.querybin_id = $bin_id $sqlGroup"; + + $fullresults = array(); + + // Get ratelimits (query A) + + $rec = mysql_query($sql_query_a); + while ($res = mysql_fetch_assoc($rec)) { + if (!array_key_exists($res['datepart'], $fullresults)) { + $fullresults[$res['datepart']] = array(); + } + $fullresults[$res['datepart']]['ratelimited'] = $res['ratelimited']; + } + + // Get the total unique phrases with matches (query B) + + $rec = mysql_query($sql_query_b); + while ($res = mysql_fetch_assoc($rec)) { + if (!array_key_exists($res['datepart'], $fullresults)) { + 
$fullresults[$res['datepart']] = array(); + } + $fullresults[$res['datepart']]['totalphrases'] = $res['cnt']; + } + + // Get the measured phrases per bin (query C) + + $rec = mysql_query($sql_query_c); + while ($res = mysql_fetch_assoc($rec)) { + if (!array_key_exists($res['datepart'], $fullresults)) { + $fullresults[$res['datepart']] = array(); + } + $fullresults[$res['datepart']]['measuredbin'] = $res['cnt']; + } + + foreach ($dateparts as $datepart) { + + if (!array_key_exists($datepart, $fullresults)) { + $csv->newrow(); + $csv->addfield($esc['mysql']['dataset']); + $csv->addfield($datepart); + $csv->addfield(-1); // report a minus 1 for a datepart with missing ratelimit information + $csv->writerow(); + } else { + + $row = $fullresults[$datepart]; + if (!array_key_exists('ratelimited', $row) || !array_key_exists('measuredbin', $row) || !array_key_exists('totalphrases', $row)) { + // TODO/TEST: this cannot occur I think + continue; + } + + // Now: calculate the estimate using our formula + $estimate = round( $row['ratelimited'] * $row['measuredbin'] / $row['totalphrases'], 2 ); + + $csv->newrow(); + $csv->addfield($esc['mysql']['dataset']); + $csv->addfield($datepart); + $csv->addfield($estimate); + $csv->writerow(); + } + + } + + $csv->close(); + + echo '
'; + echo 'Your File'; + echo '

' . $filename . '

'; + echo '
'; + ?> + + + diff --git a/capture/common/form.trackgeophrases.php b/capture/common/form.trackgeophrases.php index b5d286f5..5a8c2261 100644 --- a/capture/common/form.trackgeophrases.php +++ b/capture/common/form.trackgeophrases.php @@ -1,5 +1,6 @@ "set sql_mode='ALLOW_INVALID_DATES'")); + $dbh = new PDO("mysql:host=$hostname;dbname=$database;charset=utf8mb4", $dbuser, $dbpass, array(PDO::MYSQL_ATTR_INIT_COMMAND => "set sql_mode='ALLOW_INVALID_DATES';set time_zone='+00:00' +")); $dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); return $dbh; @@ -40,15 +42,62 @@ function geophp_sane() { } function create_error_logs() { + global $dbuser, $dbpass, $database, $hostname; $dbh = pdo_connect(); - $sql = 'create table if not exists tcat_error_ratelimit ( id bigint auto_increment, type varchar(32), start datetime not null, end datetime not null, tweets bigint not null, primary key(id), index(type), index(start), index(end) )'; + $creating_tables_for_fresh_install = false; + $sql = "SELECT * FROM information_schema.tables WHERE table_schema = '$database' AND table_name = 'tcat_error_gap'"; + $test = $dbh->prepare($sql); + $test->execute(); + if ($test->rowCount() == 0) { + $creating_tables_for_fresh_install = true; + } + + $sql = 'create table if not exists tcat_error_ratelimit ( id bigint auto_increment, type varchar(32), start datetime not null, end datetime not null, tweets bigint not null, primary key(id), index(type), index(start), index(end) ) ENGINE=MyISAM'; $h = $dbh->prepare($sql); $h->execute(); - $sql = 'create table if not exists tcat_error_gap ( id bigint auto_increment, type varchar(32), start datetime not null, end datetime not null, primary key(id), index(type), index(start), index(end) )'; + $sql = 'create table if not exists tcat_error_gap ( id bigint auto_increment, type varchar(32), start datetime not null, end datetime not null, primary key(id), index(type), index(start), index(end) ) ENGINE=MyISAM'; $h = $dbh->prepare($sql); $h->execute(); + 
+ /* + * The tcat_status variable is utilised as generic keystore to record and track aspects of this TCAT installation. + * This is not a configuration table. The configuration of TCAT is defined in config.php, though we may wish to allow dynamically configurable + * options in the future and this table would suit such a purpose. + * At the moment, this table is solely used by TCAT internally to store information such as wich upgrade steps have been executed, etc. + */ + + $sql = "CREATE TABLE IF NOT EXISTS tcat_status ( + `variable` varchar(32), + `value` varchar(1024), + PRIMARY KEY `variable` (`variable`), + KEY `value` (`value`) + ) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4"; + $create = $dbh->prepare($sql); + $create->execute(); + + + $sql = "select value from tcat_status where variable = 'ratelimit_format_modified_at'"; + $test = $dbh->prepare($sql); + $test->execute(); + if ($test->rowCount() == 0 && defined('CAPTURE')) { + // We are actively registering ratelimits in the new gauge-style and store the timestamp of the start of this new behaviour + // The purpose of this insert statemtn is for common/upgrade.php to know the exact time at which it can expect datetime insertion to be sane. + $sql = "insert into tcat_status ( variable, value ) values ( 'ratelimit_format_modified_at', now() )"; + $insert = $dbh->prepare($sql); + $insert->execute(); + } + + // When creating tables for a fresh install, set tcat_status variable to indicate we have up-to-date ratelimit, gap tables and are capturing in the proper timezone + // Practically, the purpose of this insert statement is for common/upgrade.php to know we do not need to upgrade the above table. + + if ($creating_tables_for_fresh_install) { + $sql = "insert into tcat_status ( variable, value ) values ( 'ratelimit_database_rebuild', 2 )"; + $insert = $dbh->prepare($sql); + $insert->execute(); + } + } // Enclose identifier in backticks; escape backticks inside by doubling them. 
@@ -56,6 +105,15 @@ function quoteIdent($field) { return "`" . str_replace("`", "``", $field) . "`"; } +// read the minute of the current hour without leading zero +function get_current_minute() { + $minutes = ltrim(date("i", time()), '0'); + if ($minutes == '') { + $minutes = '0'; + } + return intval($minutes); +} + function create_bin($bin_name, $dbh = false) { try { @@ -241,7 +299,7 @@ function create_admin() { `querybin` VARCHAR(45) NOT NULL, `type` VARCHAR(10) NOT NULL, `active` BOOLEAN NOT NULL, - `visible` BOOLEAN DEFAULT TRUE, + `access` INT DEFAULT 0, `comments` VARCHAR(2048) DEFAULT NULL, PRIMARY KEY (`id`), KEY `querybin` (`querybin`), @@ -339,47 +397,195 @@ function create_admin() { $rec->execute(); } + // 31/05/2016 Add access column, remove visibility column [fast auto-upgrade - reminder to remove] + $query = "SHOW COLUMNS FROM tcat_query_bins"; + $rec = $dbh->prepare($query); + $rec->execute(); + $columns = $rec->fetchAll(PDO::FETCH_COLUMN); + $update = FALSE; + foreach ($columns as $i => $c) { + if ($c == 'visible') { + $update = TRUE; + break; + } + } + if ($update) { + // Adding new columns to table tcat_query_bins + $query = "ALTER TABLE tcat_query_bins ADD COLUMN `access` INT DEFAULT 0"; + $rec = $dbh->prepare($query); + $rec->execute(); + $query = "UPDATE tcat_query_bins SET access = " . TCAT_QUERYBIN_ACCESS_OK . " where visible = TRUE"; + $rec = $dbh->prepare($query); + $rec->execute(); + $query = "UPDATE tcat_query_bins SET access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . " where visible = FALSE"; + $rec = $dbh->prepare($query); + $rec->execute(); + $query = "ALTER TABLE tcat_query_bins DROP COLUMN `visible`"; + $rec = $dbh->prepare($query); + $rec->execute(); + + } + + // 05/05/2016 Create a global lookup table to matching phrases to tweets + // Thanks to this table we know how many (unique or non-unique) tweets were the result of querying the phrase. 
+ // This is used to estimate in (analysis/mod.ratelimits.php) how many tweets may have been ratelimited for bins associated with the phrase + $sql = "CREATE TABLE IF NOT EXISTS tcat_captured_phrases ( + `tweet_id` BIGINT(20) NOT NULL, + `phrase_id` BIGINT(20) NOT NULL, + `created_at` DATETIME NOT NULL, + PRIMARY KEY (`tweet_id`, `phrase_id`), + KEY `created_at` (`created_at`) ) ENGINE = MyISAM DEFAULT CHARSET = utf8mb4"; + $create = $dbh->prepare($sql); + $create->execute(); + $dbh = false; } /* - * Record a ratelimit disturbance + * This function imports the MySQL timezone data neccessary to make the convert_tz() function work. On Debian/Ubuntu systems the timezone data is not + * loaded by default, as is evident from the result of the following query, which unexpectedly is NULL: + * SELECT convert_tz(now(), 'SYSTEM', 'UTC'); + * + * Our function first tests (quickly) whether timezone data is available, and otherwise imports it. + * + * See also: http://stackoverflow.com/questions/9808160/mysql-time-zones + */ +function import_mysql_timezone_data() { + global $dbuser, $dbpass, $database, $hostname; + + $dbh = pdo_connect(); + + $sql = "SELECT convert_tz(now(), 'SYSTEM', 'UTC') as available"; + $test = $dbh->prepare($sql); + $test->execute(); + if ($res = $test->fetch()) { + if (array_key_exists('available', $res) && is_string($res['available'])) { + return true; // we already have the timezone data + } + } + if (!file_exists('/usr/share/zoneinfo') || !is_executable('/usr/bin/mysql_tzinfo_to_sql')) { + return false; // we cannot import timezone data (unknown OS?) 
+ } + + // Connect to MySQL meta information database + try { + $dbh_mysql = new PDO("mysql:host=$hostname;dbname=mysql;charset=utf8mb4", $dbuser, $dbpass, array(PDO::MYSQL_ATTR_INIT_COMMAND => "set sql_mode='ALLOW_INVALID_DATES'")); + } catch (Exception $e) { + if ($e->getCode() == 1044) { + // Access denied (probably the connecting user does not have sufficient privileges) + return false; + } + } + $dbh_mysql->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + + // Run the MySQL tool to convert Unix timezones into MySQL format, read its output using popen() + // and then execute its output as a query. We could opt for piping directly to the mysql command line tool, + // but this is probably a bit more secure (no need to transfer passwords to the command-line) + $cmdhandle = popen("/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo", "r"); + $query = ""; + while ($buf = fread($cmdhandle, 2048)) { + $query .= $buf; + } + pclose($cmdhandle); + $import = $dbh_mysql->prepare($query); + $import->execute(); + return true; +} + +/* + * Record any ratelimit disturbance as it happened in the last minute */ -function ratelimit_record($ratelimit, $ex_start) { +function ratelimit_record($ratelimit) { $dbh = pdo_connect(); - $sql = "insert into tcat_error_ratelimit ( type, start, end, tweets ) values ( :type, :start, :end, :ratelimit)"; + $sql = "insert into tcat_error_ratelimit ( type, start, end, tweets ) values ( :type, date_sub(date_sub(now(), interval second(now()) second), interval 1 minute), date_sub(now(), interval second(now()) second), :ratelimit)"; $h = $dbh->prepare($sql); - $ex_start = toDateTime($ex_start); - $ex_end = toDateTime(time()); $type = CAPTURE; $h->bindParam(":type", $type, PDO::PARAM_STR); - $h->bindParam(":start", $ex_start, PDO::PARAM_STR); - $h->bindParam(":end", $ex_end, PDO::PARAM_STR); $h->bindParam(":ratelimit", $ratelimit, PDO::PARAM_INT); $h->execute(); $dbh = false; } +/* + * Zero non-existing ratelimit table rows backwards-in-time + 
*/ + +function ratelimit_holefiller($minutes) { + if ($minutes <= 1) return; + $dbh = pdo_connect(); + for ($i = 2; $i <= $minutes; $i++) { + + // test if a rate limit record already exists in the database, and if so: break + + $sql = "select count(*) as cnt from tcat_error_ratelimit where type = '" . CAPTURE . "' and + start >= date_sub(date_sub(date_sub(now(), interval $i minute), interval second(date_sub(now(), interval $i minute)) second), interval 1 minute) and + end <= date_sub(date_sub(now(), interval " . ($i - 1) . " minute), interval second(date_sub(now(), interval " . ($i - 1) . " minute)) second)"; + $h = $dbh->prepare($sql); + $h->execute(); + while ($res = $h->fetch()) { + if (array_key_exists('cnt', $res) && $res['cnt'] > 0) { + // finished + $dbh = false; + return; + } + } + + // fill in the hole + + $sql = "insert into tcat_error_ratelimit ( type, start, end, tweets ) values ( :type, date_sub(date_sub(date_sub(now(), interval $i minute), interval second(date_sub(now(), interval $i minute)) second), interval 1 minute), date_sub(date_sub(now(), interval " . ($i - 1) . " minute), interval second(date_sub(now(), interval " . ($i - 1) . " minute)) second), 0)"; + logit(CAPTURE . 
".error.log", "$sql"); + $h = $dbh->prepare($sql); + $type = CAPTURE; + $h->bindParam(":type", $type, PDO::PARAM_STR); + $h->execute(); + + } + $dbh = false; +} + /* * Record a gap in the data */ function gap_record($role, $ustart, $uend) { if ($uend <= $ustart) { - return TRUE; + return FALSE; } - if (($uend - $ustart) < 15) { - // a less than 15 second gap is usually the result of a software restart/reload - // during that restart the tweet buffer is flushed and the gap is very tiny, therefore we ignore this - return TRUE; + // A less than IDLETIME gap doesn't make sense te record, because we assume IDLETIME seconds to be a legitimate timeframe + // up to which we don't expect data from Twitter + $gap_in_seconds = $uend - $ustart; + if (!defined('IDLETIME')) { + define('IDLETIME', 600); + } + if (!defined('IDLETIME_FOLLOW')) { + define('IDLETIME_FOLLOW', IDLETIME); + } + if ($role == 'follow') { + $idletime = IDLETIME_FOLLOW; + } else { + $idletime = IDLETIME; + } + if ($role == 'follow' && $gap_in_seconds < IDLETIME_FOLLOW || + $role != 'follow' && $gap_in_seconds < IDLETIME) { + return FALSE; } $dbh = pdo_connect(); - $sql = "insert into tcat_error_gap ( type, start, end ) values ( :role, :start, :end)"; + + $sql = "select 1 from tcat_error_gap where type = :role and start = FROM_UNIXTIME(:start)"; + $h = $dbh->prepare($sql); + $h->bindParam(":role", $role, PDO::PARAM_STR); + $h->bindParam(":start", $ustart, PDO::PARAM_STR); + $h->execute(); + if ($h->execute() && $h->rowCount() > 0) { + // Extend an existing gap record + $sql = "update tcat_error_gap set end = FROM_UNIXTIME(:end) where type = :role and start = FROM_UNIXTIME(:start)"; + } else { + // Insert a new gap record + $sql = "insert into tcat_error_gap ( type, start, end ) values ( :role, FROM_UNIXTIME(:start), FROM_UNIXTIME(:end) )"; + } $h = $dbh->prepare($sql); - $ustart = toDateTime($ustart); - $uend = toDateTime($uend); $h->bindParam(":role", $role, PDO::PARAM_STR); $h->bindParam(":start", $ustart, 
PDO::PARAM_STR); $h->bindParam(":end", $uend, PDO::PARAM_STR); @@ -392,10 +598,15 @@ function gap_record($role, $ustart, $uend) { function ratelimit_report_problem() { if (defined('RATELIMIT_MAIL_HOURS') && RATELIMIT_MAIL_HOURS > 0) { - $sql = "select count(*) as cnt from tcat_error_ratelimit where start > (now() - interval " . RATELIMIT_MAIL_HOURS . " hour)"; + $sql = "select count(*) as cnt from tcat_status where variable = 'email_ratelimit' and value > (now() - interval " . RATELIMIT_MAIL_HOURS . " hour);"; $result = mysql_query($sql); if ($row = mysql_fetch_assoc($result)) { - if (isset($row['cnt']) && $row['cnt'] > 0) { + if (isset($row['cnt']) && $row['cnt'] == 0) { + /* send e-mail and register time of the action */ + $sql = "delete from tcat_status where variable = 'email_ratelimit'"; + $result = mysql_query($sql); + $sql = "insert into tcat_status ( variable, value ) values ( 'email_ratelimit', now() )"; + $result = mysql_query($sql); global $mail_to; mail($mail_to, 'DMI-TCAT rate limit has been reached (server: ' . getHostName() . ')', 'The script running the ' . CAPTURE . ' query has hit a rate limit while talking to the Twitter API. Twitter is not allowing you to track more than 1% of its total traffic at any time. This means that the number of tweets exceeding the barrier are being dropped. Consider reducing the size of your query bins and reducing the number of terms and users you are tracking.' . "\n\n" . 'This may be a temporary or a structural problem. Please look at the webinterface for more details. Rate limit statistics on the website are historic, however. Consider this message indicative of a current issue. This e-mail will not be repeated for at least ' . RATELIMIT_MAIL_HOURS . 
' hours.', 'From: no-reply@dmitcat'); @@ -405,7 +616,7 @@ function ratelimit_report_problem() { } function toDateTime($unixTimestamp) { - return date("Y-m-d H:i:s", $unixTimestamp); + return date("Y-m-d H:i:s", intval($unixTimestamp)); } /* @@ -576,7 +787,8 @@ function getActivePhrases() { $dbh = pdo_connect(); $sql = "SELECT DISTINCT(p.phrase) FROM tcat_query_phrases p, tcat_query_bins_phrases bp, tcat_query_bins b WHERE bp.endtime = '0000-00-00 00:00:00' AND p.id = bp.phrase_id - AND bp.querybin_id = b.id AND b.type != 'geotrack' AND b.active = 1"; + AND bp.querybin_id = b.id AND b.type != 'geotrack' AND b.active = 1 + AND ( b.access = " . TCAT_QUERYBIN_ACCESS_OK . " or b.access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or b.access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . ")"; $rec = $dbh->prepare($sql); $rec->execute(); $results = $rec->fetchAll(PDO::FETCH_COLUMN); @@ -591,8 +803,11 @@ function getActivePhrases() { * What type is a bin (track, geotrack, follow, onepercent) */ -function getBinType($binname) { - $dbh = pdo_connect(); +function getBinType($binname, $dbh = null) { + $dbh_parm = is_null($dbh) ? false : true; + if (!$dbh_parm) { + $dbh = pdo_connect(); + } $sql = "SELECT querybin, `type` FROM tcat_query_bins WHERE querybin = :querybin"; $rec = $dbh->prepare($sql); $rec->bindParam(':querybin', $binname); @@ -605,7 +820,9 @@ function getBinType($binname) { } } } - $dbh = false; + if (!$dbh_parm) { + $dbh = false; + } return false; } @@ -615,7 +832,7 @@ function getBinType($binname) { function geobinsActive() { $dbh = pdo_connect(); - $sql = "SELECT COUNT(*) AS cnt FROM tcat_query_bins WHERE `type` = 'geotrack' and active = 1"; + $sql = "SELECT COUNT(*) AS cnt FROM tcat_query_bins WHERE `type` = 'geotrack' and active = 1 and ( access = " . TCAT_QUERYBIN_ACCESS_OK . " or access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . 
" )"; $rec = $dbh->prepare($sql); $rec->execute(); $results = $rec->fetchAll(PDO::FETCH_COLUMN); @@ -652,7 +869,8 @@ function getActiveLocationsImploded() { $dbh = pdo_connect(); $sql = "SELECT phrase FROM tcat_query_phrases p, tcat_query_bins_phrases bp, tcat_query_bins b WHERE bp.endtime = '0000-00-00 00:00:00' AND p.id = bp.phrase_id - AND bp.querybin_id = b.id AND b.type = 'geotrack' AND b.active = 1"; + AND bp.querybin_id = b.id AND b.type = 'geotrack' AND b.active = 1 + AND ( b.access = " . TCAT_QUERYBIN_ACCESS_OK . " or b.access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or b.access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . ")"; $rec = $dbh->prepare($sql); $rec->execute(); $results = $rec->fetchAll(PDO::FETCH_COLUMN); @@ -718,7 +936,7 @@ function getActiveUsers() { function getActiveTrackBins() { $dbh = pdo_connect(); - $sql = "SELECT b.querybin, p.phrase FROM tcat_query_bins b, tcat_query_phrases p, tcat_query_bins_phrases bp WHERE b.active = 1 AND bp.querybin_id = b.id AND bp.phrase_id = p.id AND bp.endtime = '0000-00-00 00:00:00'"; + $sql = "SELECT b.querybin, p.phrase FROM tcat_query_bins b, tcat_query_phrases p, tcat_query_bins_phrases bp WHERE b.active = 1 AND bp.querybin_id = b.id AND bp.phrase_id = p.id AND bp.endtime = '0000-00-00 00:00:00' and ( b.access = " . TCAT_QUERYBIN_ACCESS_OK . " or b.access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or b.access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . " )"; $rec = $dbh->prepare($sql); $querybins = array(); if ($rec->execute() && $rec->rowCount() > 0) { @@ -730,9 +948,25 @@ function getActiveTrackBins() { return $querybins; } +// This function returns a phrase_string:phrase_id associative array. + +function getActivePhraseIds() { + $dbh = pdo_connect(); + $sql = "SELECT p.phrase as phrase, p.id as id FROM tcat_query_bins b, tcat_query_phrases p, tcat_query_bins_phrases bp WHERE b.active = 1 AND bp.querybin_id = b.id AND bp.phrase_id = p.id AND bp.endtime = '0000-00-00 00:00:00' and ( b.access = " . 
TCAT_QUERYBIN_ACCESS_OK . " or b.access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or b.access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . " )"; + $rec = $dbh->prepare($sql); + $phrase_ids = array(); + if ($rec->execute() && $rec->rowCount() > 0) { + while ($res = $rec->fetch()) { + $phrase_ids[trim(preg_replace("/'/", "", $res['phrase']))] = $res['id']; + } + } + $dbh = false; + return $phrase_ids; +} + function getActiveFollowBins() { $dbh = pdo_connect(); - $sql = "SELECT b.querybin, u.id AS uid FROM tcat_query_bins b, tcat_query_users u, tcat_query_bins_users bu WHERE b.active = 1 AND bu.querybin_id = b.id AND bu.user_id = u.id AND bu.endtime = '0000-00-00 00:00:00'"; + $sql = "SELECT b.querybin, u.id AS uid FROM tcat_query_bins b, tcat_query_users u, tcat_query_bins_users bu WHERE b.active = 1 AND bu.querybin_id = b.id AND bu.user_id = u.id AND bu.endtime = '0000-00-00 00:00:00' and ( b.access = " . TCAT_QUERYBIN_ACCESS_OK . " or b.access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or b.access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . " )"; $rec = $dbh->prepare($sql); $querybins = array(); if ($rec->execute() && $rec->rowCount() > 0) { @@ -746,7 +980,7 @@ function getActiveFollowBins() { function getActiveOnepercentBin() { $dbh = pdo_connect(); - $sql = "select querybin from tcat_query_bins where type = 'onepercent' and active = 1"; + $sql = "select querybin from tcat_query_bins where type = 'onepercent' and active = 1 and ( access = " . TCAT_QUERYBIN_ACCESS_OK . " or access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " or access = " . TCAT_QUERYBIN_ACCESS_INVISIBLE . " )"; $rec = $dbh->prepare($sql); $querybins = array(); if ($rec->execute() && $rec->rowCount() > 0) { @@ -929,18 +1163,10 @@ function capture_flush_buffer() { function capture_signal_handler_term($signo) { - global $exceeding, $ratelimit, $ex_start; - logit(CAPTURE . ".error.log", "received TERM signal"); capture_flush_buffer(); - logit(CAPTURE . 
".error.log", "writing rate limit information to database"); - - if (isset($exceeding) && $exceeding == 1) { - ratelimit_record($ratelimit, $ex_start); - } - logit(CAPTURE . ".error.log", "exiting now on TERM signal"); exit(0); @@ -1761,13 +1987,50 @@ public function add($object) { } +// This function takes a one-dimensional array with sets of the following data: tweet_id, phrase_id, created_at +// It inserts this data into the MySQL database using a multi-insert statement +function insert_captured_phrase_ids($captured_phrase_ids) { + global $dbuser, $dbpass, $database, $hostname; + if (empty($captured_phrase_ids)) return; + $dbh = pdo_connect(); + + // construct insert SQL + + $moresets = 0; + if (count($captured_phrase_ids) > 3) { + $moresets = count($captured_phrase_ids) / 3 - 1; + } + $sql = "INSERT DELAYED IGNORE INTO tcat_captured_phrases ( tweet_id, phrase_id, created_at ) VALUES ( ?, ?, ? )" . str_repeat(", (?, ?, ?)", $moresets); + $h = $dbh->prepare($sql); + for ($i = 0; $i < count($captured_phrase_ids); $i++) { + // bindParam() expects its first parameter ( index of the ? placeholder ) to start with 1 + if ($i % 3 == 2) { + $h->bindParam($i + 1, $captured_phrase_ids[$i], PDO::PARAM_STR); + } else { + $h->bindParam($i + 1, $captured_phrase_ids[$i], PDO::PARAM_INT); + } + } + $h->execute(); + + $dbh = false; + +} + /* * Start a tracking process */ function tracker_run() { + global $dbuser, $dbpass, $database, $hostname, $tweetQueue; + + // We need the tcat_status table + + create_error_logs(); + + // We need the tcat_captured_phrases table + + create_admin(); - global $tweetQueue; $tweetQueue = new TweetQueue(); $tweetQueue->setoption('replace', false); if (defined('USE_INSERT_DELAYED') && USE_INSERT_DELAYED) { @@ -1836,12 +2099,14 @@ function tracker_run() { logit(CAPTURE . 
".error.log", "geoPHP functions are not yet available, see documentation for instructions"); } - global $ratelimit, $exceeding, $ex_start, $last_insert_id; + global $rl_current_record, $rl_registering_minute; + global $last_insert_id; + global $tracker_started_at; - $ratelimit = 0; // rate limit counter since start of script - $exceeding = 0; // are we exceeding the rate limit currently? - $ex_start = 0; // time at which rate limit started being exceeded - $last_insert_id = -1; + $rl_current_record = 0; // how many tweets have been ratelimited this MINUTE? + $rl_registering_minute = get_current_minute(); // what is the minute we are registering (as soon as the current minute differs from this, we insert our record in the database) + $last_insert_id = -1; // needed to make INSERT DELAYED work, see the function database_activity() + $tracker_started_at = time(); // the walltime when this script was started global $twitter_consumer_key, $twitter_consumer_secret, $twitter_user_token, $twitter_user_secret, $lastinsert; @@ -1947,39 +2212,55 @@ function tracker_streamCallback($data, $length, $metrics) { } } - // handle rate limiting - if (array_key_exists('limit', $data)) { - global $ratelimit, $exceeding, $ex_start; - if (isset($data['limit'][CAPTURE])) { - $current = $data['limit'][CAPTURE]; - if ($current > $ratelimit) { - // currently exceeding rate limit - if (!$exceeding) { - // new disturbance! - $ex_start = time(); - ratelimit_report_problem(); - // logit(CAPTURE . ".error.log", "you have hit a rate limit. 
consider reducing your query bin sizes"); - } - $ratelimit = $current; - $exceeding = 1; + // handle rate limiting at intervals of a single minute - if (time() > ($ex_start + RATELIMIT_SILENCE * 6)) { - // every half an hour (or: heartbeat x 6), record, but keep the exceeding flag set - ratelimit_record($ratelimit, $ex_start); - $ex_start = time(); - } - } elseif ($exceeding && time() < ($ex_start + RATELIMIT_SILENCE)) { - // we are now no longer exceeding the rate limit - // to avoid flip-flop we only reset our values after the minimal heartbeat has passed - // store rate limit disturbance information in the database - ratelimit_record($ratelimit, $ex_start); - $ex_start = 0; - $exceeding = 0; - } + global $rl_current_record, $rl_registering_minute; + global $tracker_started_at; + + $current = 0; + $current_minute = get_current_minute(); + + // we keep a a counter of the nr. of tweets rate limited and reset it at intervals of one minute + + // read possible rate limit information from Twitter + + if (array_key_exists('limit', $data) && isset($data['limit']['track'])) { + $current = $data['limit'][CAPTURE]; + // we have a new rate limit, grow the record + $rl_current_record += $current; + } else { + // when no new rate limits occur, sustain our current record + $current = $rl_current_record; + } + + if ($rl_registering_minute != $current_minute) { + + // the current minute is no longer our registering minute; we have to record our ratelimit information in the database + + if ($current_minute == 0 && $rl_registering_minute < 59 || + $current_minute > 0 && $current_minute < $rl_registering_minute || + $current_minute > $rl_registering_minute + 1) { + // there was a more than 1 minute silence (i.e. 
a response from Curl took longer than our 1 minute interval, thus we need to fill in zeros backwards in time) + $tracker_running = round((time() - $tracker_started_at) / 60); + ratelimit_holefiller($tracker_running); } + + $rl_registering_minute = $current_minute; + + // we now have rate limit information for the last minute + ratelimit_record($rl_current_record); + if ($rl_current_record > 0) { + ratelimit_report_problem(); + $rl_current_record = 0; + } + + } + + if (array_key_exists('limit', $data)) { unset($data['limit']); } + if (empty($data)) return; // sometimes we only get rate limit info @@ -2001,11 +2282,14 @@ function processtweets($capturebucket) { global $tweetQueue; $querybins = getActiveBins(); + $phrase_ids = getActivePhraseIds(); // cache bin types $bintypes = array(); foreach ($querybins as $binname => $queries) $bintypes[$binname] = getBinType($binname); + $captured_phrase_ids = array(); + // running through every single tweet foreach ($capturebucket as $data) { @@ -2145,7 +2429,12 @@ function processtweets($capturebucket) { } if ($found) { - break; + // found = true causes the tweet to be inserted into the database + // store phrase data (in this case a geobox definition) + $captured_phrase_ids[] = $data['id_str']; + $captured_phrase_ids[] = $phrase_ids[$query]; + $captured_phrase_ids[] = date("Y-m-d H:i:s", strtotime($data["created_at"])); + continue; } } else { @@ -2180,10 +2469,13 @@ function processtweets($capturebucket) { } } - // at the first fitting query, we break + // at the first fitting query, we set found to true (to indicate we should insert the tweet into the database) + // we also register the fact this keyword query has been matched if ($pass == true) { $found = true; - break; + $captured_phrase_ids[] = $data['id_str']; + $captured_phrase_ids[] = $phrase_ids[$query]; + $captured_phrase_ids[] = date("Y-m-d H:i:s", strtotime($data["created_at"])); } } } @@ -2208,6 +2500,7 @@ function processtweets($capturebucket) { } } 
$tweetQueue->insertDB(); + insert_captured_phrase_ids($captured_phrase_ids); return TRUE; } diff --git a/capture/ids/lookup.php b/capture/ids/lookup.php index d710afee..1a51ee2e 100644 --- a/capture/ids/lookup.php +++ b/capture/ids/lookup.php @@ -6,6 +6,7 @@ set_time_limit(0); error_reporting(E_ALL); include_once __DIR__ . '/../../config.php'; +include_once __DIR__ . '/../../common/constants.php'; include_once __DIR__ . '/../../common/functions.php'; include_once __DIR__ . '/../../capture/common/functions.php'; diff --git a/capture/index.php b/capture/index.php index 36eb7630..847d3d2f 100644 --- a/capture/index.php +++ b/capture/index.php @@ -1,5 +1,6 @@ '; + print "You have configured TCAT to automatically upgrade in the background. However, a specific upgrade instruction requires MySQL root privileges and cannot be run by TCAT itself. You will need to install the MySQL Time Zone Support manually using instructions provided here:

https://dev.mysql.com/doc/refman/5.5/en/time-zone-support.html

For Debian or Ubuntu Linux systems, the following command, issued as root (use sudo su to become root), will install the neccessary time zone data.

/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | mysql --defaults-file=/etc/mysql/debian.cnf --force -u debian-sys-maint mysql
"; + } + $showupdatemsg = true; + } if (is_array($git)) { $remote = getGitRemote($git['commit'], $git['branch']); if (is_array($remote)) { @@ -135,7 +143,9 @@ $url = $remote['url']; $required = $remote['required']; $autoupgrade = 'autoupgrade()'; - print '
'; + if (!$showupdatemsg) { + print '
'; + } $wikilink = 'https://github.com/digitalmethodsinitiative/dmi-tcat/wiki/Upgrading-TCAT'; if ($required) { print "A newer version of TCAT is available, containing important updates. You are strongly recommended to upgrade. Please read the documentation for instructions on upgrading, or click here to schedule an automatic upgrade. [ commit $commit - $mesg ]
"; diff --git a/capture/public/form.trackgeophrases.php b/capture/public/form.trackgeophrases.php index b5d286f5..5a8c2261 100644 --- a/capture/public/form.trackgeophrases.php +++ b/capture/public/form.trackgeophrases.php @@ -1,5 +1,6 @@ prepare("SELECT end FROM tcat_error_ratelimit ORDER BY end DESC LIMIT 1"); + $rec = $dbh->prepare("SELECT end FROM tcat_error_ratelimit WHERE tweets > 0 and end > date_sub(now(), interval 2 day) ORDER BY end DESC LIMIT 1"); if ($rec->execute() && $rec->rowCount() > 0) { $res = $rec->fetch(); return $res['end']; diff --git a/capture/search/search.php b/capture/search/search.php index 2bf982a9..b4a935b4 100644 --- a/capture/search/search.php +++ b/capture/search/search.php @@ -6,6 +6,7 @@ set_time_limit(0); error_reporting(E_ALL); include_once __DIR__ . '/../../config.php'; +include_once __DIR__ . '/../../common/constants.php'; include_once __DIR__ . '/../../common/functions.php'; include_once __DIR__ . '/../../capture/common/functions.php'; diff --git a/capture/stream/controller.php b/capture/stream/controller.php index cc810a2c..e8be804b 100644 --- a/capture/stream/controller.php +++ b/capture/stream/controller.php @@ -9,6 +9,7 @@ function env_is_cli() { die; include_once __DIR__ . '/../../config.php'; +include_once __DIR__ . '/../../common/constants.php'; include __DIR__ . '/../../common/functions.php'; include __DIR__ . '/../common/functions.php'; @@ -27,6 +28,14 @@ function env_is_cli() { $dbh = pdo_connect(); $roles = unserialize(CAPTUREROLES); +// We need the tcat_status table + +create_error_logs(); + +// We need the tcat_captured_phrases table + +create_admin(); + // first gather all instructions sent by the webinterface to the controller (ie. 
the instruction queue) $upgrade_requested = false; $commands = array(); @@ -184,8 +193,12 @@ function env_is_cli() { if ($reload || $idled) { - // record confirmed gap - gap_record($role, $last, time()); + // record confirmed gap if we could measure it + if ($last && gap_record($role, $last, time())) { + logit("controller.log", "recording a data gap for script $role from '" . toDateTime($last) . "' to '" . toDateTime(time()) . "'"); + } else { + logit("controller.log", "we have no information about previous running time of script $role - cannot record a gap"); + } if ($running) { @@ -255,8 +268,10 @@ function env_is_cli() { logit("controller.log", "script $role was not running - starting"); // record confirmed gap if we could measure it - if ($last) { - gap_record($role, $last, time()); + if ($last && gap_record($role, $last, time())) { + logit("controller.log", "recording a data gap for script $role from '" . toDateTime($last) . "' to '" . toDateTime(time()) . "'"); + } else { + logit("controller.log", "we have no information about previous running time of script $role - cannot record a gap"); } // a forked process may inherit our lock, but we prevent this. diff --git a/capture/stream/dmitcat_follow.php b/capture/stream/dmitcat_follow.php index 7cf4f989..9516dde4 100644 --- a/capture/stream/dmitcat_follow.php +++ b/capture/stream/dmitcat_follow.php @@ -16,6 +16,7 @@ function env_is_cli() { // ----- includes ----- include __DIR__ . '/../../config.php'; // load base config file +include __DIR__ . '/../../common/constants.php'; // load constants file include __DIR__ . '/../../common/functions.php'; // load base functions file include __DIR__ . 
'/../common/functions.php'; // load capture function file diff --git a/capture/stream/dmitcat_onepercent.php b/capture/stream/dmitcat_onepercent.php index f2efeb27..1020c3d2 100644 --- a/capture/stream/dmitcat_onepercent.php +++ b/capture/stream/dmitcat_onepercent.php @@ -16,6 +16,7 @@ function env_is_cli() { // ----- includes ----- include __DIR__ . '/../../config.php'; // load base config file +include __DIR__ . '/../../common/constants.php'; // load constants file include __DIR__ . '/../../common/functions.php'; // load base functions file include __DIR__ . '/../common/functions.php'; // load capture function file diff --git a/capture/stream/dmitcat_track.php b/capture/stream/dmitcat_track.php index 62a018f6..eaf08e35 100644 --- a/capture/stream/dmitcat_track.php +++ b/capture/stream/dmitcat_track.php @@ -16,6 +16,7 @@ function env_is_cli() { // ----- includes ----- include __DIR__ . '/../../config.php'; // load base config file +include __DIR__ . '/../../common/constants.php'; // load constants file include __DIR__ . '/../../common/functions.php'; // load base functions file include __DIR__ . '/../common/functions.php'; // load capture function file diff --git a/capture/user/timeline.php b/capture/user/timeline.php index d22d3b4e..bbf6be4e 100644 --- a/capture/user/timeline.php +++ b/capture/user/timeline.php @@ -6,6 +6,7 @@ set_time_limit(0); error_reporting(E_ALL); include_once __DIR__ . '/../../config.php'; +include_once __DIR__ . '/../../common/constants.php'; include_once __DIR__ . '/../../common/functions.php'; include_once __DIR__ . '/../common/functions.php'; diff --git a/capture/user/user-friends.php b/capture/user/user-friends.php index e21e7c09..d8134dc5 100644 --- a/capture/user/user-friends.php +++ b/capture/user/user-friends.php @@ -8,6 +8,7 @@ set_time_limit(0); error_reporting(E_ALL); include_once __DIR__ . '/../../config.php'; +include_once __DIR__ . '/../../common/constants.php'; include_once __DIR__ . 
'/../../common/functions.php'; include_once __DIR__ . '/../common/functions.php'; diff --git a/common/constants.php b/common/constants.php new file mode 100644 index 00000000..0d8e7c95 --- /dev/null +++ b/common/constants.php @@ -0,0 +1,29 @@ +prepare($query); + $rec->execute(); + $results = $rec->fetchAll(PDO::FETCH_COLUMN); + if (count($results)) { + $have_tcat_status = true; + } else { + $have_tcat_status = false; + } + // 29/08/2014 Alter tweets tables to add new fields, ex. 'possibly_sensitive' $query = "SHOW TABLES"; @@ -586,6 +600,717 @@ function upgrades($dry_run = false, $interactive = true, $aulevel = 2, $single = } } + // 05/04/2016 Re-assemble historical TCAT ratelimit information to keep appropriate interval records (see the discussion on Github: https://github.com/digitalmethodsinitiative/dmi-tcat/issues/168) + + // First test if a reconstruction is neccessary + + $already_updated = true; + + $now = null; // this variable will store the moment the new gauge behaviour became effective + + if ($have_tcat_status) { + + $sql = "select value, unix_timestamp(value) as value_unix from tcat_status where variable = 'ratelimit_format_modified_at'"; + $rec = $dbh->prepare($sql); + if ($rec->execute() && $rec->rowCount() > 0) { + while ($res = $rec->fetch()) { + $now = $res['value']; + $now_unix = $res['value_unix']; + } + } + + $sql = "select value from tcat_status where variable = 'ratelimit_database_rebuild' and value > 0"; + $rec = $dbh->prepare($sql); + if (!$rec->execute() || $rec->rowCount() == 0) { + $already_updated = false; + } + + $bin_mysqldump = $bin_gzip = null; + + if ($already_updated == false) { + $bin_mysqldump = get_executable("mysqldump"); + if ($bin_mysqldump === null) { + logit($logtarget, "The mysqldump binary appears to be missing. Did you install the MySQL client utilities? 
Some upgrades will not work without this utility."); + $already_updated = true; + } + $bin_gzip = get_executable("gzip"); + if ($bin_gzip === null) { + logit($logtarget, "The gzip binary appears to be missing. Please lookup this utility in your software repository. Some upgrades will not work without this utility."); + $already_updated = true; + } + } + + } else { + + // The upgrade script will cause active tracking roles to restart; which may take up to a minute. Afterwards, we can expect the tcat_status table to exist and + // to have started recording timezone, gap and ratelimit information in the new gauge style. Because it really neccessary to wait for the tracking roles to behave correctly, + // we decide to skip this upgrade step and inform the user. + + logit($logtarget, "Your tracking roles are being restarted now (in the background) to record timezone, ratelimit and gap information in a newer style."); + logit($logtarget, "Afterwards will we be able to re-assemble historical ratelimit and gap information, and new export modules can become available."); + logit($logtarget, "Please wait at least one minute and then run this script again."); + + } + + if (!$already_updated && $now != null) { + $suggested = true; + $required = true; // this is a bugfix, therefore required + if ($dry_run == false) { + $ans = ''; + if ($interactive == false) { + // require auto-upgrade level 2 + if ($aulevel > 1) { + $ans = 'a'; + } else { + $ans = 'SKIP'; + } + } else { + $ans = cli_yesnoall("Re-assemble historical TCAT tweet time zone, ratelimit and gap information to keep appropriate records. It will take quite a while on long-running servers, though the majority of operations are non-blocking. If you have some very big bins (with 70+ million tweets inside them), you may wish to explore the USE_INSERT_DELAYED option in config.php and restart your trackers before running this upgrade. 
The upgrade procedure will temporarily render bins being upgraded invisible in the front-end.", 2); + } + if ($ans == 'y' || $ans == 'a') { + + global $dbuser, $dbpass, $database, $hostname; + putenv('MYSQL_PWD=' . $dbpass); /* this avoids having to put the password on the command-line */ + + // We need functioning timezone tables for this upgrade step + if (import_mysql_timezone_data() == FALSE) { + logit($logtarget, "ERROR -----"); + logit($logtarget, "ERROR - Your MySQL server is unfortunately missing timezone data which is needed to perform this upgrade step."); + logit($logtarget, "ERROR - This is usually caused by having a non-root user connecting to the database server."); + logit($logtarget, "ERROR - Your current configuration is secure and actually the recommended one (as it is also set-up this way by the TCAT auto-installer)."); + logit($logtarget, "ERROR - But you will now have to perform a single superuser (sudo) command manually."); + logit($logtarget, "ERROR -"); + logit($logtarget, "ERROR - For Debian or Ubuntu systems, become root (using sudo su) and execute the following command:"); + logit($logtarget, "ERROR -"); + logit($logtarget, "ERROR - /usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | mysql --defaults-file=/etc/mysql/debian.cnf --force -u debian-sys-maint mysql"); + logit($logtarget, "ERROR -"); + logit($logtarget, "ERROR - (you can safely ignore the line: Warning: Unable to load '/usr/share/zoneinfo/leap-seconds.list' as time zone. 
Skipping it.')"); + logit($logtarget, "ERROR -"); + logit($logtarget, "ERROR - For all other operating systems, please read the MySQL instructions here: http://dev.mysql.com/doc/refman/5.7/en/mysql-tzinfo-to-sql.html"); + logit($logtarget, "ERROR -----"); + logit($logtarget, "Discontinuing this upgrade step until the issue has been resolved."); + } else { + + // First make sure the historical tweet data is correct + + $query = "SHOW TABLES"; + $rec = $dbh->prepare($query); + $rec->execute(); + $results = $rec->fetchAll(PDO::FETCH_COLUMN); + foreach ($results as $k => $tweets_table) { + if (!preg_match("/_tweets$/", $tweets_table)) continue; + $badzone = 'Europe/London'; + logit($logtarget, "Fixing timezone for created_at field in table '$tweets_table' .."); + if (TCAT_CONFIG_DEPRECATED_TIMEZONE && TCAT_CONFIG_DEPRECATED_TIMEZONE_CONFIGURED) { + $badzone = TCAT_CONFIG_DEPRECATED_TIMEZONE_CONFIGURED; + } + /* + * NOTE: The MySQL native function CONVERT_TZ(datetimestring, 'badtimezone', 'UTC') helps to undo the bug described + * Here: https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197 + * And here: https://github.com/digitalmethodsinitiative/dmi-tcat/pull/194 + */ + $sql = "SELECT id FROM `$tweets_table` WHERE CONVERT_TZ(created_at, '$badzone', 'UTC') <= '$now' ORDER BY CONVERT_TZ(created_at, '$badzone', 'UTC') DESC LIMIT 1"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + $results2 = $rec2->fetch(PDO::FETCH_ASSOC); + $max_id = $results2['id']; + if (is_null($max_id)) { + logit($logtarget, "Table is either empty or does not need to be fixed. 
Skipping."); + continue; + } + $dbh->beginTransaction(); + $binname = preg_replace("/_tweets$/", "", $tweets_table); + $orig_access = TCAT_QUERYBIN_ACCESS_OK; + if (in_array($binname, $all_bins)) { + $sql = "SELECT `access` FROM tcat_query_bins WHERE querybin = :querybin"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->bindParam(":querybin", $binname, PDO::PARAM_STR); + $rec2->execute(); + $results2 = $rec2->fetch(PDO::FETCH_ASSOC); + $orig_access = $results2['access']; + $sql = "UPDATE tcat_query_bins SET access = " . TCAT_QUERYBIN_ACCESS_WRITEONLY . " WHERE querybin = :querybin"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->bindParam(":querybin", $binname, PDO::PARAM_STR); + $rec2->execute(); + } + $dbh->commit(); + $dbh->beginTransaction(); + $sql = "UPDATE `$tweets_table` SET created_at = CONVERT_TZ(created_at, '$badzone', 'UTC') WHERE id <= $max_id"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + if (in_array($binname, $all_bins)) { + $sql = "UPDATE tcat_query_bins SET access = :access WHERE querybin = :querybin"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->bindParam(":access", $orig_access, PDO::PARAM_INT); + $rec2->bindParam(":querybin", $binname, PDO::PARAM_STR); + $rec2->execute(); + } + $dbh->commit(); + } + + // Start working on the gaps and ratelimit tables + + $ts = time(); + logit($logtarget, "Backuping existing tcat_error_ratelimit and tcat_error_gap information to your system's temporary directory."); + $targetfile = sys_get_temp_dir() . "/tcat_error_ratelimit_and_gap_$ts.sql"; + if (!file_exists($targetfile)) { + $cmd = "$bin_mysqldump --default-character-set=utf8mb4 -u$dbuser -h $hostname $database tcat_error_ratelimit tcat_error_gap > " . $targetfile; + system($cmd, $retval); + } else { + $retval = 1; // failure + } + if ($retval != 0) { + logit($logtarget, "I couldn't create a backup at $targetfile - perhaps the backup already exists? 
Aborting this upgrade step."); + } else { + logit($logtarget, $cmd); + $cmd = "$bin_gzip $targetfile"; + logit($logtarget, $cmd); + system($cmd); + logit($logtarget, "Backup placed here - you may want to store it somewhere else: " . $targetfile . '.gz'); + + // Fix issue described here https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197 + + $sql = "SELECT id FROM tcat_error_ratelimit WHERE CONVERT_TZ(end, '$badzone', 'UTC') < '$now' ORDER BY CONVERT_TZ(end, '$badzone', 'UTC') DESC LIMIT 1"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + $results2 = $rec2->fetch(PDO::FETCH_ASSOC); + $max_id = $results2['id']; + if (is_null($max_id)) { + $max_id = 0; // table is empty. + } + $dbh->beginTransaction(); + $sql = "UPDATE tcat_error_ratelimit SET start = CONVERT_TZ(start, '$badzone', 'UTC'), end = CONVERT_TZ(end, '$badzone', 'UTC') WHERE id <= $max_id"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + $dbh->commit(); + + /* + * First part: rate limits + */ + + /* + * Strategy: + * + * As recording of ratelimit continues in tcat_error_ratelimit, we build a tcat_error_ratelimit_upgrade table. + * For the entire timespan _before_ the new gauge behaviour became effective, we do a minute-interval reconstruction in this temporary upgrade table. + * Finally we throw away existing tcat_error_ratelimit entries from this era and insert the ones from our temporary table. 
+ * + */ + + $sql = "create temporary table if not exists tcat_error_ratelimit_upgrade ( id bigint, `type` varchar(32), start datetime not null, end datetime not null, tweets bigint not null, primary key(id, type), index(type), index(start), index(end) ) ENGINE=MyISAM"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + $sql = "select unix_timestamp(min(start)) as beginning_unix from tcat_error_ratelimit"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $results = $rec->fetch(PDO::FETCH_ASSOC); + $beginning_unix = $results['beginning_unix']; + if (is_null($beginning_unix)) { + $difference_minutes = 0; + } else { + $difference_minutes = round(($now_unix / 60 - $beginning_unix / 60) + 1); + } + logit($logtarget, "We have ratelimit information on this server for the past $difference_minutes minutes."); + logit($logtarget, "Reconstructing the rate limit for these now."); + + // If we have an end before a start time, we are sure we cannot trust any minute measurements before this occurence. + // This situation (end < start) was related to a bug in the toDateTime() function, which formatted the minute-part wrong. + + $sql = "select max(start) as time_fixed_dateformat, unix_timestamp(max(start)) as timestamp_fixed_dateformat from tcat_error_ratelimit where end < start"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $results = $rec->fetch(PDO::FETCH_ASSOC); + $ratelimit_time_fixed_dateformat = $results['time_fixed_dateformat']; + $ratelimit_timestamp_fixed_dateformat = $results['timestamp_fixed_dateformat']; + + $sql = "select max(start) as time_fixed_dateformat, unix_timestamp(max(start)) as timestamp_fixed_dateformat from tcat_error_gap where end < start"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $results = $rec->fetch(PDO::FETCH_ASSOC); + $gap_time_fixed_dateformat = $results['time_fixed_dateformat']; + $gap_timestamp_fixed_dateformat = $results['timestamp_fixed_dateformat']; + + // Compare query results and pick earliest possible moment of distrust. 
+ + if (!$ratelimit_time_fixed_dateformat && !$gap_time_fixed_dateformat) { + // Assume all measurements where hour-based until now. + logit($logtarget, "Dateformat fix not found"); + $time_fixed_dateformat = $now; + $timestamp_fixed_dateformat = $now_unix; + } elseif (!$ratelimit_time_fixed_dateformat) { + // We have only information in the gap table + logit($logtarget, "Dateformat fix solely in tcat_error_gap"); + $time_fixed_dateformat = $gap_time_fixed_dateformat; + $timestamp_fixed_dateformat = $gap_timestamp_fixed_dateformat; + } elseif (!$gap_time_fixed_dateformat) { + // We have only information in the ratelimit table + logit($logtarget, "Dateformat fix solely in tcat_error_ratelimit"); + $time_fixed_dateformat = $ratelimit_time_fixed_dateformat; + $timestamp_fixed_dateformat = $ratelimit_timestamp_fixed_dateformat; + } else { + // Compare table information + if ($gap_timestamp_fixed_dateformat > $ratelimit_timestamp_fixed_dateformat) { + logit($logtarget, "Dateformat fix learned from tcat_error_gap"); + $time_fixed_dateformat = $gap_time_fixed_dateformat; + $timestamp_fixed_dateformat = $gap_timestamp_fixed_dateformat; + } else { + logit($logtarget, "Dateformat fix learned from tcat_error_ratelimit"); + $time_fixed_dateformat = $ratelimit_time_fixed_dateformat; + $timestamp_fixed_dateformat = $ratelimit_timestamp_fixed_dateformat; + } + } + + logit($logtarget, "Dateformat fix found at '$time_fixed_dateformat'"); + + logit($logtarget, "Processing everything before MySQL date $now"); + + // Zero all minutes until the beginning of our capture era, for roles track and follow + + for ($i = 1; $i <= $difference_minutes; $i++) { + $sql = "insert into tcat_error_ratelimit_upgrade ( id, `type`, `start`, `end`, `tweets` ) values ( $i, 'track', + date_sub( date_sub('$now', interval $i minute), interval second(date_sub('$now', interval $i minute)) second ), + date_sub( date_sub('$now', interval " . ($i - 1) . " minute), interval second(date_sub('$now', interval " . 
($i - 1) . " minute)) second ), + 0 )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $sql = "insert into tcat_error_ratelimit_upgrade ( id, `type`, `start`, `end`, `tweets` ) values ( $i, 'follow', + date_sub( date_sub('$now', interval $i minute), interval second(date_sub('$now', interval $i minute)) second ), + date_sub( date_sub('$now', interval " . ($i - 1) . " minute), interval second(date_sub('$now', interval " . ($i - 1) . " minute)) second ), + 0 )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + if ($i % ($difference_minutes/100) == 0) { + logit($logtarget, "Creating temporary table " . round($i/$difference_minutes * 100) . "% completed"); + } + } + + logit($logtarget, "Building a new ratelimit table in temporary space"); + + $roles = array ( 'track', 'follow' ); + + foreach ($roles as $role) { + + logit($logtarget, "Handle rate limits for role $role"); + + /* + * Start reading the tcat_error_ratelimit table for the role we are working on. We are using the 'start' column because it contains sufficient information. 
+ */ + $sql = "select id, + `type` as role, + date_format(start, '%k') as measure_hour, + date_format(start, '%i') as measure_minute, + tweets as incr_record from tcat_error_ratelimit where `type` = '$role' + order by id desc"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $consolidate_hour = -1; // the hour we are working on to consolidate our data + $consolidate_minute = -1; // the minute we are working on to consolidate our data + $consolidate_max_id = -1; // the maximum tcat_error_ratelimit ID within the consolidation timeframe + while ($res = $rec->fetch()) { + // measure_minute will contain the minute we are reading from the table (remember: backwards in time) + $measure_minute = ltrim($res['measure_minute']); + if ($measure_minute == '') { + $measure_minute = 0; + } + // measure_hour will contain the minute we are reading from the table (again: backwards in time) + $measure_hour = $res['measure_hour']; + if ($measure_minute != $consolidate_minute || $measure_hour != $consolidate_hour) { + /* + * We are reading a new entry not inside our consolidation frame (which has the resolution of an hour or minute) + * We will consolidate our data, unless we are at the first row. + */ + if ($consolidate_minute == -1) { + // first row received + $consolidate_minute = $measure_minute; + $consolidate_hour = $measure_hour; + $consolidate_max_id = $res['id']; + } else { + $controller_restart_detected = false; + /* + * The SQL query below reads the MIN and MAX recorded tweets values for our interval. + * + * It additionally checks to detect controller resets. Whenever the controller resets itself, because of a crash or server reboot, + * the incremental counter will jump to zero. This SQL query recognizes this sudden jump by explicitely verifying the order. + * + * Note: this query uses max(start) to determine the start parameter to pass to the smoothing function. 
If we would've used min(start), + * we inadvertently include the start column of the NEXT row, and that's not our intention. Because we are using max(start), it is + * possible that the difference in minutes between the 'start' and 'end' becomes less than 1 minute. Our smoother function is + * aware of this. + * + */ + $sql = "select max(tweets) as record_max, + min(tweets) as record_min, + max(start) as start, unix_timestamp(max(start)) as start_unix, + max(end) as end, unix_timestamp(max(end)) as end_unix + from tcat_error_ratelimit where `type` = '$role' and + id >= " . $res['id'] . " and + id <= $consolidate_max_id and + ( select tweets from tcat_error_ratelimit where id = $consolidate_max_id ) > + ( select tweets from tcat_error_ratelimit where id = " . $res['id'] . " )"; + $rec2 = $dbh->prepare($sql); + $rec2->execute(); // our query will always return a non-empty result, because min()/max() always produce a row (with a possible NULL as value) + while ($res2 = $rec2->fetch()) { + if ($res2['record_max'] == null) { + // The order is NOT incremental. 
+ $controller_restart_detected = true; + } + $record_max = $res2['record_max']; + $record_min = $res2['record_min']; + $record = $record_max - $record_min; + if ($controller_restart_detected) { + } elseif ($record >= 0) { + ratelimit_smoother($dbh, $timestamp_fixed_dateformat, $role, $res2['start'], $res2['end'], $res2['start_unix'], $res2['end_unix'], $record); + } + } + $consolidate_minute = $measure_minute; + $consolidate_hour = $measure_hour; + $consolidate_max_id = $res['id']; + } + } + } + if ($consolidate_minute != -1) { + // we consolidate the last minute + $sql = "select max(tweets) as record_max, + min(tweets) as record_min, + min(start) as start, unix_timestamp(min(start)) as start_unix, + max(end) as end, unix_timestamp(max(end)) as end_unix + from tcat_error_ratelimit where `type` = '$role' and + id <= $consolidate_max_id"; + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + while ($res2 = $rec2->fetch()) { + $record_max = $res2['record_max']; + $record_min = $res2['record_min']; + $record = $record_max - $record_min; + if ($record > 0) { + ratelimit_smoother($dbh, $timestamp_fixed_dateformat, $role, $res2['start'], $res2['end'], $res2['start_unix'], $res2['end_unix'], $record); + } + } + } + } + + // By using a TRANSACTION block here, we ensure the tcat_error_ratelimit will not end up in an undefined state + + $dbh->beginTransaction(); + + $sql = "delete from tcat_error_ratelimit where start < '$now' or end < '$now'"; + $rec = $dbh->prepare($sql); + logit($logtarget, "Removing old records from tcat_error_ratelimit"); + $rec->execute(); + + $sql = "insert into tcat_error_ratelimit ( `type`, start, end, tweets ) select `type`, start, end, tweets from tcat_error_ratelimit_upgrade order by start asc"; + $rec = $dbh->prepare($sql); + logit($logtarget, "Inserting new records into tcat_error_ratelimit"); + $rec->execute(); + + /* + * The next operation will break the tie between the ascending order of the ID primary key, and the datetime columns start and 
end. This is not a problem per se. + * Rebuilding that order is feasible, but we shouldn't re-run this upgrade step anyway and this will never be presented as an option to the user. + * If something goes wrong, restore the original table from the backup instead. + */ + + $sql = "insert into tcat_status ( variable, value ) values ( 'ratelimit_database_rebuild', '1' )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + $dbh->commit(); + + logit($logtarget, "Rebuilding of tcat_error_ratelimit has finished"); + $sql = "drop table tcat_error_ratelimit_upgrade"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + /* + * Second part: gaps + * + * Notice we do all datetime functions in native MySQL. This may appear to be cumbersome but is has the advantage of having to do a minimal ammount of datetime conversions, + * and being able to mostly ignore the system clock (OS), with the single exception of the reduce_gap_size() function. + * The gap table is not big and this upgrade step should maximally take several hours on long-running servers. + */ + + logit($logtarget, "Now rebuilding tcat_error_gap table"); + + // Fix issue described here https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197 + + $sql = "SELECT id FROM tcat_error_gap WHERE CONVERT_TZ(end, '$badzone', 'UTC') < '$now' ORDER BY CONVERT_TZ(end, '$badzone', 'UTC') DESC LIMIT 1"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + $results2 = $rec2->fetch(PDO::FETCH_ASSOC); + $max_id = $results2['id']; + if (is_null($max_id)) { + $max_id = 0; // table is empty. 
+ } + $dbh->beginTransaction(); + $sql = "UPDATE tcat_error_gap SET start = CONVERT_TZ(start, '$badzone', 'UTC'), end = CONVERT_TZ(end, '$badzone', 'UTC') WHERE id <= $max_id"; + logit($logtarget, "$sql"); + $rec2 = $dbh->prepare($sql); + $rec2->execute(); + $dbh->commit(); + + $existing_roles = array ( 'track', 'follow', 'onepercent' ); + foreach ($existing_roles as $type) { + + $time_begin_gap = $timestamp_begin_gap = null; + + // Note: 1970-01-01 is the Unix timestamp for NULL. It was written to the database whenever there was a gap with an 'unknown' start time, + // due to the fact that there was no proc/ information available to the controller. This behaviour has changed. + + $sql = "select min(start) as time_begin_gap, unix_timestamp(min(start)) as timestamp_begin_gap FROM tcat_error_gap where type = '$type' and start > '1970-01-01 01:01:00'"; + $rec = $dbh->prepare($sql); + $rec->execute(); + if ($rec->execute() && $rec->rowCount() > 0) { + while ($row = $rec->fetch(PDO::FETCH_ASSOC)) { + $time_begin_gap = $row['time_begin_gap']; + $timestamp_begin_gap = $row['timestamp_begin_gap']; + } + } + + if (!$now || !$now_unix || !$time_begin_gap || !$timestamp_begin_gap) { + logit($logtarget, "Nothing to do for role $type"); + continue; + } + + $difference_minutes = round(($now_unix / 60 - $timestamp_begin_gap / 60) + 1); + logit($logtarget, "For role $type, we have gap information on this server for the past $difference_minutes minutes."); + + $gaps = array(); + + $sql = "select * from tcat_error_gap where type = '$type' and start > '1970-01-01 01:01:00' and end < '$now' order by id, start asc"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $ignore_start = $already_recorded_until = null; + $trust_minute_measurement = false; + while ($row = $rec->fetch(PDO::FETCH_ASSOC)) { + if ($row['start'] == $ignore_start) { continue; } + // If we are being told about a gap we already know, skip it + if ($already_recorded_until) { + $sql2 = "select '" . $row['start'] . 
"' > '$already_recorded_until'"; + $rec2 = $dbh->prepare($sql2); + $rec2->execute(); + $later_in_time = $rec2->fetchColumn(); + if ($later_in_time != '1') { + // Not registering the gap starting at $row['start'] here, because it is already accounted for. + continue; + } + } + // If we know for a fact the minute measurement is accurate, we work with that precision, and otherwise + // we try to attain it by searching the real capture data. + if (!$trust_minute_measurement) { + $sql2 = "select '" . $row['start'] . "' > '$time_fixed_dateformat'"; + $rec2 = $dbh->prepare($sql2); + $rec2->execute(); + $later_in_time = $rec2->fetchColumn(); + if ($later_in_time == '1') { + $trust_minute_measurement = true; + } + } + // The controller could create repeated rows with the same 'start' value if it didn't manage to boot up a role + // The next query recognizes this. + $sql2 = "select max(end) as max_end from tcat_error_gap where type = '$type' and start = '" . $row['start'] . "'"; + $rec2 = $dbh->prepare($sql2); + $rec2->execute(); + $max_end = null; + while ($row2 = $rec2->fetch(PDO::FETCH_ASSOC)) { + $max_end = $row2['max_end']; + break; + } + if ($max_end) { + // Example: '2016-04-19 03:12:44' + if (preg_match("/^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})$/", $max_end, $matches_end) && + preg_match("/^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})$/", $row['start'], $matches_start)) { + + if (!$trust_minute_measurement) { + + // Drop a distrusted minute measurement due to previous dateformat bug + // This first defines the gap as wide as possible (with an hourly precision). Afterwards we prune it by searching the real capture data. + + $matches_start[5] = '00'; // minutes start + $matches_start[6] = '00'; // seconds start + $matches_end[5] = '59'; // minutes end + $matches_end[6] = '59'; // seconds end + + $new_start = $matches_start[1] . '-' . $matches_start[2] . '-' . $matches_start[3] . ' ' . + $matches_start[4] . ':' . $matches_start[5] . ':' . 
$matches_start[6]; + $new_end = $matches_end[1] . '-' . $matches_end[2] . '-' . $matches_end[3] . ' ' . + $matches_end[4] . ':' . $matches_end[5] . ':' . $matches_end[6]; + + // Now attempt to reduce the gap size + + $reduced = reduce_gap_size($type, $new_start, $new_end); + if (is_null($reduced)) { + logit($logtarget, "Erroneous gap report for role $type from '" . $new_start . "' to '" . $new_end . "'"); + } else { + $new_start = $reduced['shrunk_start']; + $new_end = $reduced['shrunk_end']; + } + + } else { + + // Just copy both complete strings + + $new_start = $matches_start[0]; + $new_end = $matches_end[0]; + + // logit($logtarget, "We trust the next recorded gap without search"); + + } + + // logit($logtarget, "Detected possible gap from '" . $new_start . "' to '" . $new_end . "' - now investigating"); + + logit($logtarget, "Recording gap for role $type from '" . $new_start . "' to '" . $new_end . "'"); + $duplicate = false; + foreach ($gaps as $gap) { + if ($gap['start'] == $new_start && $gap['end'] == $new_end) { + $duplicate = true; + } + } + if (!$duplicate) { + $gap = array( 'start' => $new_start, 'end' => $new_end ); + $gaps[] = $gap; + } + + $ignore_start = $row['start']; + $already_recorded_until = $new_end; + + } + } + } + + // By using a TRANSACTION block here, we ensure the tcat_error_gap will not end up in an undefined state + + $dbh->beginTransaction(); + + $sql = "delete from tcat_error_gap where type = '$type' and end <= '$now'"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + // Knit gap timespans togheter if they are absolutely sequential. + + $newgaps = array(); + $first = true; + $previous_start = $previous_end = null; + foreach ($gaps as $gap) { + if ($first) { + $previous_start = $gap['start']; + $previous_end = $gap['end']; + $first = false; + continue; + } + $sql = "select timediff('" . $gap['start'] . 
"', '$previous_end') as difference"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $difference = null; + while ($row = $rec->fetch(PDO::FETCH_ASSOC)) { + $difference = $row['difference']; + break; + } + // This one second difference will be produced by us widening an hourly measurement as much as possible + // (and not finding any real tweet data to shrink it) + if ($gap !== end($gaps) && isset($difference) && $difference == '00:00:01') { + // Keep on knittin' + $previous_end = $gap['end']; + } else { + $newgaps[] = array ( 'start' => $previous_start, 'end' => $previous_end ); + $previous_start = $gap['start']; + $previous_end = $gap['end']; + } + } + // The knitting won't produce a result in a situation with only a single gap record + if (count($gaps) > 1) { + $gaps = $newgaps; + } + + foreach ($gaps as $gap) { + $sql = "insert into tcat_error_gap ( `type`, `start`, `end` ) values ( '$type', '" . $gap['start'] . "', '" . $gap['end'] . "' )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + } + + // The final step is to prune tcap_error_gap to remove all gaps lesser than IDLETIME + if (!defined('IDLETIME')) { + define('IDLETIME', 600); + } + if (!defined('IDLETIME_FOLLOW')) { + define('IDLETIME_FOLLOW', IDLETIME); + } + if ($type == 'follow') { + $idletime = IDLETIME_FOLLOW; + } else { + $idletime = IDLETIME; + } + $sql = "delete from tcat_error_gap where time_to_sec(timediff(end,start)) < $idletime"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + $dbh->commit(); + + } + + logit($logtarget, "Rebuilding of tcat_error_gap has finished"); + + // For ratelimit exports to function, we want to have the tcat_captured_phrases table + // As we have not recorded this (yet), we need to reconstruct it from the previous phrases and detect which tweets contain which queries. 
+ logit($logtarget, "Creating the tcat_captured_phrases table"); + + create_admin(); + $trackbins = get_track_bin_phrases(); + + foreach ($trackbins as $querybin => $phrases) { + logit($logtarget, "Extracting keyword phrase matches from querybin $querybin and inserting into tcat_captured_phrases .."); + foreach ($phrases as $phrase => $phrase_id) { + if (substr($phrase, 0, 1) !== "'" && strpos($phrase, ' ') !== false) { + // The user intends a AND match here, such as: [ scottish AND independence ] + // Both words should be in the tweet, but not neccessarily next to each other ( according to the documentation: https://github.com/digitalmethodsinitiative/dmi-tcat/wiki/FAQ#keyword-track ) + $subphrases = explode(' ', $phrase); + $sql = "REPLACE INTO tcat_captured_phrases ( tweet_id, phrase_id, created_at ) SELECT id, $phrase_id, created_at FROM " . quoteIdent($querybin . "_tweets") . " WHERE text REGEXP ?"; + if (count($subphrases) > 1) { + $sql .= str_repeat(" AND text REGEXP ?", count($subphrases) - 1); + } + $rec = $dbh->prepare($sql); + $i = 1; + foreach ($subphrases as $subphrase) { + $regexp = "[[:<:]]" . $subphrase . "[[:>:]]"; + $rec->bindParam($i, $regexp, PDO::PARAM_STR); + $i++; + } + $rec->execute(); + } else { + // The user intends an exact string match here, such as: [ 'scottish independence' ] + // Or the keyword string is a simple, single-word phrase + $phrasematch = str_replace("'", "", $phrase); // replace any occurances of the quoting character + $sql = "REPLACE INTO tcat_captured_phrases ( tweet_id, phrase_id, created_at ) SELECT id, $phrase_id, created_at FROM " . quoteIdent($querybin . "_tweets") . " WHERE text REGEXP :regexp"; + $rec = $dbh->prepare($sql); + $regexp = "[[:<:]]" . $phrasematch . 
"[[:>:]]"; + $rec->bindParam(":regexp", $regexp, PDO::PARAM_STR); + $rec->execute(); + } + // logit($logtarget, $sql); // print SQL statements for debugging purposes + } + } + + // Indicate to the analytics panel we have fully executed this upgrade step and export functions can become available + + $sql = "update tcat_status set value = 2 where variable = 'ratelimit_database_rebuild'"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + logit($logtarget, "Phrases table successfully built. Gap and ratelimit export features have now been unlocked."); + + } + } + } + } + } + // 24/05/2016 Fix index of tweet_id on withheld tables $query = "SHOW TABLES"; @@ -619,7 +1344,7 @@ function upgrades($dry_run = false, $interactive = true, $aulevel = 2, $single = if ($update && $dry_run) { $suggested = true; } - if ($update) { + if ($update && $dry_run == false) { if ($ans !== 'a') { $ans = cli_yesnoall("Fixing index of tweet_id on table $v", 0, '2f1c585fac9e2646951bb44f61e864f4488a37e6'); } @@ -635,6 +1360,10 @@ function upgrades($dry_run = false, $interactive = true, $aulevel = 2, $single = // End of upgrades + if ($required == true && $suggested == true) { + $required = true; $suggested = false; // only return the strongest option + } + if ($dry_run) { return array( 'suggested' => $suggested, 'required' => $required ); } @@ -700,4 +1429,248 @@ function upgrades($dry_run = false, $interactive = true, $aulevel = 2, $single = } +/* + * Smoothing function for ratelimit re-assembly + */ +function ratelimit_smoother($dbh, $timestamp_fixed_dateformat, $role, $start, $end, $start_unix, $end_unix, $tweets) { + $minutes_difference = round(abs($end_unix / 60 - $start_unix / 60)); + if ($tweets <= 0) return; + if ($minutes_difference > 61) { + + // TCAT has ratelimit data with a big time difference (more than an hour) + // Consolidate around the last hour and create an average tweets per minute. 
+ + $avg_tweets_per_minute = round($tweets / 60); + if ($avg_tweets_per_minute == 0) { + return; + } + + $sql = "update tcat_error_ratelimit_upgrade set tweets = $avg_tweets_per_minute where `type` = '$role' and + start >= date_sub( date_sub( date_sub( '$end', interval second('$end') second ), interval minute('$end') minute), interval 1 hour ) and + end <= date_sub( date_sub( '$end', interval second('$end') second ), interval minute('$end') minute )"; + + $rec = $dbh->prepare($sql); + $rec->execute(); + + return; + } + + if ($start_unix > $timestamp_fixed_dateformat) { + + // Within this timeframe, the minute-part of the timestamp can be trusted + // as a result of fix: https://github.com/digitalmethodsinitiative/dmi-tcat/commit/5385937cc38869ba0a6e9a2ace7875afe7eb1256 + + if ($minutes_difference == 0) { + + // TCAT is already capturing and registering time ratelimits per MINUTE here, + // But is recording multiple hits within a single minute. We will consolidate those to :00 - :00 of whole minutes + // as per the new gauge measurement style. 
+ + $sql = "update tcat_error_ratelimit_upgrade set tweets = $tweets where `type` = '$role' and + start >= date_sub( '$start', interval second('$start') second ) and + end <= date_add( date_sub( '$end', interval second('$end') second ), interval 1 minute)"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + } elseif ($minutes_difference > 0 && $minutes_difference < 59) { + + $avg_tweets_per_minute = round($tweets / $minutes_difference); + if ($avg_tweets_per_minute == 0) { + return; + } + + // TCAT is already capturing and registering time ratelimits per MINUTE here + // We keep the tweet record, but strip the seconds + + $sql = "update tcat_error_ratelimit_upgrade set tweets = $avg_tweets_per_minute where `type` = '$role' and + start >= date_sub( '$start', interval second('$start') second ) and + end <= date_sub( '$end', interval second('$end') second )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + } else if ($minutes_difference <= 61) { + + // TCAT has ratelimit data with an HOURLY precision here, round minutes to start and end of the measurement hour + // and create an average tweets per minute. + + $avg_tweets_per_minute = round($tweets / 60); + if ($avg_tweets_per_minute == 0) { + return; + } + + $sql = "update tcat_error_ratelimit_upgrade set tweets = $avg_tweets_per_minute where `type` = '$role' and + start >= date_sub( date_sub( '$start', interval second('$start') second ), interval minute('$start') minute) and + end <= date_sub( date_sub( '$end', interval second('$end') second ), interval minute('$end') minute )"; + + $rec = $dbh->prepare($sql); + $rec->execute(); + + } + + } else { + + // Within this timeframe, the minute-part of the timestamp cannot be trusted. + + // TCAT has ratelimit data with an HOURLY precision here (with an erroneous minute measurement). 
+ + if ($minutes_difference == 0) { + + // We have multiple (untrusted) measurements within one hour; consolidate around the whole hour + + $avg_tweets_per_minute = round($tweets / 60); + if ($avg_tweets_per_minute == 0) { + return; + } + + $sql = "update tcat_error_ratelimit_upgrade set tweets = $avg_tweets_per_minute where `type` = '$role' and + start >= date_sub( date_sub( '$start', interval second('$start') second ), interval minute('$start') minute) and + end <= date_add( date_sub( date_sub( '$end', interval second('$end') second ), interval minute('$end') minute ), interval 1 hour)"; + + $rec = $dbh->prepare($sql); + $rec->execute(); + + } else { + + // We have an trusted hourly measurement; and the difference between the previous rate limit hit is not more than one hour. + // Consolidate around the hour. + + $avg_tweets_per_minute = round($tweets / 60); + if ($avg_tweets_per_minute == 0) { + return; + } + + $sql = "update tcat_error_ratelimit_upgrade set tweets = $avg_tweets_per_minute where `type` = '$role' and + start >= date_sub( date_sub( '$start', interval second('$start') second ), interval minute('$start') minute) and + end <= date_sub( date_sub( '$end', interval second('$end') second ), interval minute('$end') minute )"; + + $rec = $dbh->prepare($sql); + $rec->execute(); + + } + + } + +} + +/* + * This function takes two MySQL formatted datetime strings. These parameters are the widest possible gap (defined with HOURLY accuracy). + * The function seeks to improve the accuracy as much as possible by searching real capture data across all bins of the same type. + * If any data is found inside our gap-frame. We shrink the gap and return the new dimensions. 
+ */ +function reduce_gap_size($type, $start, $end) { + global $all_bins; + $dbh = pdo_connect(); + + $shrunk_start = $start; + $shrunk_end = $end; + + $sql = "create temporary table gap_searcher ( measurement datetime primary key )"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + foreach ($all_bins as $bin) { + + // Filter to only consider bins with the tracking role under consideration + $bintype = getBinType($bin, $dbh); + if ($bintype == 'geotrack') { $bintype = 'track'; } + if ($bintype != $type) { + continue; + } + + // This SQL query performs an explicit cast to handle the problems with created_at and timezones described here https://github.com/digitalmethodsinitiative/dmi-tcat/issues/197 + // We compare it with the dates we have in the gap table, which is the date specified by config.php + $sql = "insert ignore into gap_searcher select created_at from $bin" . "_tweets + where created_at > '$start' and created_at < '$end'"; + $rec = $dbh->prepare($sql); + $rec->execute(); + } + + $sql = "select measurement from gap_searcher order by measurement asc"; + $rec = $dbh->prepare($sql); + $rec->execute(); + $date_previous = null; + $biggest_gap = -1; + $biggest_gap_start = $biggest_gap_end = null; + while ($row = $rec->fetch(PDO::FETCH_ASSOC)) { + $date = $row['measurement']; + if (is_null($date_previous)) { + $date_previous = $date; + continue; + } + $sql2 = "select timediff('$date', '$date_previous') as gap_size"; + $rec2 = $dbh->prepare($sql2); + $rec2->execute(); + $gap_size = null; + while ($row2 = $rec2->fetch(PDO::FETCH_ASSOC)) { + if (isset($row2['gap_size'])) { + $gap_size = $row2['gap_size']; + } + } + if ($gap_size) { + if (preg_match("/^(\d{2}):(\d{2}):(\d{2})$/", $gap_size, $matches)) { + $hours = intval($matches[1]); $minutes = intval($matches[2]); $seconds = intval($matches[3]); + $gap_in_seconds = $seconds + $minutes * 60 + $hours * 3600; + if (!defined('IDLETIME')) { + define('IDLETIME', 600); + } + if (!defined('IDLETIME_FOLLOW')) { + 
define('IDLETIME_FOLLOW', IDLETIME); + } + // As per controller behaviour, we do not consider this a gap. + if ($type == 'follow' && $gap_in_seconds < IDLETIME_FOLLOW || + $type != 'follow' && $gap_in_seconds < IDLETIME) { + // As per controller behaviour, we do not consider this a gap. + continue; + } + if ($gap_in_seconds > $biggest_gap) { + $biggest_gap = $gap_in_seconds; + $biggest_gap_start = $date_previous; + $biggest_gap_end = $date; + } + } + } + $date_previous = $date; + } + + if ($biggest_gap !== -1) { + $shrunk_start = $biggest_gap_start; + $shrunk_end = $biggest_gap_end; + } + + if ($biggest_gap == 1) { + // This is a situation where there doesn't appear to be a real data gap + return null; + } + + $sql = "drop table gap_searcher"; + $rec = $dbh->prepare($sql); + $rec->execute(); + + $dbh = null; + return array( 'shrunk_start' => $shrunk_start, 'shrunk_end' => $shrunk_end ); +} + +function get_executable($binary) { + $where = `which $binary`; + $where = trim($where); + if (!is_string($where) || !file_exists($where)) { + return null; + } + return $where; +} + +// Returns an array with all the (active or non-active) track bins and their associated phrases (also the no longer running ones) +function get_track_bin_phrases() { + $dbh = pdo_connect(); + $sql = "SELECT b.querybin, p.phrase, p.id FROM tcat_query_bins b, tcat_query_phrases p, tcat_query_bins_phrases bp WHERE b.type = 'track' AND bp.querybin_id = b.id AND bp.phrase_id = p.id"; + $rec = $dbh->prepare($sql); + $querybins = array(); + if ($rec->execute() && $rec->rowCount() > 0) { + while ($res = $rec->fetch()) { + $querybins[$res['querybin']][$res['phrase']] = $res['id']; + } + } + $dbh = false; + return $querybins; +} diff --git a/config.php.example b/config.php.example index 210a8152..9e4a67c0 100644 --- a/config.php.example +++ b/config.php.example @@ -127,11 +127,6 @@ define('AUTOUPDATE_LEVEL', 'trivial'); */ define('IDLETIME', 600); -/* - * To avoid excessive verbosity, assume a minimal length 
of ratetime disturbance (heartbeat) in seconds - */ -define('RATELIMIT_SILENCE', 300); - /* * Report rate limit problems to the administrator every x hours ( 0 = no mail reporting ) */ diff --git a/helpers/export.php b/helpers/export.php index ad4a9db0..068a447c 100755 --- a/helpers/export.php +++ b/helpers/export.php @@ -307,11 +307,14 @@ function env_is_cli() { fputs($fh, "-- DMI-TCAT - Update TCAT tables\n"); fputs($fh, "--\n"); -$sql = "INSERT INTO tcat_query_bins ( querybin, `type`, active, visible ) values ( " . $dbh->Quote($bin) . ", " . $dbh->Quote($bintype) . ", 0, 1 );"; +$sql = "INSERT INTO tcat_query_bins ( querybin, `type`, active, access ) values ( " . $dbh->Quote($bin) . ", " . $dbh->Quote($bintype) . ", 0, 0 );"; fputs($fh, $sql . "\n"); if ($bintype == 'track') { + // Notice: We do not export information from the tcat_captured_phrases table here. Because we allow adding of phrasing and querybins to an existing tcat installation, + // the phrase IDs will change and it would not be safe to simply copy the data (phrase id <-> phrase text would no longer match) + foreach ($phrases as $phrase) { $sql = "INSERT INTO tcat_query_phrases ( phrase ) values ( " . $dbh->Quote($phrase) . " );"; fputs($fh, $sql . 
"\n"); diff --git a/helpers/import.php b/helpers/import.php index a7972d03..3a7ce7e4 100755 --- a/helpers/import.php +++ b/helpers/import.php @@ -58,7 +58,7 @@ function env_is_cli() { if (preg_match("/^-- Table structure for table `(.*)_tweets`/", $line, $matches)) { array_push($queryBins, $matches[1]); } - if (preg_match("/^INSERT INTO tcat_query_bins \( querybin, `type`, active, visible \) values \( '(.*?)',/", $line, $matches)) { + if (preg_match("/^INSERT INTO tcat_query_bins \( querybin, `type`, active, access \) values \( '(.*?)',/", $line, $matches)) { array_push($queryBins, $matches[1]); } } diff --git a/helpers/tcat-install-linux.sh b/helpers/tcat-install-linux.sh index 6605fbac..c3e251c9 100755 --- a/helpers/tcat-install-linux.sh +++ b/helpers/tcat-install-linux.sh @@ -21,7 +21,7 @@ # - Ubuntu 14.04 # - Ubuntu 15.04 # - Ubuntu 15.10 -# - Debian 8.1 +# - Debian 8.* # #---------------------------------------------------------------- @@ -1201,6 +1201,12 @@ password="${TCATMYSQLPASS}" EOF echo "$PROG: account details saved: $FILE" +# Install MySQL server timezone data + +mysql_tzinfo_to_sql /usr/share/zoneinfo | mysql --defaults-file="$MYSQL_USER_ADMIN_CNF" mysql + +# Create twittercapture database + echo "CREATE DATABASE IF NOT EXISTS twittercapture DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_unicode_ci;" | mysql --defaults-file="$MYSQL_USER_ADMIN_CNF" echo "GRANT CREATE, DROP, LOCK TABLES, ALTER, DELETE, INDEX, INSERT, SELECT, UPDATE, CREATE TEMPORARY TABLES ON twittercapture.* TO '$TCATMYSQLUSER'@'localhost' IDENTIFIED BY '$TCATMYSQLPASS';" | mysql --defaults-file="$MYSQL_USER_ADMIN_CNF" echo "FLUSH PRIVILEGES;" | mysql --defaults-file="$MYSQL_USER_ADMIN_CNF" diff --git a/import/import-gnip.php b/import/import-gnip.php index c3a57e9b..c28756a2 100644 --- a/import/import-gnip.php +++ b/import/import-gnip.php @@ -4,6 +4,7 @@ die; // only run from command line include_once __DIR__ . '/../config.php'; +include_once __DIR__ . 
'/../common/constants.php'; include_once __DIR__ . '/../common/functions.php'; include_once __DIR__ . '/../capture/common/functions.php'; diff --git a/import/import-jsondump.php b/import/import-jsondump.php index 84291da4..8da8f392 100644 --- a/import/import-jsondump.php +++ b/import/import-jsondump.php @@ -3,6 +3,7 @@ if ($argc < 1) die; // only run from command line include_once __DIR__ . '/../config.php'; +include_once __DIR__ . '/../common/constants.php'; include_once __DIR__ . '/../common/functions.php'; include_once __DIR__ . '/../capture/common/functions.php'; diff --git a/import/import-timeline.php b/import/import-timeline.php index 194f145e..59d5c3e1 100644 --- a/import/import-timeline.php +++ b/import/import-timeline.php @@ -1,6 +1,7 @@