Permalink
Browse files

Part 5: Playing Fair

Since we are writing a program that makes use of the public
infrastructure that is the Internet, it makes sense to play fair and
make our programs behave properly so that we can avoid clashes with the
webmasters, or even worse, retaliation.

Webmasters may not want their site to be spidered by other computers,
and they have a way to say it clearly with the robots.txt file. Inside a
robots.txt file, which is a simple text file stating one or more rules,
a webmaster can describe if scraping is allowed at all, or otherwise
limit scraping to only some files and folders, or only to some specific
spiders.

Our plan for playing fair is to read the site’s robots file, and stay
well clear of the actual content if we are not allowed to read it. The
wisest thing to do here is to stand on the shoulders of giants, since
there are already other people who have done this in the past. Luckily,
some of these folks have shared their efforts for everyone else
to use, which means that we do not have to reinvent the wheel, and will
only write a few lines of code to read the robots.txt file and make our
program compliant.
  • Loading branch information...
edvella
edvella committed Jun 19, 2014
1 parent 6182218 commit 068d539f5306e43c42e3830c04dedfcd96b61524
View
@@ -1,4 +1,10 @@
Keyword Checker (Beginner's C# Tutorial Series)
===============================================
This project is the accompanying code for the beginner's C# tutorial at [edvella.com](<http://www.edvella.com/Post/20/Beginners_Scraping_With_C_Part_1>).
This project is the accompanying code for the beginner's C# tutorial at [edvella.com](<http://www.edvella.com/Post/20/Beginners_Scraping_With_C_Part_1>).
Part 1: [Beginner's Scraping With C#](http://www.edvella.com/Post/20/Beginners_Scraping_With_C_Part_1)
Part 2: [Command Line Parameters](http://www.edvella.com/Post/21/Beginners_C_Part_2_Command_Line_Parameters)
Part 3: [GUI Days](http://www.edvella.com/Post/22/Beginners_C_Part_3_GUI_Days)
Part 4: [Regular Expressions](http://www.edvella.com/Post/23/Beginners_C_Part_4_Regular_Expressions)
Part 5: [Playing Fair](http://www.edvella.com/Post/24/Playing_Fair)
@@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8" ?>
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>
</configuration>
<appSettings>
</appSettings></configuration>
View
@@ -1,4 +1,5 @@
using System;
using RobotsTxt;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
@@ -14,39 +15,50 @@ namespace keywordChecker
{
public partial class frmSettings : Form
{
WebClient client;
public frmSettings()
{
InitializeComponent();
}
// Click handler for the Check button: downloads the page at the entered url,
// looks for the entered keywords and shows the outcome in lblResult.
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// removed and the added version of the handler body appear interleaved below.
private void btnCheck_Click(object sender, EventArgs e)
{
var client = new WebClient();
// Assigns the class-level client field, which CheckRobots also uses.
client = new WebClient();
var url = txtUrl.Text;
// Fall back to a default site when the text box is empty or is not a well-formed absolute URI.
url = !string.IsNullOrEmpty(url) && Uri.IsWellFormedUriString(url, UriKind.Absolute) ?
url : "http://www.gametrailers.com";
var keywords = txtKeywords.Text;
// Fall back to a default search phrase when no keywords were entered.
keywords = !string.IsNullOrEmpty(keywords) ? keywords : "final fantasy";
var pageContent = client.DownloadString(url);
var keywordLocation = pageContent.IndexOf(keywords, StringComparison.OrdinalIgnoreCase);
StringBuilder sb = new StringBuilder();
if (keywordLocation >= 0)
// Only download and inspect the page when CheckRobots reports the url as allowed.
if (CheckRobots(url))
{
var pageIds = Regex.Matches(pageContent, @"id=""\s*?\S*?""");
string matchedId = closestId(keywordLocation, pageIds);
string idTag = matchedId.Substring(4, matchedId.Length - 5);
brwPreview.Navigate(url + "#" + idTag);
sb.AppendFormat("{0} are talking about {1} today.", url, keywords);
sb.Append("\n\nSnippet:\n" + pageContent.Substring(keywordLocation, 100));
sb.AppendFormat("\n\nClosest id: {0}", idTag);
var pageContent = client.DownloadString(url);
// Case-insensitive search; IndexOf returns -1 when the keywords do not occur.
var keywordLocation = pageContent.IndexOf(keywords, StringComparison.InvariantCultureIgnoreCase);
StringBuilder sb = new StringBuilder();
if (keywordLocation >= 0)
{
// Collect every id="..." attribute in the page so the preview can jump near the keyword.
var pageIds = Regex.Matches(pageContent, @"id=""\s*?\S*?""");
string matchedId = closestId(keywordLocation, pageIds);
// Drops the first four characters and the last one; presumably matchedId has the
// form id="value" so this leaves just the value - confirm against closestId.
string idTag = matchedId.Substring(4, matchedId.Length - 5);
// Navigate the preview browser to the page, using the id as the fragment.
brwPreview.Navigate(url + "#" + idTag);
sb.AppendFormat("{0} are talking about {1} today.", url, keywords);
// NOTE(review): Substring throws ArgumentOutOfRangeException when fewer than
// 100 characters follow keywordLocation in pageContent.
sb.Append("\n\nSnippet:\n" + pageContent.Substring(keywordLocation, 100));
sb.AppendFormat("\n\nClosest id: {0}", idTag);
}
else
{
sb.Append("Keyword not found!");
}
lblResult.Text = sb.ToString();
}
else
{
sb.Append("Keyword not found!");
// CheckRobots returned false, so the page is never downloaded.
lblResult.Text = "Blocked by robots.txt!";
}
lblResult.Text = sb.ToString();
}
private string closestId(int keywordLocation, MatchCollection matchingIds)
@@ -72,5 +84,25 @@ private string closestId(int keywordLocation, MatchCollection matchingIds)
}
return closestIdName;
}
/// <summary>
/// Checks if a url is allowed to be spidered by our bot.
/// </summary>
/// <remarks>
/// Downloads <c>/robots.txt</c> from the url's scheme-and-authority root and
/// evaluates its rules for the "keywordChecker" user agent. If the file
/// cannot be downloaded, the url is treated as allowed.
/// </remarks>
/// <param name="url">The absolute url to check.</param>
/// <returns>True if allowed (or if no robots.txt could be downloaded), false if not.</returns>
private bool CheckRobots(string url)
{
    var target = new Uri(url);
    var robotsFileLocation = target.GetLeftPart(UriPartial.Authority) + "/robots.txt";
    try
    {
        var robotsFileContent = client.DownloadString(robotsFileLocation);
        Robots robots = Robots.Load(robotsFileContent);

        // robots.txt rules are expressed as paths within the site, so hand the
        // checker only the path and query of the url, not the whole address.
        return robots.IsPathAllowed("keywordChecker", target.PathAndQuery);
    }
    catch (WebException)
    {
        // robots.txt is missing or unreachable: nothing tells us to stay away.
        return true;
    }
}
}
}
@@ -32,6 +32,9 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="RobotsTxt">
<HintPath>..\packages\RobotsTxt.2014.02.19\lib\RobotsTxt.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
@@ -64,6 +67,7 @@
<AutoGen>True</AutoGen>
<DependentUpon>Resources.resx</DependentUpon>
</Compile>
<None Include="packages.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="RobotsTxt" version="2014.02.19" targetFramework="net45" />
</packages>
Binary file not shown.

Some generated files are not rendered by default. Learn more.

Oops, something went wrong.
Binary file not shown.
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<repositories>
<repository path="..\keywordChecker\packages.config" />
</repositories>

0 comments on commit 068d539

Please sign in to comment.