-
Notifications
You must be signed in to change notification settings - Fork 0
/
VLGroupScraper.cs
127 lines (108 loc) · 3.18 KB
/
VLGroupScraper.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Core.Model;
using FileHelpers;
using HtmlAgilityPack;
namespace Scraper
{
public class VLGroupScraper
{
// Known male first names, loaded once from data/drenge.csv when the type is initialised.
// Materialised into a HashSet so that each lookup in GetGender is a hash probe instead of
// a fresh pass (with a Trim per entry) over a deferred LINQ query. Blank entries are
// skipped so a stray empty record cannot throw on Trim.
private static readonly HashSet<string> malenames =
    new HashSet<string>(
        GetListFromCVS("data/drenge.csv")
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Select(name => name.Trim()));

// Known female first names, loaded once from data/piger.csv; same treatment as malenames.
private static readonly HashSet<string> femalenames =
    new HashSet<string>(
        GetListFromCVS("data/piger.csv")
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Select(name => name.Trim()));
/// <summary>
/// Downloads the group overview page, keeps the groups whose name is one of the
/// hard-coded relevant areas, and returns the members of those groups.
/// </summary>
/// <returns>
/// A parallel query (up to 10 groups at a time) that yields the members of every
/// relevant group. The group pages are fetched when the result is enumerated, not
/// when this method returns. An empty sequence is returned when the overview page
/// contains no group links.
/// </returns>
public static IEnumerable<Member> GetMembers()
{
    // Only used for membership tests, so a set is the natural container.
    var relevantAreas = new HashSet<string>
    {
        "København",
        "Århus",
        "Aarhus",
        "Aalborg",
        "Ålborg",
        "Vestsjælland",
        "Kolding",
        "Midt- og Nordvestjylland",
        "Esbjerg",
        "Haderslev-Kolding",
        "Østjylland",
        "Vendsyssel",
        "Fyn",
        "Fredericia",
        "Helsingør",
        "Vejle",
        "Viborg",
        "Horsens",
        "Sydjylland",
        "Midt-Østjylland",
        "Lillebælt",
        "Frederiksborg",
        "Trekantområdet",
        "Sønderjylland",
        "Sydvest Jylland",
    };

    var doc = Util.LoadDoc(
        Util.GetHtml("http://iframe.vl.dk/gruppeoversigt.php", Encoding.UTF8));
    var rows = doc.DocumentNode.
        SelectNodes("//table[@id='table-1']/tbody/tr/td[@width='40%']/a");

    // SelectNodes returns null (not an empty collection) when nothing matches;
    // report that instead of failing with a NullReferenceException below.
    if (rows == null)
    {
        Console.WriteLine("No group links found on the overview page");
        return Enumerable.Empty<Member>();
    }

    // Materialise once: the list is walked for the "ignored" log line and again
    // for the actual scrape. Anchors without an href cannot be followed, so skip them.
    var allgroups = rows.OfType<HtmlNode>()
        .Where(link => link.Attributes["href"] != null)
        .Select(link => new
        {
            Name = link.InnerText.Trim(),
            Url = link.Attributes["href"].Value,
        })
        .ToList();

    var nondanish = allgroups.Where(g => !relevantAreas.Contains(g.Name));
    Console.WriteLine("Ignoring {0}", string.Join(", ", nondanish.Select(x => x.Name)));

    var danishGroups = allgroups.Where(g => relevantAreas.Contains(g.Name));
    var members = danishGroups.AsParallel().WithDegreeOfParallelism(10).SelectMany(g =>
        GetGroup(g.Url, g.Name));
    return members;
}
/// <summary>
/// Guesses a gender from the first space-separated word of <paramref name="firstname"/>.
/// </summary>
/// <param name="firstname">First name(s) of a member; only the part before the first space is examined.</param>
/// <returns><c>true</c> when the name is taken to be male, <c>false</c> when it is taken to be female.</returns>
private static bool GetGender(string firstname)
{
    var overrideMale = new[] { "Bo", "Kim", "Johnny", "Benny", "Tonny" };
    var overrideFemale = new[] { "Mai", "Joan", "Kelly" };
    string candidate = firstname.Split(' ')[0].Trim();

    // The hard-coded overrides are checked before the name lists, male first.
    if (overrideMale.Contains(candidate))
    {
        return true;
    }

    if (overrideFemale.Contains(candidate))
    {
        return false;
    }

    // The male list is consulted before the female list. A name found in neither
    // list counts as male: an expedient measure to get groups with no women.
    return malenames.Contains(candidate) || !femalenames.Contains(candidate);
}
/// <summary>
/// Reads <paramref name="filename"/> through a <c>FileHelperEngine</c> of <c>Name</c>
/// records and exposes the <c>TheName</c> value of each record.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>A deferred projection of the records to their <c>TheName</c> values.</returns>
private static IEnumerable<string> GetListFromCVS(string filename)
{
    var records = new FileHelperEngine<Name>().ReadFile(filename);
    return from record in records
           select record.TheName;
}
/// <summary>
/// Downloads one group page and yields a <c>Member</c> for every table row that
/// carries at least the four expected cells (title, first name, last name, company).
/// </summary>
/// <param name="url">Relative URL of the group page; the group id is parsed from the text following "id=".</param>
/// <param name="area">Name of the group's area. Currently unused; kept so callers do not change.</param>
/// <returns>
/// The members listed on the page, produced lazily. Nothing is yielded when the page
/// has no matching table rows.
/// </returns>
private static IEnumerable<Member> GetGroup(string url, string area)
{
    var doc = Util.LoadDoc(Util.GetHtml("http://iframe.vl.dk/" + url, Encoding.UTF8));
    var rows = doc.DocumentNode.
        SelectNodes("//table[@id='table-1']/tbody/tr");

    // SelectNodes returns null when nothing matches: treat that as a group with no members.
    if (rows == null)
    {
        yield break;
    }

    // The id depends only on the URL, so parse it once rather than once per row.
    int groupId = int.Parse(
        url.Split(new[] { "id=" }, StringSplitOptions.RemoveEmptyEntries)[1],
        System.Globalization.CultureInfo.InvariantCulture);

    foreach (var row in rows)
    {
        var cells = row.SelectNodes("./td");

        // Rows without <td> cells, or with fewer than four of them, cannot describe
        // a member; skip them instead of crashing on the cell lookups below.
        if (cells == null || cells.Count < 4)
        {
            continue;
        }

        string firstname = cells[1].InnerText.Trim();
        yield return new Member
        {
            Title = cells[0].InnerText.Trim(),
            Firstname = firstname,
            Lastname = cells[2].InnerText.Trim(),
            Company = cells[3].InnerText.Trim(),
            Group = groupId,
            ProbablyGender = GetGender(firstname),
        };
    }
}
}
// Record type that GetListFromCVS hands to FileHelperEngine<Name> when reading the
// name list files. The DelimitedRecord attribute is given "," as its delimiter argument.
[DelimitedRecord(",")]
class Name
{
// The value GetListFromCVS takes from each record (presumably one first name per
// line of the file -- confirm against the data files).
public string TheName { get; set; }
}
}