Add generic GSoC import app #76

Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions .ci/build.sh
@@ -4,6 +4,7 @@ set -e -x

mkdir private _site public

scrapy crawl gsoc
if [[ -n "$GCI_TOKEN" ]]; then
  python manage.py fetch_gci_task_data private
  python manage.py cleanse_gci_task_data private _site
14 changes: 14 additions & 0 deletions community/urls.py
@@ -7,6 +7,8 @@
from django.conf import settings
from django.views.generic import TemplateView

from gsoc.views import index as gsoc_index
from gsoc.views import projects as gsoc_projects
from gci.views import index as gci_index
from gci.feeds import LatestTasksFeed as gci_tasks_rss
from activity.scraper import activity_json
@@ -51,6 +53,18 @@ def get_index():
        distill_func=get_index,
        distill_file='gci/index.html',
    ),
    distill_url(
        r'^gsoc/$', gsoc_index,
        name='community-gsoc',
        distill_func=get_index,
        distill_file='gsoc/index.html',
    ),
    distill_url(
        r'^gsoc/projects/$', gsoc_projects,
        name='community-gsoc-projects',
        distill_func=get_index,
        distill_file='gsoc/projects.html',
    ),
    distill_url(
        r'twitter/', twitter_index,
        name='twitter',
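For context: django-distill renders each distill_url entry to a static file, calling the distill_func to enumerate the URL parameters to render. The new GSoC pages take no URL parameters, so they reuse the same no-argument function as the GCI page. A minimal sketch of such a function (the repository's actual get_index may differ):

def get_index():
    # No URL kwargs to enumerate; render this page exactly once.
    return None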
16 changes: 16 additions & 0 deletions gsoc/config.py
@@ -0,0 +1,16 @@
import ruamel.yaml
import os
from django.conf import settings

os.environ['DJANGO_SETTINGS_MODULE'] = 'community.settings'

DATA_DIR = settings.STATIC_ROOT


def load_cache(filename):
    with open(os.path.join(DATA_DIR, filename), 'r') as f:
        return ruamel.yaml.load(f, Loader=ruamel.yaml.Loader)


def get_year():
    # Data is scraped from the archive of the previous GSoC year (2017).
    return 2018 - 1
21 changes: 21 additions & 0 deletions gsoc/data.py
@@ -0,0 +1,21 @@
from .config import load_cache


_org = {}
_projects = {}


def get_org_data():
    global _org
    if not _org:
        _org = load_cache('gsoc_org_info.yaml')

    return _org


def get_projects_data():
    global _projects
    if not _projects:
        _projects = load_cache('gsoc_project_info.yaml')

    return _projects
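For reference, these getters lazily load and memoise the YAML written by the gsoc spider added below (gsocscrape/spiders/gsoc.py). A hypothetical illustration of the shape get_org_data() returns, with integer organisation ids as keys and the spider's item dicts as values; every value here is made up:

_org_example = {
    5693265723264000: {
        'id': '5693265723264000',
        'name': 'example-org',
        'tagline': 'Example tagline',
        'description': '<div class="org__long-description">...</div>',
        'technologies': {'0': 'python', '1': 'c'},
    },
}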
8 changes: 8 additions & 0 deletions gsoc/urls.py
@@ -0,0 +1,8 @@
from django.conf.urls import url

from . import views

urlpatterns = [
    url(r'^$', views.index, name='index'),
    url(r'^projects/$', views.projects, name='projects'),
]
68 changes: 68 additions & 0 deletions gsoc/views.py
@@ -0,0 +1,68 @@
from django.shortcuts import render
from django.http import Http404
import logging

from .data import get_org_data
from .data import get_projects_data
from community.git import get_owner
from gsoc.config import get_year

logger = logging.getLogger(__name__ + '.index')
org_name = get_owner()
year = get_year()


def index(request):
    try:
        org = get_org_data()
    except FileNotFoundError:
        logger.info('GSoC data not available')
        raise Http404
    else:
        for key in org.keys():
            id = org.get(key).get('id')
            name = org.get(key).get('name')
            tagline = org.get(key).get('tagline')
            description = org.get(key).get('description')
            tech = []
            for technology in org.get(key).get('technologies').values():
                tech.append(technology)

        return render(request, 'gsoc.html', {'id': id,
                                             'name': name,
                                             'tagline': tagline,
                                             'description': description,
                                             'tech': tech
                                             })


def projects(request):
    try:
        org = get_org_data()
    except FileNotFoundError:
        logger.info('GSoC data not available')
        raise Http404
    else:
        for key in org.keys():
            name = org.get(key).get('name')
        projects = get_projects_data()
        projects_list = []
        for key in projects.keys():
            mentors = []
            for mentor in projects.get(key).get('mentors').values():
                mentors.append(mentor)
            item = {
                'id': projects.get(key).get('id'),
                'summary': projects.get(key).get('summary'),
                'title': projects.get(key).get('title'),
                'student': projects.get(key).get('student'),
                'code': projects.get(key).get('project_code'),
                'link': projects.get(key).get('project_link'),
                'mentors': mentors
            }
            projects_list.append(item)
        return render(request, 'gsoc_projects.html',
                      {
                          'project_list': projects_list,
                          'org_name': name
                      })
Empty file added gsocscrape/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions gsocscrape/items.py
@@ -0,0 +1,5 @@
import scrapy


class GsocscrapeItem(scrapy.Item):
    pass
27 changes: 27 additions & 0 deletions gsocscrape/middlewares.py
@@ -0,0 +1,27 @@
from scrapy import signals


class GsocscrapeSpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
3 changes: 3 additions & 0 deletions gsocscrape/pipelines.py
@@ -0,0 +1,3 @@
class GsocscrapePipeline(object):
    def process_item(self, item, spider):
        return item
6 changes: 6 additions & 0 deletions gsocscrape/settings.py
@@ -0,0 +1,6 @@
BOT_NAME = 'gsocscrape'

SPIDER_MODULES = ['gsocscrape.spiders']
NEWSPIDER_MODULE = 'gsocscrape.spiders'

ROBOTSTXT_OBEY = True
Empty file added gsocscrape/spiders/__init__.py
Empty file.
132 changes: 132 additions & 0 deletions gsocscrape/spiders/gsoc.py
@@ -0,0 +1,132 @@
import scrapy
import string
import json
import logging

from ruamel.yaml import YAML
import os.path
from collections import OrderedDict
from community.git import get_owner
from gsoc.config import get_year


logger = logging.getLogger(__name__ + '.index')
org_name = get_owner()
year = get_year()
yaml = YAML()


class GsocSpider(scrapy.Spider):
    name = 'gsoc'
    start_urls = [
        'https://summerofcode.withgoogle.com/archive/{}/organizations'
        .format(year),
    ]

    def parse(self, response):
        home_url = 'https://summerofcode.withgoogle.com/'\
                   'archive/{}/organizations/'.format(year)
        selector = "//li[contains(.,'{org_name}')]/a/@href".format(
            org_name=org_name)
        organization_link = response.xpath(selector)

        if organization_link:
            organization_link = organization_link[0].extract().split('/')[4]
        else:
            logger.info('Organisation {} does not exist in GSoC for {}'.format(
                org_name, year
            ))
            return

        follow_link = home_url + organization_link
        yield response.follow(follow_link, self.parse_org)

    def parse_org(self, response):
        project_url = 'https://summerofcode.withgoogle.com/'\
                      'archive/{}/projects/'.format(year)

        technology = {}

        id = response.url.split('/')[-2]
        org_name = response.css('h3.banner__title::text')[0].extract()
        org_tagline = response.css('h4.org__tagline::text')[0].extract()
        org_long_description = response.xpath(
            "//div[@class='org__long-description']")[0].extract()
        org_technologies = response.xpath(".//div[@class='org__meta']/div"
                                          "[contains(.,'Technologies')]/ul/"
                                          'li/text()').extract()

        count = 0
        for tech in org_technologies:
            technology['%s' % str(count)] = tech
            count = count + 1

        item = {
            'id': id,
            'name': org_name,
            'tagline': org_tagline,
            'description': org_long_description,
            'technologies': technology
        }
        org_data = {}
        org_data[int(item['id'])] = item

        with open(os.path.join('_site', 'gsoc_org_info.yaml'), 'w') as f:
            yaml.dump(org_data, f)

        # Overwrite any previous data with empty dataset
        open(os.path.join('_site', 'gsoc_project_info.yaml'), 'w').close()

        for res in response.css('a.archive-project-card__link'):
            link = res.xpath('@href').extract()[0]
            link = link.split('/')[4]
            url_project = project_url + link
            yield response.follow(url_project, self.parse_project)

    def parse_project(self, response):
        mentors = []
        org_url = 'https://summerofcode.withgoogle.com/'\
                  'archive/{}/organizations/'.format(year)
        page = response.url.split('/')[-2]

        project_id = page
        project_title = response.css('h3.banner__title::text')[0].extract()
        project_summary = response.xpath(
            "//div[@class='org__long-description']")[0].extract()
        project_organization_code = response.css(
            'md-card.org__info-card a::attr(href)')[0].extract().split('/')[4]
        project_link = response.url
        project_organization_url = org_url + project_organization_code
        project_organization_name = response.css('md-card.org__info-card '
                                                 'a::text')[3].extract()
        project_code = response.css('md-card.org__info-card '
                                    'a::attr(href)')[1].extract()
        project_student = response.xpath(
            ".//div[@class='org__meta']/div[contains(.,'Student')]/"
            'div/text()')[0].extract()
        project_mentors = response.xpath(
            ".//div[@class='org__meta']/div[contains(.,'Mentors')]"
            '/ul/li/text()').extract()
        ment = {}
        count = 0
        for mentor in project_mentors:
            ment['%s' % str(count)] = mentor
            count = count + 1

        item = {
            'id': project_id,
            'title': project_title,
            'summary': project_summary,
            'student': project_student,
            'mentors': ment,
            'organization_name': project_organization_name,
            'organization_code': project_organization_code,
            'organization_url': project_organization_url,
            'project_link': project_link,
            'project_code': project_code
        }
        project_data = {}
        project_data[int(item['id'])] = item

        with open(os.path.join('_site', 'gsoc_project_info.yaml'), 'a') as f:
            yaml.dump(project_data, f)
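As wired up in .ci/build.sh, this spider runs via `scrapy crawl gsoc` and writes its YAML output into _site/. A minimal sketch of running it programmatically instead, using Scrapy's standard crawler API (it assumes the Django settings module and the _site directory are already in place):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from gsocscrape.spiders.gsoc import GsocSpider

process = CrawlerProcess(get_project_settings())
process.crawl(GsocSpider)
process.start()  # blocks until the crawl finishes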
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ django-distill
django-eventtools
IGitt==0.4.1.dev20180111025558
requests
scrapy
python-dateutil
pillow
ruamel.yaml
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = gsocscrape.settings

[deploy]
#url = http://localhost:6800/
project = gsocscrape
27 changes: 27 additions & 0 deletions templates/gsoc.html
@@ -0,0 +1,27 @@
<html>

<head>
<title> GSoC 2017 Data </title>
</head>

<body>
<p>{{id}}</p>
<hr>
<h3>Name</h3>
<p>{{name}}</p>
<hr>
<h3>Tagline</h3>
<p>{{tagline}}</p>
<hr>
<h3>Description</h3>
<p>{% autoescape off %}{{description}}{% endautoescape %}</p>
<hr>
<h3> Technology </h3>
<ul>
{% for technology in tech %}
<li>{{technology}}</li>
{% endfor %}
</ul>
</body>

</html>