Add generic GSoC import app #76

Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions .ci/build.sh
@@ -4,6 +4,7 @@ set -e -x

mkdir private _site public

scrapy crawl gsoc
if [[ -n "$GCI_TOKEN" ]]; then
  python manage.py fetch_gci_task_data private
  python manage.py cleanse_gci_task_data private _site
14 changes: 14 additions & 0 deletions community/urls.py
@@ -7,6 +7,8 @@
from django.conf import settings
from django.views.generic import TemplateView

from gsoc.views import index as gsoc_index
from gsoc.views import projects as gsoc_projects
from gci.views import index as gci_index
from gci.feeds import LatestTasksFeed as gci_tasks_rss
from activity.scraper import activity_json
@@ -51,6 +53,18 @@ def get_index():
        distill_func=get_index,
        distill_file='gci/index.html',
    ),
    distill_url(
        r'^gsoc/$', gsoc_index,
        name='community-gsoc',
        distill_func=get_index,
        distill_file='gsoc/index.html',
    ),
    distill_url(
        r'^gsoc/projects/$', gsoc_projects,
        name='community-gsoc-projects',
        distill_func=get_index,
        distill_file='gsoc/projects.html',
    ),
    distill_url(
        r'twitter/', twitter_index,
        name='twitter',
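For context: django-distill renders each distill_url entry to a static file, calling the distill_func to enumerate the URL parameters to render. The new GSoC pages take no URL parameters, so they reuse the same no-argument function as the GCI page. A minimal sketch of such a function (the repository's actual get_index may differ):

def get_index():
    # No URL kwargs to enumerate; render this page exactly once.
    return None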
16 changes: 16 additions & 0 deletions gsoc/config.py
@@ -0,0 +1,16 @@
import ruamel.yaml
import os
from django.conf import settings

os.environ['DJANGO_SETTINGS_MODULE'] = 'community.settings'

DATA_DIR = settings.STATIC_ROOT


def load_cache(filename):
    with open(os.path.join(DATA_DIR, filename), 'r') as f:
        return ruamel.yaml.load(f, Loader=ruamel.yaml.Loader)


def get_year():
    # Data is scraped from the archive of the previous GSoC year (2017).
    return 2018 - 1
21 changes: 21 additions & 0 deletions gsoc/data.py
@@ -0,0 +1,21 @@
from .config import load_cache


_org = {}
_projects = {}


def get_org_data():
    global _org
    if not _org:
        _org = load_cache('gsoc_org_info.yaml')

    return _org


def get_projects_data():
    global _projects
    if not _projects:
        _projects = load_cache('gsoc_project_info.yaml')

    return _projects
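For reference, these getters lazily load and memoise the YAML written by the gsoc spider added below (gsocscrape/spiders/gsoc.py). A hypothetical illustration of the shape get_org_data() returns, with integer organisation ids as keys and the spider's item dicts as values; every value here is made up:

_org_example = {
    5693265723264000: {
        'id': '5693265723264000',
        'name': 'example-org',
        'tagline': 'Example tagline',
        'description': '<div class="org__long-description">...</div>',
        'technologies': {'0': 'python', '1': 'c'},
    },
}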
8 changes: 8 additions & 0 deletions gsoc/urls.py
@@ -0,0 +1,8 @@
from django.conf.urls import url

from . import views

urlpatterns = [
    url(r'^$', views.index, name='index'),
    url(r'^projects/$', views.projects, name='projects'),
]
68 changes: 68 additions & 0 deletions gsoc/views.py
@@ -0,0 +1,68 @@
from django.shortcuts import render
from django.http import Http404
import logging

from .data import get_org_data
from .data import get_projects_data
from community.git import get_owner
from gsoc.config import get_year

logger = logging.getLogger(__name__ + '.index')
org_name = get_owner()
year = get_year()


def index(request):
    try:
        org = get_org_data()
    except FileNotFoundError:
        logger.info('GSoC data not available')
        raise Http404
    else:
        for key in org.keys():
            id = org.get(key).get('id')
            name = org.get(key).get('name')
            tagline = org.get(key).get('tagline')
            description = org.get(key).get('description')
            tech = []
            for technology in org.get(key).get('technologies').values():
                tech.append(technology)

        return render(request, 'gsoc.html', {'id': id,
                                             'name': name,
                                             'tagline': tagline,
                                             'description': description,
                                             'tech': tech
                                             })


def projects(request):
    try:
        org = get_org_data()
    except FileNotFoundError:
        logger.info('GSoC data not available')
        raise Http404
    else:
        for key in org.keys():
            name = org.get(key).get('name')
        projects = get_projects_data()
        projects_list = []
        for key in projects.keys():
            mentors = []
            for mentor in projects.get(key).get('mentors').values():
                mentors.append(mentor)
            item = {
                'id': projects.get(key).get('id'),
                'summary': projects.get(key).get('summary'),
                'title': projects.get(key).get('title'),
                'student': projects.get(key).get('student'),
                'code': projects.get(key).get('project_code'),
                'link': projects.get(key).get('project_link'),
                'mentors': mentors
            }
            projects_list.append(item)
        return render(request, 'gsoc_projects.html',
                      {
                          'project_list': projects_list,
                          'org_name': name
                      })
Empty file added gsocscrape/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions gsocscrape/items.py
@@ -0,0 +1,5 @@
import scrapy


class GsocscrapeItem(scrapy.Item):
    pass
27 changes: 27 additions & 0 deletions gsocscrape/middlewares.py
@@ -0,0 +1,27 @@
from scrapy import signals


class GsocscrapeSpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
3 changes: 3 additions & 0 deletions gsocscrape/pipelines.py
@@ -0,0 +1,3 @@
class GsocscrapePipeline(object):
    def process_item(self, item, spider):
        return item
6 changes: 6 additions & 0 deletions gsocscrape/settings.py
@@ -0,0 +1,6 @@
BOT_NAME = 'gsocscrape'

SPIDER_MODULES = ['gsocscrape.spiders']
NEWSPIDER_MODULE = 'gsocscrape.spiders'

ROBOTSTXT_OBEY = True
Empty file added gsocscrape/spiders/__init__.py
Empty file.
132 changes: 132 additions & 0 deletions gsocscrape/spiders/gsoc.py
@@ -0,0 +1,132 @@
import scrapy
import string
import json
import logging

from ruamel.yaml import YAML
import os.path
from collections import OrderedDict
from community.git import get_owner
from gsoc.config import get_year


logger = logging.getLogger(__name__ + '.index')
org_name = get_owner()
year = get_year()
yaml = YAML()


class GsocSpider(scrapy.Spider):
    name = 'gsoc'
    start_urls = [
        'https://summerofcode.withgoogle.com/archive/{}/organizations'
        .format(year),
    ]

    def parse(self, response):
        home_url = 'https://summerofcode.withgoogle.com/'\
                   'archive/{}/organizations/'.format(year)
        selector = "//li[contains(.,'{org_name}')]/a/@href".format(
            org_name=org_name)
        organization_link = response.xpath(selector)

        if organization_link:
            organization_link = organization_link[0].extract().split('/')[4]
        else:
            logger.info('Organisation {} does not exist in GSoC for {}'.format(
                org_name, year
            ))
            return

        follow_link = home_url + organization_link
        yield response.follow(follow_link, self.parse_org)

    def parse_org(self, response):
        project_url = 'https://summerofcode.withgoogle.com/'\
                      'archive/{}/projects/'.format(year)

        technology = {}

        id = response.url.split('/')[-2]
        org_name = response.css('h3.banner__title::text')[0].extract()
        org_tagline = response.css('h4.org__tagline::text')[0].extract()
        org_long_description = response.xpath(
            "//div[@class='org__long-description']")[0].extract()
        org_technologies = response.xpath(".//div[@class='org__meta']/div"
                                          "[contains(.,'Technologies')]/ul/"
                                          'li/text()').extract()

        count = 0
        for tech in org_technologies:
            technology['%s' % str(count)] = tech
            count = count + 1

        item = {
            'id': id,
            'name': org_name,
            'tagline': org_tagline,
            'description': org_long_description,
            'technologies': technology
        }
        org_data = {}
        org_data[int(item['id'])] = item

        with open(os.path.join('_site', 'gsoc_org_info.yaml'), 'w') as f:
            yaml.dump(org_data, f)

        # Overwrite any previous data with empty dataset
        open(os.path.join('_site', 'gsoc_project_info.yaml'), 'w').close()

        for res in response.css('a.archive-project-card__link'):
            link = res.xpath('@href').extract()[0]
            link = link.split('/')[4]
            url_project = project_url + link
            yield response.follow(url_project, self.parse_project)

    def parse_project(self, response):
        mentors = []
        org_url = 'https://summerofcode.withgoogle.com/'\
                  'archive/{}/organizations/'.format(year)
        page = response.url.split('/')[-2]

        project_id = page
        project_title = response.css('h3.banner__title::text')[0].extract()
        project_summary = response.xpath(
            "//div[@class='org__long-description']")[0].extract()
        project_organization_code = response.css(
            'md-card.org__info-card a::attr(href)')[0].extract().split('/')[4]
        project_link = response.url
        project_organization_url = org_url + project_organization_code
        project_organization_name = response.css('md-card.org__info-card '
                                                 'a::text')[3].extract()
        project_code = response.css('md-card.org__info-card '
                                    'a::attr(href)')[1].extract()
        project_student = response.xpath(
            ".//div[@class='org__meta']/div[contains(.,'Student')]/"
            'div/text()')[0].extract()
        project_mentors = response.xpath(
            ".//div[@class='org__meta']/div[contains(.,'Mentors')]"
            '/ul/li/text()').extract()
        ment = {}
        count = 0
        for mentor in project_mentors:
            ment['%s' % str(count)] = mentor
            count = count + 1

        item = {
            'id': project_id,
            'title': project_title,
            'summary': project_summary,
            'student': project_student,
            'mentors': ment,
            'organization_name': project_organization_name,
            'organization_code': project_organization_code,
            'organization_url': project_organization_url,
            'project_link': project_link,
            'project_code': project_code
        }
        project_data = {}
        project_data[int(item['id'])] = item

        with open(os.path.join('_site', 'gsoc_project_info.yaml'), 'a') as f:
            yaml.dump(project_data, f)
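As wired up in .ci/build.sh, this spider runs via `scrapy crawl gsoc` and writes its YAML output into _site/. A minimal sketch of running it programmatically instead, using Scrapy's standard crawler API (it assumes the Django settings module and the _site directory are already in place):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from gsocscrape.spiders.gsoc import GsocSpider

process = CrawlerProcess(get_project_settings())
process.crawl(GsocSpider)
process.start()  # blocks until the crawl finishes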
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,6 +4,7 @@ django-distill
django-eventtools
IGitt==0.4.1.dev20180111025558
requests
scrapy
python-dateutil
pillow
ruamel.yaml
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = gsocscrape.settings

[deploy]
#url = http://localhost:6800/
project = gsocscrape
27 changes: 27 additions & 0 deletions templates/gsoc.html
@@ -0,0 +1,27 @@
<html>

<head>
<title> GSoC 2017 Data </title>
</head>

<body>
<p>{{id}}</p>
<hr>
<h3>Name</h3>
<p>{{name}}</p>
<hr>
<h3>Tagline</h3>
<p>{{tagline}}</p>
<hr>
<h3>Description</h3>
<p>{% autoescape off %}{{description}}{% endautoescape %}</p>
<hr>
<h3> Technology </h3>
<ul>
{% for technology in tech %}
<li>{{technology}}</li>
{% endfor %}
</ul>
</body>

</html>