Skip to content

Commit

Permalink
feat(gulli): support gulli as a finder to create a new podcast
Browse files Browse the repository at this point in the history
We use the replays page of gulli to find the information about the show to be registered.
The OpenGraph meta information is used to get the description. The title is fetched from the breadcrumb and the cover from the show list behind the replay.
  • Loading branch information
davinkevin committed Oct 12, 2016
1 parent 76375b8 commit 93cf15f
Show file tree
Hide file tree
Showing 4 changed files with 232 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package lan.dk.podcastserver.manager.worker.downloader;

import javaslang.collection.List;
import lan.dk.podcastserver.entity.Item;
import lan.dk.podcastserver.repository.ItemRepository;
import lan.dk.podcastserver.repository.PodcastRepository;
import lan.dk.podcastserver.service.HtmlService;
import lan.dk.podcastserver.service.JsonService;
import lan.dk.podcastserver.service.MimeTypeService;
import lan.dk.podcastserver.service.UrlService;
import lan.dk.podcastserver.service.factory.WGetFactory;
import lan.dk.podcastserver.service.properties.PodcastServerParameters;
import org.apache.commons.lang3.StringUtils;
import org.springframework.messaging.simp.SimpMessagingTemplate;

import java.util.regex.Pattern;

import static java.util.Objects.nonNull;

/**
 * {@link HTTPDownloader} specialisation for Gulli items.
 *
 * Created by kevin on 12/10/2016 for Podcast Server
 */
public class GulliDownloader extends HTTPDownloader {

    // NOTE(review): neither pattern is referenced inside this class yet; they look
    // intended for extracting the playlist from the page scripts — confirm.
    static final Pattern NUMBER_IN_PLAYLIST_EXTRACTOR = Pattern.compile("playlistItem\\(([^\\)]*)\\);");
    static final Pattern PLAYLIST_EXTRACTOR = Pattern.compile("playlist:\\s*(.*?(?=events:))", Pattern.DOTALL);

    private final HtmlService htmlService;
    // NOTE(review): injected but not used by any method of this class so far.
    private final JsonService jsonService;

    /** Value computed by a previous call to {@link #getItemUrl(Item)}; {@code null} until then. */
    private String url = null;

    /**
     * Builds the downloader: the first seven collaborators are handed to the parent
     * constructor, the HTML and JSON services are kept on this instance.
     */
    public GulliDownloader(ItemRepository itemRepository, PodcastRepository podcastRepository, PodcastServerParameters podcastServerParameters, SimpMessagingTemplate template, MimeTypeService mimeTypeService, UrlService urlService, WGetFactory wGetFactory, HtmlService htmlService, JsonService jsonService) {
        super(itemRepository, podcastRepository, podcastServerParameters, template, mimeTypeService, urlService, wGetFactory);
        this.htmlService = htmlService;
        this.jsonService = jsonService;
    }

    /**
     * Resolves the url to use for the given item.
     *
     * <ul>
     *   <li>When this downloader already holds an item different from {@code item},
     *       the url of {@code item} is returned as is.</li>
     *   <li>When a url has been stored by a previous call, that stored value is returned.</li>
     *   <li>Otherwise the item page is fetched, the first script containing "playlist"
     *       is looked up, the result is stored and the parent implementation answers.</li>
     * </ul>
     *
     * NOTE(review): the stored value is the placeholder "Foo" (or the empty string when
     * no script matches), and the first call returns the parent's answer rather than the
     * stored value — this looks like work in progress; confirm the intended behaviour.
     *
     * @param item the item whose url is requested
     * @return the url chosen according to the rules above
     */
    @Override
    public String getItemUrl(Item item) {
        boolean anotherItemIsHandled = nonNull(this.item) && !this.item.equals(item);
        if (anotherItemIsHandled) {
            return item.getUrl();
        }

        if (nonNull(url)) {
            return url;
        }

        url = htmlService.get(item.getUrl())
                .map(page -> page.select("script"))
                .flatMap(scripts -> List.ofAll(scripts).find(script -> script.html().contains("playlist")))
                .map(script -> "Foo")
                .getOrElse(StringUtils.EMPTY);

        return super.getItemUrl(item);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package lan.dk.podcastserver.manager.worker.finder;

import javaslang.control.Option;
import lan.dk.podcastserver.entity.Cover;
import lan.dk.podcastserver.entity.Podcast;
import lan.dk.podcastserver.service.HtmlService;
import lan.dk.podcastserver.service.ImageService;
import lombok.AllArgsConstructor;
import org.hibernate.validator.constraints.NotEmpty;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;

/**
 * {@link Finder} building a {@link Podcast} from the HTML page found at a Gulli replay url.
 *
 * Created by kevin on 04/10/2016 for Podcast Server
 */
@Service("GulliFinder")
@AllArgsConstructor
public class GulliFinder implements Finder {

    /** CSS selector template; {@code %s} is replaced by the page url read from the og:url meta tag. */
    private static final String COVER_SELECTOR = "div.program_gullireplay a[href=%s] img";

    final HtmlService htmlService;
    final ImageService imageService;

    /**
     * Fetches the page at {@code url} and converts it to a podcast.
     *
     * @param url address of the page to analyse
     * @return the podcast extracted from the page, or {@link Podcast#DEFAULT_PODCAST}
     *         when the HTML service yields no document
     */
    @Override
    public Podcast find(String url) {
        return htmlService.get(url)
                .map(this::htmlToPodcast)
                .getOrElse(Podcast.DEFAULT_PODCAST);
    }

    /** Maps the breadcrumb, the OpenGraph meta tags and the cover of the document to a podcast of type "Gulli". */
    private Podcast htmlToPodcast(Document d) {
        return Podcast.builder()
                .title(titleOf(d))
                .cover(coverOf(d))
                .description(d.select("meta[property=og:description]").attr("content"))
                .url(d.select("meta[property=og:url]").attr("content"))
                .type("Gulli")
                .build();
    }

    /**
     * Reads the title from the active breadcrumb entry.
     * The selection may be empty, in which case first() is null: fall back to an empty
     * title (like the attr-based fields do) instead of failing with a NullPointerException.
     */
    private String titleOf(Document d) {
        return Option.of(d.select("ol.breadcrumb li.active").first())
                .map(e -> e.text())
                .getOrElse("");
    }

    /** Looks for the image nested in the link pointing to the page itself; defaults to {@link Cover#DEFAULT_COVER}. */
    private Cover coverOf(Document d) {
        String pageUrl = d.select("meta[property=og:url]").attr("content");

        return Option.of(d.select(String.format(COVER_SELECTOR, pageUrl)).first())
                .map(e -> e.attr("src"))
                .map(imageService::getCoverFromURL)
                .getOrElse(Cover.DEFAULT_COVER);
    }

    /**
     * Scores how well this finder suits the url.
     *
     * @param url the url to evaluate
     * @return 1 when the url contains "replay.gulli.fr", {@link Integer#MAX_VALUE} otherwise
     *         (including for a {@code null} url)
     */
    @Override
    public Integer compatibility(@NotEmpty String url) {
        // The annotation alone does not stop a null from reaching this method.
        return (url != null && url.contains("replay.gulli.fr")) ? 1 : Integer.MAX_VALUE;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package lan.dk.podcastserver.manager.worker.updater;

import com.google.common.collect.Sets;
import javaslang.collection.HashSet;
import javaslang.collection.List;
import javaslang.control.Option;
import lan.dk.podcastserver.entity.Cover;
import lan.dk.podcastserver.entity.Item;
import lan.dk.podcastserver.entity.Podcast;
import lan.dk.podcastserver.service.HtmlService;
import lan.dk.podcastserver.service.ImageService;
import lan.dk.podcastserver.service.SignatureService;
import lan.dk.podcastserver.service.properties.PodcastServerParameters;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import javax.validation.Validator;
import java.time.ZonedDateTime;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static javaslang.collection.HashSet.collector;

/**
 * Updater reading the episodes of a podcast from its Gulli replay page.
 *
 * Created by kevin on 05/10/2016 for Podcast Server
 */
@Slf4j
@Component("GulliUpdater")
public class GulliUpdater extends AbstractUpdater {

    /** Captures the src attribute of the iframe written through an {@code .html(...)} call in a script. */
    private static final Pattern FRAME_EXTRACTOR = Pattern.compile(".*\\.html\\(.<iframe.* src=\"([^\"]*)\".*");

    private final HtmlService htmlService;
    private final ImageService imageService;

    /**
     * Builds the updater: the first three collaborators are handed to the parent
     * constructor, the HTML and image services are kept on this instance.
     */
    public GulliUpdater(PodcastServerParameters podcastServerParameters, SignatureService signatureService, Validator validator, HtmlService htmlService, ImageService imageService) {
        super(podcastServerParameters, signatureService, validator);
        this.htmlService = htmlService;
        this.imageService = imageService;
    }

    /**
     * Lists the items found on the page of the podcast.
     *
     * @param podcast the podcast whose url is fetched
     * @return one item per "div.all-videos ul li.col-md-3" element (duplicates collapse in
     *         the set), or an empty set when the HTML service yields no document
     */
    @Override
    public Set<Item> getItems(Podcast podcast) {
        return htmlService.get(podcast.getUrl())
                .map(d -> d.select("div.all-videos ul li.col-md-3"))
                .map(this::asSet)
                .map(HashSet::toJavaSet)
                .getOrElse(Sets.newHashSet());
    }

    /** Converts every list element to an item and gathers them in a Javaslang set. */
    private HashSet<Item> asSet(Elements elements) {
        return elements.stream()
                .map(this::findDetailsInFromPage)
                .collect(collector());
    }

    /**
     * Follows the first link of the list element, reads the ".bloc_streaming" block of the
     * target page and builds the item from it; the cover comes from the list element itself.
     *
     * @return the item, or {@link Item#DEFAULT_ITEM} when any step yields nothing
     */
    private Item findDetailsInFromPage(Element e) {
        return Option.of(e.select("a").first())
                .map(elem -> elem.attr("href"))
                .flatMap(htmlService::get)
                .flatMap(Option::of)
                // first() is null when the page has no streaming block: wrap it so the
                // chain falls through to the default item instead of throwing.
                .flatMap(d -> Option.of(d.select(".bloc_streaming").first()))
                .flatMap(this::htmlToItem)
                .map(i -> i.setCover(getCover(e)))
                .getOrElse(Item.DEFAULT_ITEM);
    }

    /**
     * Builds an item from a streaming block: the url is the iframe source found in the
     * first script mentioning "iframe", the publication date is the current time.
     *
     * @return the item, or an empty option when no script matches {@link #FRAME_EXTRACTOR}
     */
    private Option<Item> htmlToItem(Element block) {
        return List.ofAll(block.select("script"))
                .find(e -> e.html().contains("iframe"))
                .map(Element::html)
                .map(FRAME_EXTRACTOR::matcher)
                .filter(Matcher::find)
                .map(m -> m.group(1))
                .map(url -> Item.builder()
                        .title(block.select(".episode_title").text())
                        .description(block.select(".description").text())
                        .url(url)
                        .pubDate(ZonedDateTime.now())
                        .build());
    }

    /** Resolves the cover from the src of the images of the block; defaults to {@link Cover#DEFAULT_COVER}. */
    private Cover getCover(Element block) {
        return Option.of(block)
                .map(e -> e.select("img").attr("src"))
                .map(imageService::getCoverFromURL)
                .getOrElse(Cover.DEFAULT_COVER);
    }

    /**
     * Computes the signature of the podcast from the HTML of its video list.
     *
     * @param podcast the podcast whose url is fetched
     * @return the MD5 signature of the first "div.all-videos ul" element, or an empty
     *         string when the page or the list is missing
     */
    @Override
    public String signatureOf(Podcast podcast) {
        return htmlService.get(podcast.getUrl())
                // first() is null when the list is absent: wrap it to reach the empty
                // signature instead of a NullPointerException.
                .flatMap(d -> Option.of(d.select("div.all-videos ul").first()))
                .map(Element::html)
                .map(signatureService::generateMD5Signature)
                .getOrElse(StringUtils.EMPTY);
    }

    /** @return the type descriptor of this updater, with "Gulli" for both of its values */
    @Override
    public Type type() {
        return new Type("Gulli", "Gulli");
    }

    /**
     * Scores how well this updater suits the url.
     *
     * @param url the url to evaluate
     * @return 1 when the url contains "replay.gulli.fr", {@link Integer#MAX_VALUE} otherwise
     *         (including for a {@code null} url)
     */
    @Override
    public Integer compatibility(String url) {
        return (url != null && url.contains("replay.gulli.fr")) ? 1 : Integer.MAX_VALUE;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import static org.junit.Assert.*;

/**
* Placeholder test class for GulliDownloader: it declares no test method yet.
*
* Created by kevin on 12/10/2016 for Podcast Server
*/
public class GulliDownloaderTest {

}

0 comments on commit 93cf15f

Please sign in to comment.