In [20]:
import kotlinx.serialization.Serializable

/**
 * Page metadata as deserialized from the JSON returned by the "bare" page endpoint
 * queried in `fetchPageMetadata`.
 *
 * No `@SerialName` overrides are used, so every property name (including the
 * snake_case ones) is also the JSON key it is read from.
 *
 * @property id numeric identifier of the page.
 * @property key page key as reported by the API (e.g. `Earth`).
 * @property title page title as reported by the API.
 * @property latest identifier and timestamp of the latest revision.
 * @property content_model content model reported by the API (e.g. `wikitext`).
 * @property license license descriptor of the page content.
 * @property html_url URL from which the rendered HTML of the page can be fetched.
 */
@Serializable
data class PageResponse(
    val id: Int,
    val key: String,
    val title: String,
    val latest: LatestRevision,
    val content_model: String,
    val license: License,
    val html_url: String,
)

/**
 * Revision descriptor nested under the `latest` field of [PageResponse].
 *
 * @property id numeric identifier of the revision.
 * @property timestamp revision timestamp, kept as the raw string sent by the API
 *   (e.g. `2024-12-23T00:35:03Z`); it is not parsed into a date type here.
 */
@Serializable
data class LatestRevision(
    val id: Long,
    val timestamp: String,
)

/**
 * License descriptor nested under the `license` field of [PageResponse].
 *
 * @property url link to the license text.
 * @property title human-readable license name.
 */
@Serializable
data class License(
    val url: String,
    val title: String,
)


In [21]:
import io.ktor.client.*
import io.ktor.client.engine.okhttp.OkHttp
import io.ktor.client.plugins.contentnegotiation.*
import io.ktor.serialization.kotlinx.json.*
import kotlinx.serialization.json.Json

/**
 * Shared Ktor HTTP client (OkHttp engine) used by every fetch function in this notebook.
 *
 * JSON bodies are decoded through kotlinx.serialization with a lenient configuration
 * that ignores keys not declared on the target data classes.
 */
val client = HttpClient(OkHttp) {
    // Fail fast on non-2xx responses. Without this, an error page would be returned to
    // callers as if it were valid content, or surface later as a confusing
    // deserialization failure instead of an HTTP error.
    expectSuccess = true

    install(ContentNegotiation) {
        json(Json {
            prettyPrint = true
            isLenient = true
            // The API returns more fields than the data classes declare.
            ignoreUnknownKeys = true
        })
    }
}

In [23]:
import io.ktor.client.call.*
import io.ktor.client.request.*

/**
 * Fetches the metadata of a wiki page from the Wikimedia Core REST API "bare" endpoint.
 *
 * @param project wiki project segment of the URL (e.g. `wikipedia`); inserted as-is.
 * @param language language segment of the URL (e.g. `en`); inserted as-is.
 * @param title raw (not yet percent-encoded) page title. It is encoded here as a single
 *   path segment, so titles containing spaces, `/`, `?`, `#` or `%` address the intended page.
 * @return the response body deserialized into a [PageResponse].
 */
suspend fun fetchPageMetadata(project: String, language: String, title: String): PageResponse {
    // URLEncoder produces form encoding, where a space becomes '+'. Inside a URL path a
    // '+' is a literal plus sign, so spaces are rewritten to %20. A literal '+' in the
    // title has already become %2B at this point and is therefore not affected.
    val encodedTitle = java.net.URLEncoder.encode(title, "UTF-8").replace("+", "%20")
    val url = "https://api.wikimedia.org/core/v1/$project/$language/page/$encodedTitle/bare"
    return client.get(url).body()
}

In [24]:
import io.ktor.client.statement.*

/**
 * Issues a GET request to [htmlUrl] with the shared [client] and returns the
 * response body as text.
 *
 * @param htmlUrl absolute URL to fetch; passed to the client unchanged.
 * @return the response body decoded as a string.
 */
suspend fun fetchHtmlContent(htmlUrl: String): String =
    client.get(htmlUrl).bodyAsText()

In [25]:
import org.jsoup.Jsoup

/**
 * Parses [html] with Jsoup and returns the text of the document's `<body>` element.
 *
 * @param html HTML markup to parse.
 * @return the body text as produced by Jsoup's `text()`, with the markup removed.
 */
fun extractTextFromHtml(html: String): String =
    Jsoup.parse(html).body().text()

In [26]:
import kotlinx.coroutines.runBlocking

/**
 * Demo entry point: fetches the metadata of the English Wikipedia page "Earth",
 * downloads its rendered HTML and prints the plain text extracted from it.
 *
 * Any exception raised along the way is reported on standard output by message only.
 * The shared [client] is closed when the run finishes, whether it succeeded or not.
 */
fun main() = runBlocking {
    try {
        // 1. Page metadata for the hard-coded demo page.
        val metadata = fetchPageMetadata(
            project = "wikipedia",
            language = "en",
            title = "Earth",
        )
        println("Page Metadata: $metadata")

        // 2. Rendered HTML, from the URL advertised in the metadata.
        val rawHtml = fetchHtmlContent(metadata.html_url)
        println("HTML Content Fetched Successfully.")

        // 3. Plain text extracted from the HTML.
        val plainText = extractTextFromHtml(rawHtml)
        println("Extracted Text:\n$plainText")
    } catch (e: Exception) {
        println("An error occurred: ${e.message}")
    } finally {
        // Closes the shared client on both the success and the failure path.
        client.close()
    }
}


In [27]:
// Run the demo pipeline (metadata -> HTML -> plain text) for the page hard-coded in main().
main()

Page Metadata: PageResponse(id=9228, key=Earth, title=Earth, latest=LatestRevision(id=1264684651, timestamp=2024-12-23T00:35:03Z), content_model=wikitext, license=License(url=https://creativecommons.org/licenses/by-sa/4.0/deed.en, title=Creative Commons Attribution-Share Alike 4.0), html_url=https://en.wikipedia.org/w/rest.php/v1/page/Earth/html)
HTML Content Fetched Successfully.
Extracted Text:
Third planet from the Sun "Planet Earth" redirects here. For other uses, see Earth (disambiguation) and Planet Earth (disambiguation). Earth The Blue Marble, Apollo 17, December 1972 Designations Alternative names The world, the globe, Sol III, Terra, Tellus, Gaia, Mother Earth Adjectives Earthly, terrestrial, terran, tellurian Symbol 🜨 and ♁ Orbital characteristics Epoch J2000[n 1] Aphelion 152097597 km Perihelion 147098450 km[n 2] Semi-major axis 149598023 km[1] Eccentricity 0.0167086[1] Orbital period (sidereal) 365.256363004 d[2] (1.00001742096 aj) Average orbital speed 29.7827 km/s[3] M