# Corpus Analysis Using Kotlin
-------------

### Import Libraries

In [32]:
import java.awt.Color
import java.awt.Font
import java.awt.Graphics2D
import java.awt.Rectangle
import java.awt.image.BufferedImage
import java.io.File
import javax.imageio.ImageIO
import kotlin.random.Random

### Function to load dates from CSV file

In [23]:

/**
 * Reads the third column (index 2, `date_created`) of every data row in a CSV file.
 *
 * Lines starting with `tweet_id,` are treated as the header and skipped. Rows with
 * fewer than three columns are ignored. Commas inside double-quoted fields do not
 * split a column, and a field that is wrapped in double quotes is returned without
 * the wrapping quotes (doubled quotes inside it are collapsed to a single quote).
 *
 * @param filePath path of the CSV file to read.
 * @return the trimmed date values in file order.
 */
fun loadDatesFromCsv(filePath: String): List<String> {
    // Compiled once instead of once per line: splits on commas that are followed by
    // an even number of quotes, i.e. commas that are not inside a quoted field.
    val columnSeparator = ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)".toRegex()

    return File(filePath).useLines { lines ->
        lines
            .filterNot { it.startsWith("tweet_id,") }
            .mapNotNull { line -> line.split(columnSeparator).getOrNull(2) }
            .map { rawField ->
                val field = rawField.trim()
                val isQuoted = field.length >= 2 && field.first() == '"' && field.last() == '"'
                if (isQuoted) field.substring(1, field.length - 1).replace("\"\"", "\"") else field
            }
            .toList()
    }
}

### Function to load text from the CSV file

In [2]:
/**
 * Reads the fourth column (index 3, the tweet text) of every data row in a CSV file.
 *
 * Lines starting with `tweet_id,` are treated as the header and skipped. Rows with
 * fewer than four columns are ignored. Commas inside double-quoted fields do not
 * split a column, and a field that is wrapped in double quotes is returned without
 * the wrapping quotes (doubled quotes inside it are collapsed to a single quote),
 * so the quoting characters do not end up glued to the first and last word.
 *
 * @param filePath path of the CSV file to read.
 * @return the trimmed, lowercased texts in file order.
 */
fun loadTextFromCsv(filePath: String): List<String> {
    // Compiled once instead of once per line: splits on commas that are followed by
    // an even number of quotes, i.e. commas that are not inside a quoted field.
    val columnSeparator = ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)".toRegex()

    return File(filePath).useLines { lines ->
        lines
            .filterNot { it.startsWith("tweet_id,") }
            .mapNotNull { line -> line.split(columnSeparator).getOrNull(3) }
            .map { rawField ->
                val field = rawField.trim()
                val isQuoted = field.length >= 2 && field.first() == '"' && field.last() == '"'
                val text = if (isQuoted) field.substring(1, field.length - 1).replace("\"\"", "\"") else field
                text.lowercase()
            }
            .toList()
    }
}

### Function to count the total number of words

In [3]:
/**
 * Counts the whitespace-separated words across all texts.
 *
 * Blank tokens are not counted, so an empty text contributes zero words and
 * leading or trailing whitespace does not inflate the total. This matches the
 * tokenisation used by the vocabulary and word-frequency functions.
 *
 * @param corpus the texts to count words in.
 * @return the total number of non-blank tokens.
 */
fun wordCount(corpus: List<String>): Int {
    val whitespace = "\\s+".toRegex()
    return corpus.sumOf { text -> text.split(whitespace).count { it.isNotBlank() } }
}

### Function to count the unique words (vocabulary size)

In [4]:
/**
 * Counts the distinct whitespace-separated words across all texts.
 *
 * Blank tokens are ignored; tokens are compared exactly as they appear.
 *
 * @param corpus the texts to inspect.
 * @return the number of unique non-blank tokens.
 */
fun vocabularySize(corpus: List<String>): Int {
    val separator = Regex("\\s+")
    val vocabulary = mutableSetOf<String>()
    for (text in corpus) {
        for (token in text.split(separator)) {
            if (token.isNotBlank()) {
                vocabulary.add(token)
            }
        }
    }
    return vocabulary.size
}

### Function to count how many times a word appears in the texts

In [5]:
/**
 * Counts how often each whitespace-separated word occurs across all texts.
 *
 * Blank tokens are ignored. The returned map iterates from the most frequent
 * word to the least frequent; words with equal counts keep the order in which
 * they were first seen.
 *
 * @param corpus the texts to tokenise.
 * @return word-to-count map ordered by descending count.
 */
fun wordFrequency(corpus: List<String>): Map<String, Int> {
    val separator = Regex("\\s+")

    // Insertion-ordered tally so that ties keep first-seen order after the stable sort.
    val tally = LinkedHashMap<String, Int>()
    for (text in corpus) {
        for (token in text.split(separator)) {
            if (token.isBlank()) continue
            tally[token] = (tally[token] ?: 0) + 1
        }
    }

    return tally.entries
        .sortedByDescending { it.value }
        .associate { it.key to it.value }
}

### Function to count how many times a character was used in the texts

In [6]:
/**
 * Counts how often each character occurs across all texts, whitespace included.
 *
 * Counting is per Kotlin [Char] (one UTF-16 code unit), so a symbol stored as a
 * surrogate pair is tallied as two separate entries.
 *
 * The returned map iterates from the most frequent character to the least
 * frequent; characters with equal counts keep first-seen order.
 *
 * @param corpus the texts to inspect.
 * @return character-to-count map ordered by descending count.
 */
fun characterFrequency(corpus: List<String>): Map<Char, Int> {
    // Insertion-ordered tally so that ties keep first-seen order after the stable sort.
    val tally = LinkedHashMap<Char, Int>()
    for (text in corpus) {
        for (symbol in text) {
            tally[symbol] = (tally[symbol] ?: 0) + 1
        }
    }

    return tally.entries
        .sortedByDescending { it.value }
        .associate { it.key to it.value }
}

### Function to get the frequency of stop words in the texts

In [7]:
/**
 * Keeps only the frequency entries whose word is one of the given stop words.
 *
 * @param wordFrequencies word-to-count map to filter.
 * @param stopWords the words to keep.
 * @return the matching entries, in the iteration order of [wordFrequencies].
 */
fun identifyStopWords(wordFrequencies: Map<String, Int>, stopWords: Set<String>): Map<String, Int> =
    wordFrequencies.filterKeys { it in stopWords }

### Function that displays the top 20 most used words

In [8]:
/**
 * Prints a heading followed by the first [topN] entries of [wordFrequencies].
 *
 * Entries are printed in the map's own iteration order; no sorting is done here,
 * so the caller is expected to pass a map already ordered by frequency.
 *
 * @param wordFrequencies word-to-count map to print from.
 * @param topN maximum number of entries to print (default 20).
 */
fun showTopFrequentWords(wordFrequencies: Map<String, Int>, topN: Int = 20) {
    println("Top $topN Frequent Words:")
    for (entry in wordFrequencies.entries.take(topN)) {
        println("${entry.key}: ${entry.value}")
    }
}

### Function that displays the top 10 most used characters

In [9]:
/**
 * Prints a heading followed by the first [topN] entries of [charFrequencies].
 *
 * Entries are printed in the map's own iteration order; no sorting is done here,
 * so the caller is expected to pass a map already ordered by frequency.
 *
 * @param charFrequencies character-to-count map to print from.
 * @param topN maximum number of entries to print (default 10).
 */
fun showTopFrequentCharacters(charFrequencies: Map<Char, Int>, topN: Int = 10) {
    println("Top $topN Frequent Characters:")
    for (entry in charFrequencies.entries.take(topN)) {
        println("${entry.key}: ${entry.value}")
    }
}

### Function that generates a word cloud using java.awt classes and saves it as a PNG file

In [39]:
/**
 * Draws the 20 most frequent words onto an 800x600 white canvas and saves it as a PNG.
 *
 * The font size grows with the frequency (`10 + frequency * 5`, capped at 60). The
 * most frequent word is centred; every other word is placed at a random position
 * that does not overlap an already placed word. A word is skipped (with a message
 * on standard output) when it does not fit on the canvas or when no free position
 * is found within 100 random attempts, so skipped words are never drawn on top of
 * other words. Each drawn word gets a random colour.
 *
 * @param wordFrequencies word-to-count map; only the 20 highest counts are used.
 * @param outputFilePath path of the PNG file to write.
 * @throws java.io.IOException if the image cannot be written.
 */
fun generateWordCloud(wordFrequencies: Map<String, Int>, outputFilePath: String) {
    val width = 800
    val height = 600
    val maxAttempts = 100

    val image = BufferedImage(width, height, BufferedImage.TYPE_INT_RGB)
    val graphics = image.createGraphics()

    try {
        graphics.color = Color.WHITE
        graphics.fillRect(0, 0, width, height)

        val placedWords = mutableListOf<Rectangle>()
        val sortedWords = wordFrequencies.entries.sortedByDescending { it.value }.take(20)

        for ((index, entry) in sortedWords.withIndex()) {
            val (word, frequency) = entry
            val fontSize = (10 + frequency * 5).coerceAtMost(60) // Clamp font size to a maximum
            graphics.font = Font("Arial", Font.BOLD, fontSize)

            val metrics = graphics.fontMetrics
            val wordWidth = metrics.stringWidth(word)
            val wordHeight = metrics.height

            // Bounding box of the word; its bottom edge is the y passed to drawString.
            val bounds: Rectangle? = if (index == 0) {
                // Place the largest word at the center
                Rectangle(
                    width / 2 - wordWidth / 2,
                    height / 2 + wordHeight / 2 - wordHeight,
                    wordWidth,
                    wordHeight
                )
            } else {
                val xRange = 0 until (width - wordWidth)
                val yRange = wordHeight until (height - wordHeight)
                if (xRange.isEmpty() || yRange.isEmpty()) {
                    // random() on an empty range would throw, so skip words larger than the canvas.
                    println("Failed to place word: $word (does not fit on the ${width}x$height canvas).")
                    continue
                }
                generateSequence {
                    Rectangle(xRange.random(), yRange.random() - wordHeight, wordWidth, wordHeight)
                }
                    .take(maxAttempts)
                    .firstOrNull { candidate -> placedWords.none { it.intersects(candidate) } }
            }

            if (bounds == null) {
                // Do not draw the word: any remaining position would overlap another word.
                println("Failed to place word: $word after $maxAttempts attempts.")
                continue
            }
            placedWords.add(bounds)

            // Set random color
            graphics.color = Color(
                (50..255).random(),
                (50..255).random(),
                (50..255).random()
            )

            // Draw the word
            graphics.drawString(word, bounds.x, bounds.y + wordHeight)
        }
    } finally {
        // Release the graphics context even if drawing fails.
        graphics.dispose()
    }

    // ImageIO.write reports a missing writer through its return value, not an exception.
    if (!ImageIO.write(image, "png", File(outputFilePath))) {
        throw java.io.IOException("No PNG writer available to save $outputFilePath")
    }

    println("Word cloud saved to $outputFilePath")
}


### Load CSV

In [10]:
// Path of the CSV file analysed by the following cells.
val csvFilePath = "fake_tweets.csv"

// Load the lowercased tweet texts and report when nothing could be read.
val corpus = loadTextFromCsv(csvFilePath).also { loadedTexts ->
    if (loadedTexts.isEmpty()) println("No data loaded from CSV. Exiting program.")
}

    

### Load dates from CSV

In [12]:
// Load the creation dates from the same CSV file as the texts.
val dates = loadDatesFromCsv(csvFilePath)

// Report success only when both the dates and the texts were actually loaded.
if (dates.isEmpty() || corpus.isEmpty()) {
    println("No data loaded from CSV. Exiting program.")
} else {
    println("Dates and text data loaded successfully.")
    println("Total dates: ${dates.size}, Total text entries: ${corpus.size}")
}


Dates and text data loaded successfully.
Total dates: 100, Total text entries: 100


### Show descriptive statistics

In [34]:
// Word and character tallies for the loaded corpus.
val wordFrequencies = wordFrequency(corpus)
val charFrequencies = characterFrequency(corpus)

// Copies ordered by descending count, used by the printing cells below.
val wordsSortedByFrequency = wordFrequencies.toList()
    .sortedByDescending { it.second }
    .toMap()

val charsSortedByFrequency = charFrequencies.toList()
    .sortedByDescending { it.second }
    .toMap()

// Corpus-level totals.
val totalWordCount = wordCount(corpus)
val vocabSize = vocabularySize(corpus)

println("\nDescriptive Statistics:")
println("Total Word Count: $totalWordCount")
println("Vocabulary Size: $vocabSize")
println("\nWord Frequency:")
for ((word, count) in wordsSortedByFrequency) {
    println("$word: $count")
}


Descriptive Statistics:
Total Word Count: 720
Vocabulary Size: 530

Word Frequency:
#travel: 24
#news: 20
#food: 19
#trending: 14
#funny: 11
😱: 7
citizen: 3
single: 3
unit: 3
itself: 3
serve: 3
of: 3
forget: 3
😭: 3
soldier: 3
throughout: 3
bill: 3
deep: 3
measure.: 3
instead: 3
poor: 3
🤯: 3
church: 3
deal: 3
budget: 3
through: 3
political: 2
show: 2
pretty: 2
race.: 2
assume: 2
book: 2
but: 2
minute: 2
establish: 2
ok: 2
allow: 2
👍: 2
southern: 2
one: 2
or: 2
people: 2
ago: 2
newspaper: 2
them: 2
front: 2
yeah: 2
power: 2
agree: 2
quickly: 2
activity: 2
purpose: 2
my: 2
heavy: 2
news: 2
side: 2
well.: 2
food: 2
never: 2
remain: 2
media.: 2
left: 2
much: 2
😍: 2
view: 2
teacher: 2
south: 2
continue: 2
increase: 2
happen: 2
wall: 2
education: 2
several: 2
future.: 2
❤️: 2
recent: 2
to: 2
what: 2
reveal: 2
center: 2
🔥: 2
other: 2
production: 2
half: 2
us: 2
themselves: 2
cost: 2
common: 1
any.: 1
camera: 1
ready: 1
@user2471465: 1
😂: 1
opportunity: 1
family: 1
style: 1
manage: 1
personal.

In [14]:

// Print every character with its count, most frequent first.
println("\nCharacter Frequency:")
for ((character, count) in charsSortedByFrequency) {
    println("$character: $count")
}



Character Frequency:
 : 620
e: 534
r: 340
t: 276
n: 257
s: 251
a: 247
o: 241
i: 220
u: 197
l: 184
c: 135
d: 133
h: 112
.: 100
f: 89
#: 88
g: 83
p: 82
w: 73
y: 73
@: 71
m: 68
3: 60
5: 59
v: 59
7: 58
9: 56
2: 53
8: 48
6: 45
1: 44
b: 40
4: 37
0: 37
k: 20
?: 17
?: 7
x: 7
z: 6
q: 5
?: 3
?: 3
?: 3
?: 2
j: 2
?: 2
❤: 2
️: 2
?: 2
?: 1


In [15]:

// Print the 20 most frequent words from the frequency-ordered map.
println("\nFrequency Analysis:")
showTopFrequentWords(wordFrequencies = wordsSortedByFrequency, topN = 20)



Frequency Analysis:
Top 20 Frequent Words:
#travel: 24
#news: 20
#food: 19
#trending: 14
#funny: 11
😱: 7
citizen: 3
single: 3
unit: 3
itself: 3
serve: 3
of: 3
forget: 3
😭: 3
soldier: 3
throughout: 3
bill: 3
deep: 3
measure.: 3
instead: 3


In [16]:
// Words treated as stop words for this corpus.
val stopWords = setOf(
    "although", "happen", "new", "none", "form",
    "something", "where", "try", "out", "medical",
)

// Frequencies of the stop words that actually occur in the corpus.
val identifiedStopWords = identifyStopWords(wordFrequencies, stopWords)

println("\nStop Word Identification:")
for ((word, count) in identifiedStopWords.entries.take(10)) {
    println("$word: $count")
}


Stop Word Identification:
happen: 2
try: 1
where: 1
new: 1
although: 1
form: 1
none: 1
medical: 1


### Generate word cloud

In [40]:
// Render the word cloud for the corpus and save it next to the notebook.
val outputFilePath = "wordcloud_custom.png"
generateWordCloud(wordFrequencies = wordFrequencies, outputFilePath = outputFilePath)

Word cloud saved to wordcloud_custom.png
