version 1.0.0

cran · Dec 6, 2023 · b5b8112 · b5b8112
commit b5b8112
Show file tree

Hide file tree

Showing 32 changed files with 1,490 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,23 @@
+Package: genCountR
+Title: Interacting with Roberts and Utych's (2019) Gendered Language
+        Dictionary
+Version: 1.0.0
+Authors@R: 
+    person("Damon", "Roberts", , "damon.charles.roberts@gmail.com", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0002-4360-3675"))
+Description: Allows users to generate a gendered language score according to the gendered language dictionary in Roberts and Utych (2019) <doi:10.1177/1065912919874883>.
+License: MIT + file LICENSE
+Encoding: UTF-8
+RoxygenNote: 7.2.3
+Depends: R (>= 2.10)
+LazyData: true
+Suggests: devtools, knitr, rmarkdown, testthat
+VignetteBuilder: knitr
+URL: https://gencounter.app.damoncroberts.com,
+        https://damoncharlesroberts.github.io/genCountR/
+NeedsCompilation: no
+Packaged: 2023-12-05 00:37:50 UTC; dcr
+Author: Damon Roberts [aut, cre] (<https://orcid.org/0000-0002-4360-3675>)
+Maintainer: Damon Roberts <damon.charles.roberts@gmail.com>
+Repository: CRAN
+Date/Publication: 2023-12-05 17:50:04 UTC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2023
+COPYRIGHT HOLDER: genCountR authors
diff --git a/MD5 b/MD5
@@ -0,0 +1,31 @@
+99aa1790afcd2fd3ec2db69637636e3c *DESCRIPTION
+57eb66a14d63272aae4d53d18caf55f9 *LICENSE
+f24b374b29900adc0b9e7c185d616119 *NAMESPACE
+b765f6481b5e07835af5da7c50a6219a *R/dict.R
+df6cc46bc7fae1a55b713f3d5065b35a *R/genCountR-package.R
+1d57ac357049fbef5296852f25b3e272 *R/gen_count.R
+1e0e9d71d0eb04f2fff2596b05518c9b *R/gen_score.R
+7861d5546b952689049afa0b1a860aa6 *R/text_clean.R
+81d94cf11fd9fde737f3b5123fc28f91 *R/word_count.R
+3f9bcf526e32af4bc36db5ff5622d0f9 *README.md
+439bf689fa27cf9affd0335332142165 *build/partial.rdb
+142303c55feaab697db426ba88b0ec1b *build/vignette.rds
+edf70d7dc769d86a8f9236738ce4bbbb *data/dict.rda
+28022ec175d3d4803457da88ed0a078d *inst/doc/gen_count.R
+42386983344f1fd6443025bac9a2b278 *inst/doc/gen_count.Rmd
+76d1c5e3b730c2b94dbfe174eb6aaafb *inst/doc/gen_count.html
+9364e385a39a7cd688b50d7a145ced41 *inst/doc/gen_score.R
+98487cc4e52fd35ad5e7e6d1fc66bede *inst/doc/gen_score.Rmd
+7985d9186596a36a40f5bb6753e6b6c4 *inst/doc/gen_score.html
+0db43a890a369b941b461bd4bdfa7b8d *man/dict.Rd
+fcff4adeefbbdd9d5b1305b3ac72581f *man/genCountR-package.Rd
+bdcfb9d1c7c6727975f148a724efe1d8 *man/gen_count.Rd
+21cd66b3097a82a5c8ea2eae7cfc4793 *man/gen_score.Rd
+5c49d7fccd1d918bf03e9014a9f4d88f *man/text_clean.Rd
+bb7ccdaae30ad4f6227128c2d026b8bc *man/word_count.Rd
+2910db24b4be2d49f537dc589a6e855a *tests/testthat/test-gen-count.R
+0a8d88c14d44dca66890f2198ad75721 *tests/testthat/test-gen-score.R
+33d4edd9765607946fece4908ea8414a *tests/testthat/test-text-clean.R
+8697ed5ff0ec37fb3471a3c4a45dfc27 *tests/testthat/test-word-count.R
+42386983344f1fd6443025bac9a2b278 *vignettes/gen_count.Rmd
+98487cc4e52fd35ad5e7e6d1fc66bede *vignettes/gen_score.Rmd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,4 @@
+# Generated by roxygen2: do not edit by hand
+
+export(gen_count)
+export(gen_score)
diff --git a/R/dict.R b/R/dict.R
@@ -0,0 +1,15 @@
+#' @title
+#' Data from Gendered Language Dictionary Developed by Roberts and Utych (2019)
+#'
+#' @description
+#' Each word in the dataset contains a rating by human coders. See details of dataset in the original paper.
+#'
+#' @format A tibble with 701 rows and 15 columns:
+#' \describe{
+#'   \item{Word}{The word to match}
+#'   \item{POS}{Part Of Speech}
+#'   \item{mean.a}{Mean score provided by all participants}
+#'   \item{std.dev.a}{Standard deviation of score provided by all participants}
+#' }
+#' @source \url{https://journals.sagepub.com/doi/10.1177/1065912919874883}
+"dict"
diff --git a/R/genCountR-package.R b/R/genCountR-package.R
@@ -0,0 +1,6 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+## usethis namespace: end
+NULL
diff --git a/R/gen_count.R b/R/gen_count.R
@@ -0,0 +1,48 @@
+#' @title gen_count
+#'
+#' @description
+#' Counts how often each word from the Gendered Language Dictionary occurs in
+#' the supplied text and labels each matched word as Feminine, Neutral, or
+#' Masculine.
+#'
+#' @details
+#' The text is passed through \code{text_clean()}, the dictionary words it
+#' contains are counted with \code{word_count()}, and each match is joined to
+#' its \code{mean.a} score in \code{genCountR::dict}. Words are loosely
+#' categorized based on Roberts and Utych's (2019) definition: Feminine words
+#' have a score below 2.5, Neutral words have a score of at least 2.5 and
+#' below 5.5, and Masculine words have a score of 5.5 or higher.
+#'
+#' @param text A character string to analyze.
+#'
+#' @return
+#' A data.frame with one row per matched dictionary word and the columns
+#' \code{word}, \code{count} (number of occurrences in the text),
+#' \code{score} (the word's \code{mean.a} value in the dictionary), and
+#' \code{classified}. When no word in the text matches the dictionary, a
+#' zero-row data.frame with the same four columns is returned.
+#'
+#' @examples
+#' text <- 'This person was a heroine due to their fighting during the war.'
+#' result_df <- genCountR::gen_count(text)
+#'
+#' @export
+gen_count <- function(
+  text
+) {
+  # Clean the text in the document
+  word_list <- text_clean(text)
+
+  # Count how often each dictionary word occurs in the text
+  df <- word_count(word_list)
+
+  # Attach the dictionary score. merge() keeps only rows present in both
+  # inputs by default, so no extra argument is needed for an inner join.
+  df <- merge(df, genCountR::dict, by.x = "word", by.y = "Word")
+
+  # No matches: return a typed, empty result so that every column
+  # (including count) is always present.
+  if (nrow(df) == 0L) {
+    return(data.frame(
+      word = character(0)
+      , count = integer(0)
+      , score = numeric(0)
+      , classified = character(0)
+    ))
+  }
+
+  # Keep the columns of interest under their public names
+  result_df <- data.frame(
+    word = df$word
+    , count = df$count
+    , score = df$mean.a
+  )
+
+  # Create column that assigns label
+  result_df["classified"] <- ifelse(
+    result_df$score < 2.5, "Feminine" # below 2.5: feminine
+    , ifelse(
+      result_df$score >= 5.5, "Masculine" # 5.5 or above: masculine
+      , "Neutral" # everything in between: neutral
+    )
+  )
+
+  # Return the dataframe
+  return(result_df)
+}
diff --git a/R/gen_score.R b/R/gen_score.R
@@ -0,0 +1,64 @@
+#' @title gen_score
+#'
+#' @description
+#' Calculates the score of the supplied text string based on the Gendered
+#' Language Dictionary created by Roberts and Utych (2019).
+#'
+#' @details
+#' Takes the matched words and their occurrence in the supplied text, finds
+#' the score for those matched words in the dictionary, multiplies each score
+#' by the word's count, sums those products up and then divides the sum by the
+#' total number of words in the supplied text (as returned by
+#' \code{text_clean()}).
+#'
+#' @param text A character string to analyze.
+#'
+#' @return
+#' A list with three elements: \code{avg_score} (the total score divided by
+#' the number of words in the text; \code{NaN} when the text contains no
+#' words), \code{total_score} (the sum of count times score over all matched
+#' words; 0 when nothing matches), and \code{df}, a data.frame of matches with
+#' the columns \code{word}, \code{count}, \code{mean.a} (the dictionary score)
+#' and \code{total_score} (count times score for that word).
+#'
+#' @examples
+#' text <- 'Hero. hero Heroine. heroine, Prison. Prisom.'
+#' result <- genCountR::gen_score(text)
+#' result$avg_score
+#' result$total_score
+#' result$df
+#'
+#' @export
+gen_score <- function(
+    text
+) {
+    # Clean the text in the document
+    word_list <- text_clean(text)
+
+    # Count how often each dictionary word occurs in the text
+    df <- word_count(word_list)
+
+    # Attach the dictionary score. base::merge() keeps only rows present in
+    # both inputs by default, so no extra argument is needed for an inner join.
+    df <- base::merge(df, genCountR::dict, by.x = "word", by.y = "Word")
+
+    if (nrow(df) == 0L) {
+        # No matches: build an empty table with the same columns as the
+        # non-empty case instead of failing on a missing count column.
+        matches <- data.frame(
+            "word" = character(0)
+            , "count" = integer(0)
+            , "mean.a" = numeric(0)
+            , "total_score" = numeric(0)
+        )
+    } else {
+        # The score column keeps its dictionary name (mean.a) so the shape of
+        # the returned data.frame stays what existing callers receive.
+        matches <- df[c("word", "count", "mean.a")]
+
+        # Calculate total row-wise score
+        matches[["total_score"]] <- matches[["count"]] * matches[["mean.a"]]
+    }
+
+    # Get the total score for the document
+    total_score <- sum(matches[["total_score"]])
+
+    # Assemble the result; the average is taken over all words in the text
+    result <- list(
+        "avg_score" = total_score / length(word_list)
+        , "total_score" = total_score
+        , "df" = matches
+    )
+
+    # Return result
+    return(result)
+}
diff --git a/R/text_clean.R b/R/text_clean.R
@@ -0,0 +1,29 @@
+#' @title text_clean
+#'
+#' @description
+#' Cleans the supplied text string and converts it into a character vector of
+#' individual words.
+#'
+#' @details
+#' Takes the string, converts all the characters to lower case, replaces each
+#' punctuation character with a space, and splits the string on runs of
+#' whitespace. Empty tokens (produced when the text starts with whitespace or
+#' punctuation) are dropped. Only the first element of \code{text} is used.
+#'
+#' @param text A character string.
+#'
+#' @return
+#' Character vector of the words in the text, in lower case and without
+#' punctuation; \code{character(0)} when the text contains no words.
+#'
+text_clean <- function(
+  text
+){
+  # Convert all characters to lower case
+  cleaned <- base::tolower(text)
+
+  # Replace punctuation with spaces so adjoining words stay separated
+  cleaned <- base::gsub("[[:punct:]]", " ", cleaned)
+
+  # Split on whitespace
+  word_list <- base::strsplit(cleaned, "\\s+")[[1]]
+
+  # strsplit() yields a leading "" when the text starts with a separator;
+  # drop empty tokens so they are not counted as words
+  word_list <- word_list[base::nzchar(word_list)]
+
+  # Return word list
+  return(word_list)
+}
diff --git a/R/word_count.R b/R/word_count.R
@@ -0,0 +1,35 @@
+#' @title word_count
+#'
+#' @description
+#' Count number of times a particular word from the dictionary shows up in a document.
+#'
+#' @details
+#' Keeps the distinct words of the input that appear in the \code{Word} column
+#' of \code{genCountR::dict} and counts how many times each of them occurs in
+#' the input.
+#'
+#' @param word_item A vector or array of words from the text to be matched to
+#' the dictionary.
+#'
+#' @return
+#' data.frame with one row per distinct matched word and the columns
+#' \code{word} and \code{count}. When no word matches, a zero-row data.frame
+#' with both columns is returned.
+#'
+#'
+word_count <- function(
+  word_item
+) {
+  # Filter the words that are in the document
+  unique_words <- unique(word_item)
+  matched <- unique_words[unique_words %in% genCountR::dict$Word]
+
+  # Count number of times the word shows up. vapply() always returns an
+  # integer vector, whereas sapply() returns an empty list when nothing
+  # matched, which made data.frame() silently drop the count column.
+  count <- vapply(matched, function(w) sum(word_item == w), integer(1))
+
+  # Create a dataframe with one row per matched word
+  row_df <- base::data.frame(
+    word = matched
+    , count = count
+  )
+
+  # Return the row
+  return(row_df)
+}
diff --git a/README.md b/README.md
@@ -0,0 +1,36 @@
+<h4 align="center">genCountR R Package</h4>
+<p align="center">
+    <a href="https://github.com/DamonCharlesRoberts/genCountR/commits/main">
+    <img src="https://img.shields.io/github/last-commit/DamonCharlesRoberts/genCountR.svg?style=flat-square&logo=github&logoColor=white"
+         alt="GitHub last commit"></a>
+    <a href="https://github.com/DamonCharlesRoberts/genCountR/issues">
+    <img src="https://img.shields.io/github/issues-raw/DamonCharlesRoberts/genCountR.svg?style=flat-square&logo=github&logoColor=white"
+         alt="GitHub issues"></a>
+    <a href="https://github.com/DamonCharlesRoberts/genCountR/pulls">
+    <img src="https://img.shields.io/github/issues-pr-raw/DamonCharlesRoberts/genCountR.svg?style=flat-square&logo=github&logoColor=white"
+         alt="GitHub pull requests"></a>
+   <a href="https://github.com/DamonCharlesRoberts/genCountR/actions/workflows/publish.yml"></a>
+</p>
+
+I developed this R package for researchers and other users to be able to utilize the Gendered Language Dictionary developed by [Roberts and Utych (2019)](https://journals.sagepub.com/doi/full/10.1177/1065912919874883).
+
+The package takes a document loaded into R, counts the words it contains, and then creates a score based on the average score of the words in the document that match those in the Gendered Language Dictionary.
+
+This package is a quick side project that I worked on during my dissertation. So use at your own risk.
+
+If there are issues with the package or if you'd like to request new features, please do so on the issues tab of this repository.
+
+### Vignettes
+
+See [https://gencounter.app.damoncroberts.com](https://gencounter.app.damoncroberts.com)
+
+### Webapp
+
+The alternate webapp for this R package has been deprecated.
+
+### Project Contributors:
+
+<ul>
+	<li> <a href="https://github.com/DamonCharlesRoberts/">Damon C. Roberts (Developer, Maintainer)</a> </li>
+	<li>Stephen M. Utych (co-author on original academic project) </li>
+</ul>
diff --git a/build/partial.rdb b/build/partial.rdb
diff --git a/build/vignette.rds b/build/vignette.rds
diff --git a/data/dict.rda b/data/dict.rda
diff --git a/inst/doc/gen_count.R b/inst/doc/gen_count.R
@@ -0,0 +1,16 @@
+## ----include = FALSE----------------------------------------------------------
+# Set knitr chunk options for the vignette. Per the knitr documentation,
+# collapse = TRUE merges a chunk's source and output into one block and
+# comment = "#>" is the prefix put in front of each output line.
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+
+## -----------------------------------------------------------------------------
+# Load the package
+library(genCountR)
+
+# Pass a string to be analyzed
+# NOTE(review): the name `str` masks base R's str() function in this session.
+str <- "This person was a hero. They were a prisoner of war and I, as President, got them out."
+
+# Use the gen_count() function on the str
+gen_count(str)
+
diff --git a/inst/doc/gen_count.Rmd b/inst/doc/gen_count.Rmd
@@ -0,0 +1,32 @@
+---
+title: "gen_count"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{gen_count}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+The `gen_count` function is a really helpful function for those who may not necessarily want to generate a score based on the Gendered Language Dictionary in Roberts and Utych ([2019](https://journals.sagepub.com/doi/10.1177/1065912919874883)). Rather, the function will enable you to simply count the occurrences of each word in your supplied text that appears in the dictionary, along with the corresponding score of that word in the Dictionary and whether it would be classified as Masculine, Neutral, or Feminine according to Roberts and Utych's ([2019](https://journals.sagepub.com/doi/10.1177/1065912919874883)) definition in the original paper.
+
+First, you should supply some text as a string to the function. Then, once you execute the function, it should return a `data.frame` with 4 columns: the word that was matched, the number of times the word occurred in your supplied text, the score of that word according to the dictionary, and the classification of the word.
+
+Here is an example of how that function works.
+
+```{r}
+# Load the package
+library(genCountR)
+
+# Pass a string to be analyzed
+str <- "This person was a hero. They were a prisoner of war and I, as President, got them out."
+
+# Use the gen_count() function on the str
+gen_count(str)
+```