# NLTK: gender analysis

In [1]:
# On Jupyter - run this cell
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from urllib.request import urlopen 

In [None]:
# on Colab -- run this cell
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from urllib.request import urlopen 

## selecting and cleaning our text

In [2]:
def clean(url):
    # loads and tokenizes the text
    open_url = urlopen(url)
    read_text = open_url.read()
    decoded_text = read_text.decode()
    tokens = nltk.word_tokenize(decoded_text)
    
    # cleans the text
    stops = stopwords.words('english')
    text = [] 
    for word in tokens:
        if word.isalpha(): 
            if word not in stops:
                text.append(word.lower())     
    
    return text

In [4]:
# books by women
frankenstein = 'https://www.gutenberg.org/cache/epub/84/pg84.txt'
little_women = 'https://www.gutenberg.org/cache/epub/37106/pg37106.txt'
pride_prejudice = 'https://www.gutenberg.org/cache/epub/1342/pg1342.txt'
orlando = 'http://gutenberg.net.au/ebooks02/0200331.txt'

# books by men
dracula = 'https://www.gutenberg.org/cache/epub/345/pg345.txt'
dorian_gray = 'https://www.gutenberg.org/cache/epub/174/pg174.txt'
crime_punishment = 'https://www.gutenberg.org/files/2554/2554-0.txt'
heart_darkness = 'https://www.gutenberg.org/files/219/219-0.txt'

In [5]:
# running the "clean" function on the text "orlando"

text = clean(orlando)

In [6]:
print(text)

['project', 'gutenberg', 'australia', 'title', 'orlando', 'author', 'virginia', 'woolf', 'a', 'project', 'gutenberg', 'australia', 'ebook', 'ebook', 'no', 'language', 'english', 'date', 'first', 'posted', 'april', 'date', 'recently', 'updated', 'july', 'this', 'ebook', 'produced', 'sue', 'asscher', 'project', 'gutenberg', 'australia', 'ebooks', 'created', 'printed', 'editions', 'public', 'domain', 'australia', 'unless', 'copyright', 'notice', 'included', 'we', 'not', 'keep', 'ebooks', 'compliance', 'particular', 'paper', 'edition', 'copyright', 'laws', 'changing', 'world', 'be', 'sure', 'check', 'copyright', 'laws', 'country', 'downloading', 'redistributing', 'file', 'this', 'ebook', 'made', 'available', 'cost', 'almost', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'give', 'away', 'terms', 'project', 'gutenberg', 'australia', 'license', 'may', 'viewed', 'online', 'to', 'contact', 'project', 'gutenberg', 'australia', 'go', 'orlando', 'a', 'biography', 'by', 'virginia', 'woolf', 

## NLTK methods for text analysis

In [10]:
# turn our saved text (list of strings) to an NLTK object for 
# text analysis

orlando = nltk.Text(text)

In [11]:
# most common words

orlando.vocab().most_common(20)

[('orlando', 436),
 ('one', 369),
 ('the', 309),
 ('would', 292),
 ('could', 234),
 ('she', 218),
 ('i', 197),
 ('said', 172),
 ('but', 158),
 ('like', 157),
 ('upon', 153),
 ('great', 143),
 ('he', 140),
 ('it', 135),
 ('thought', 135),
 ('man', 130),
 ('time', 129),
 ('woman', 123),
 ('life', 117),
 ('and', 113)]

In [12]:
# load up and clean a text by a man for comparison 

dorian = clean(dorian_gray)

In [13]:
print(dorian)



In [14]:
dorian = nltk.Text(dorian)

dorian.vocab().most_common(20)

[('i', 1684),
 ('he', 499),
 ('the', 434),
 ('one', 430),
 ('dorian', 410),
 ('it', 384),
 ('you', 376),
 ('would', 305),
 ('said', 261),
 ('lord', 248),
 ('henry', 234),
 ('life', 222),
 ('like', 210),
 ('gray', 195),
 ('but', 182),
 ('man', 178),
 ('never', 173),
 ('know', 171),
 ('harry', 166),
 ('must', 164)]

In [15]:
# question: how does each text treat male and female characters?

orlando.similar('man')

writing never father young window still death heart even always
carriage fell matters countrywoman boys tongue indulged ships grown
become


In [16]:
orlando.similar('woman')

no could till growing saw moment reached servant always live wretch
cried led resembled serpentine


In [18]:

orlando.common_contexts(["woman"])

old_loved young_raised bumboat_carrying boy_loose must_could
she_orlando old_answered country_hacking old_natural beard_skin
old_hobbling old_whatever certainly_bred cossack_waste there_white
suffocated_bed rustle_dress belated_quarter faithless_insults
christian_understood


## group challenge

Compare and contrast one text by a woman and one text by a man. One possibility is to compare the main characters from each text. Another possibility is to compare the genders of the main characters. Or you can explore themes, or settings from the text.

After cleaning your text, use NLTK methods like: 
- `concordance()`
- `dispersion_plot([])`
- `collocations()`
- `common_contexts([])`
- most common words: `text.vocab().most_common(20)`

Try to answer the following questions:
- What words are associated with "man" and "woman" from each text?
- What are the differences between the genders?
- What are the differences in gender portrayal between the male and female authors?

Frankenstein: "man" vs "woman" 
- with `similar()`, "man" has 14 words, and "woman" has 1. 
- Means that men has more references than women text. Makes sense because it's about a male scientist creating a man. 

In [19]:
orlando.count("woman")

123

In [20]:
orlando.count("man")

130

In [21]:
text = clean(dracula)

In [22]:
dracula = nltk.Text(text)

In [24]:
dracula.similar('woman')

diary go asked said though look man lady less but key peace confined
volcanoes
