# Assignment 1

3 different subcorpora, belonging to 3 different genres (aka registers or text types). Each genre at least 5,000 words long 

1. The length (in words).
2. The lexical diversity.
3. Top 10 most frequent words and their counts.
4. Words that are at least 10 characters long and their counts.
5. The longest sentence (type the sentence and give the number of words). Hint: look at the Gutenberg part of Section 2.1 in NLTK.
6. A stemmed version of the longest sentence.

In [80]:
#put any imports
import nltk
from nltk import FreqDist
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import PorterStemmer
import string
import re 

In [81]:
def lexical_diversity(text):
    return len(set(text)) / len(text)

In [82]:
def frequent_words(text):
    # Filter out punctuation, single quotes, and double quotes from the tokenized words
    words = [word for word in text if word not in string.punctuation and word != "'" and word != '”' and word !="’" and word !='“']
    
    # Create frequency distribution
    fdist = FreqDist(words)
    return fdist

In [83]:
def long_words(text, word_list, count_list):
    word_list.clear() #make sure nothing is in the list before appending
    count_list.clear() #make sure nothing is in the list before appending
    for w in sorted(set(w.lower() for w in text if w.isalpha())): #normalize to get rid of duplicates
        if len(w) > 10:
            word_list.append(w)
            count_list.append(text.count(w))

In [84]:
def sentences(path, text_file):
    subCorpus = PlaintextCorpusReader(path, text_file)
    subCorpora1_sentences = subCorpus.sents() #puts sentences into list: [[sentence 1], [sentence 2], etc.]

    longest_len = max(len(s) for s in subCorpora1_sentences) #max length sentence
    longest_sentence = "" #empty variable to store longest sentence
    
    #find sentence with longest_len
    for s in subCorpora1_sentences:
        if len(s) == longest_len:
            longest_sentence = s
    return longest_sentence

In [85]:
def stemmed_sentence(sentence):
    ps = PorterStemmer()
    stemmed_sentence = []
    
    #i is the index at which the word is in the list 
    for i in range(len(sentence)):
        #modifies the stemmed_sentence list to contain the stemmed version of each word
        stemmed_sentence.append(ps.stem(sentence[i]))
    return stemmed_sentence

In [86]:
def print_answers(title, subCorpora, fdist, words_list, count_list, longest_sent, stemmed_sent):
    print("SubCorpus:", title)
    print("1. Length (in words):", len(subCorpora), "\n")
    print("2. Lexical diversity:", lexical_diversity(subCorpora), "\n")
    print("3. Top 10 most frequent words and their counts:\n", fdist.most_common(10), "\n")
    print("4. Words that are at least 10 characters long and their counts:")
    for i in range(len(words_list)):
        print(words_list[i], count_list[i])
    print("\n")
    print("The longest sentence:\n", longest_sent)
    print("The length of the longest sentence:", len(longest_sent), "\n")
    print("The stemmed longest sentence:\n", stemmed_sent)
    print("The length of the stemmed longest sentence:", len(stemmed_sent), "\n\n")

In [87]:
path = "./dataset/"
words_list = []
count_list = []

### subcorpus 1

In [88]:
#subcorpora 1
with open("./dataset/researchPaper.txt", "r", encoding = "utf8") as f:
    corpora = f.read()
subCorpora = nltk.word_tokenize(corpora)

#top 10 most frequent words & their counts
fdist = frequent_words(subCorpora)

#words at least 10 characters long & their counts
long_words(subCorpora, words_list, count_list)
  
#longest sentence with number of words
text_file = "researchPaper.txt"
longest_sentence = sentences(path, text_file)

#stemmed version of the longest sentence
stemmed_longest_sentence = stemmed_sentence(longest_sentence)

#print answers
print_answers("Research Paper", subCorpora, fdist, words_list, count_list, longest_sentence, stemmed_longest_sentence)


SubCorpus: Research Paper
1. Length (in words): 9395 

2. Lexical diversity: 0.25502927088877064 

3. Top 10 most frequent words and their counts:
 [('the', 387), ('of', 244), ('and', 239), ('a', 188), ('to', 151), ('in', 139), ('s', 103), ('is', 86), ('that', 78), ('as', 74)] 

4. Words that are at least 10 characters long and their counts:
abandonment 1
accelerating 1
accomplishment 1
acquisition 1
acquisitive 1
adjudication 1
alternatives 1
ambiguously 1
anthropocentric 0
anticipates 2
antithetical 2
apocalyptic 3
appropriate 1
architecture 1
aspirations 1
association 0
bestselling 1
celebrating 1
characterized 1
checkerslike 1
chronological 1
civilization 1
commentators 1
communicated 1
communication 2
comparative 0
competition 1
complicating 1
complication 1
computerization 1
concentrated 1
concentration 1
conditioning 1
considering 0
constitutive 1
constraints 2
constructing 1
construction 1
contemporary 2
contradictions 1
contradictory 2
contraption 1
contrasting 1
contributed 1

### subcorpus 2

In [89]:
#subcorpora 2
with open("./dataset/languageAndThought.txt", "r", encoding = "utf8") as f:
    corpora = f.read()
subCorpora = nltk.word_tokenize(corpora)

#top 10 most frequent words & their counts
fdist = frequent_words(subCorpora)

#words at least 10 characters long & their counts
long_words(subCorpora, words_list, count_list)
  
#longest sentence with number of words
text_file = "languageAndThought.txt"
longest_sentence = sentences(path, text_file)

#stemmed version of the longest sentence
stemmed_longest_sentence = stemmed_sentence(longest_sentence)

#print answers
print_answers("Language and Thought", subCorpora, fdist, words_list, count_list, longest_sentence, stemmed_longest_sentence)


SubCorpus: Language and Thought
1. Length (in words): 6230 

2. Lexical diversity: 0.26324237560192615 

3. Top 10 most frequent words and their counts:
 [('the', 184), ('of', 173), ('to', 147), ('and', 114), ('language', 94), ('a', 88), ('in', 87), ('that', 78), ('is', 76), ('as', 63)] 

4. Words that are at least 10 characters long and their counts:
accumulating 1
acquisition 3
adaptations 1
adjustments 1
adolescents 1
alternatively 0
application 1
appropriate 4
approximately 1
automatically 4
backgrounds 1
behaviourists 1
broekhuizen 0
characteristics 1
characterize 1
characterized 1
characterizes 1
cognitively 1
communicate 11
communicated 1
communicates 1
communicating 2
communication 17
communicative 5
communities 1
complexities 1
complicated 1
comprehension 6
concentrated 1
concentrating 1
conditioning 2
congratulations 1
connections 1
connotations 1
constructed 2
conventional 1
conventions 1
conversation 1
demonstrated 1
development 4
developmental 1
differences 5
differently 2

### subcorpus 3

In [90]:
#subcorpora 2
with open("""./dataset/emails.txt""", "r", encoding="utf8") as f:
    corpora = f.read()
subCorpora = nltk.word_tokenize(corpora)

#top 10 most frequent words & their counts
fdist = frequent_words(subCorpora)

#words at least 10 characters long & their counts
long_words(subCorpora, words_list, count_list)
  
#longest sentence with number of words
text_file = """emails.txt"""
longest_sentence = sentences(path, text_file)

#stemmed version of the longest sentence
stemmed_longest_sentence = stemmed_sentence(longest_sentence)

#print answers
print_answers("emails", subCorpora, fdist, words_list, count_list, longest_sentence, stemmed_longest_sentence)


SubCorpus: emails
1. Length (in words): 7151 

2. Lexical diversity: 0.20109075653754718 

3. Top 10 most frequent words and their counts:
 [('I', 302), ('a', 175), ('to', 160), ('you', 159), ('the', 122), ('and', 107), ('it', 106), ('for', 78), ('m', 75), ('s', 69)] 

4. Words that are at least 10 characters long and their counts:
accommodations 1
backstories 1
brightening 1
challenging 1
collectibles 1
comfortable 1
considering 1
conversations 2
daydreaming 1
destinations 1
distractions 1
distributions 1
experimenting 1
highlighter 1
inspiration 1
merchandise 1
moisturizer 1
overwhelming 3
photography 1
probability 1
professional 1
recommendations 7
recommended 3
suggestions 4
surprisingly 1
thanksgiving 0
therapeutic 1


The longest sentence:
 ['I', '’', 'm', 'not', 'sure', 'if', 'I', 'want', 'to', 'go', 'anywhere', 'this', 'year', 'or', 'just', 'stay', 'local', ',', 'but', 'I', 'thought', 'I', '’', 'd', 'ask', 'if', 'you', 'had', 'any', 'ideas', 'or', 'plans', 'for', 'the', 'holida