
# Objective of the project:

1- Download the following book (Plain text):

    https://www.gutenberg.org/files/11/11-0.txt

2- Read the file line by line and:

    a. remove all non-alphabetic characters
    b. convert all characters to lowercase

3- Calculate the following stats:

    a. number of paragraphs (paragraphs are separated by empty lines)
    b. number of lines (excluding empty lines)
    c. number of words
    d. number of unique words

4- Count how many times each unique word occurs in the entire book

5- Create a CSV file named “stats.csv” containing the statistics calculated above,
formatted like the following example (the numbers below are made up). The words are
sorted by number of occurrences, from highest to lowest:

    _paragraphs,1000
    _lines,5000
    _words,25000
    _unique_words,10000
    a,7500
    an,5000
    and,3500
    or,2500


In [None]:
import csv
from collections import Counter

# Characters regarded as non-alphabetic: the ASCII digits, the curly double
# quotes, and the ASCII punctuation characters. The backslash is spelled as an
# explicit "\\" escape; a bare "\]" is an invalid escape sequence that newer
# Python versions warn about, even though it yields the same string.
non_alpha_char = '0123456789“”#!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'


def _summarize_text(text):
    """Return ``(paragraph_count, line_count, word_counts)`` for *text*.

    The text is lowercased and every character that is neither a letter nor
    whitespace is dropped before anything is counted. ``word_counts`` is a
    ``Counter`` mapping each word to its number of occurrences, with keys in
    order of first appearance.
    """
    # Filter by character class instead of a fixed punctuation list so that
    # typographic characters (curly apostrophes, dashes, a leading BOM, ...)
    # are removed as well.
    cleaned = ''.join(
        ch for ch in text.lower() if ch.isalpha() or ch.isspace()
    )

    paragraph_count = 0
    line_count = 0
    # The start of the text behaves like a preceding blank line, so the first
    # non-blank line opens the first paragraph.
    previous_blank = True
    for line in cleaned.split('\n'):
        # A line that holds only whitespace after cleaning carries no words
        # and is treated as empty.
        blank = not line.strip()
        if not blank:
            line_count += 1
            if previous_blank:
                paragraph_count += 1
        previous_blank = blank

    return paragraph_count, line_count, Counter(cleaned.split())


def output_data_stats(output_file='stats.csv', input_file='wonderland.txt'):
    """Write text statistics for *input_file* to *output_file* as CSV.

    The output starts with the header row ``description , count `` followed by
    four summary rows (``_paragraphs``, ``_lines``, ``_words``,
    ``_unique_words``) and then one ``word,count`` row per unique word, sorted
    by count from highest to lowest. Words with equal counts keep the order in
    which they first appear in the text.

    Args:
        output_file: Path of the CSV file to create or overwrite.
        input_file: Path of the UTF-8 text file to analyse.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Read the input completely before touching the output so a missing or
    # unreadable input does not truncate an existing report.
    with open(input_file, encoding="utf8") as data_input:
        text = data_input.read()

    paragraph_count, line_count, word_counts = _summarize_text(text)

    # newline='' lets the csv module control line endings itself; without it
    # an extra blank row appears between records on Windows.
    with open(output_file, 'w', encoding="utf8", newline='') as stats_output:
        writer = csv.writer(stats_output)
        writer.writerow(['description ', ' count '])
        writer.writerow(['_paragraphs', paragraph_count])
        writer.writerow(['_lines', line_count])
        writer.writerow(['_words', sum(word_counts.values())])
        writer.writerow(['_unique_words', len(word_counts)])
        # most_common() sorts by count, descending, and is stable for ties.
        writer.writerows(word_counts.most_common())

In [None]:
# Generate the report using the function's default paths: reads
# 'wonderland.txt' and writes 'stats.csv'.
output_data_stats()