# Applied Statistical Data Analysis

## 1st Assignment - 24.10.2024 - Farmand Bazdiditehrani (Group 5)

### Importing Libraries

In [57]:
from docx import Document # in order to read the text from the word document file
import re # used for regular expressions and strings
import pandas as pd
from tabulate import tabulate

### Importing Text from the Word document file

In [58]:
def read_docx(file_path):
    doc = Document("C:\\Users\\ASUS\\OneDrive\\Desktop\\Group5Text.docx") # This path is from my own system
    text = []
    for para in doc.paragraphs:
        text.append(para.text)
    return '\n'.join(text)

### Classification of the words based on their type (Noun, Article, ...)

#### Many examples are defined for each category. However, for Verbs, Adverbs, and Adjectives it is needed to find them by their ending characters.
##### Verbs Endings: ing, ed, es
##### Adverbs Endings: ly
##### Adjectives Endings: ous, ful, ic

In [59]:
def classify_word(word):
    function_words = {
        'articles': {'the', 'a', 'an'},
        'prepositions': {'in', 'on', 'at', 'by', 'with', 'about', 'against', 'among', 'between', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'under', 'over', 'again', 'further', 'then', 'once'},
        'conjunctions': {'and', 'but', 'or', 'nor', 'so', 'for', 'yet'},
        'pronouns': {'he', 'she', 'it', 'they', 'we', 'you', 'I', 'me', 'him', 'her', 'us', 'them'},
        'determiners': {'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her', 'its', 'our', 'their'},
        'modals': {'can', 'could', 'may', 'might', 'will', 'would', 'shall', 'should', 'must'},
        'auxiliary_verbs': {'am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did'}
    }

# Verbs, Adverbs, and Adjectives are identified based on their ending! 
    
    if word in function_words['articles']:
        return 'article'
    elif word in function_words['prepositions']:
        return 'preposition'
    elif word in function_words['conjunctions']:
        return 'conjunction'
    elif word in function_words['pronouns']:
        return 'pronoun'
    elif word in function_words['determiners']:
        return 'determiner'
    elif word in function_words['modals']:
        return 'modal'
    elif word in function_words['auxiliary_verbs']:
        return 'auxiliary verb'
    elif word.endswith('ly'):
        return 'adverb'
    elif word.endswith('ing') or word.endswith('ed') or word.endswith('es'):
        return 'verb'
    elif word.endswith('ous') or word.endswith('ful') or word.endswith('ic'):
        return 'adjective'
    else:
        return 'noun'  # If it's none of the categories above, it should be a noun!

### Creating a Dictionary and including their Frequency, Length, and Type

In [60]:
def process_text_to_dict(text):
    words_dict = {}
    words = re.findall(r'\b\w+\b', text.lower()) # lowercases every word

    for word in words:
        word_type = classify_word(word)
        if word in words_dict:
            words_dict[word]['frequency'] += 1
        else:
            words_dict[word] = {
                'frequency': 1, # it means there only exists one of this word
                'length': len(word),
                'type': word_type,
            }
    return words_dict

### Transforming created dictionary to lists in a table

In [61]:
def get_word_attributes(words_dict):
    table_data = []
    for word, attributes in words_dict.items():
        table_data.append([word, attributes['frequency'], attributes['length'], attributes['type']])
    return table_data

### An easy way to show content of the table using the pandas library

In [62]:
import pandas as pd
df = pd.DataFrame.from_dict(words_dict, orient='index')
df

Unnamed: 0,frequency,length,type
the,20,3,article
idea,2,4,noun
of,15,2,noun
d,11,1,noun
ecroissance,11,11,noun
...,...,...,...
sense,1,5,noun
seen,1,4,noun
should,1,6,modal
abandoned,1,9,verb


### Another way to create a table including all words using tabulate

In [63]:
file_path = "C:\\Users\\ASUS\\OneDrive\\Desktop\\Group5Text.docx"
doc_string = read_docx(file_path)
words_dict = process_text_to_dict(doc_string)
word_attributes_table = get_word_attributes(words_dict)
headers = ["Word", "Frequency", "Length", "Type"]
print(tabulate(word_attributes_table, headers, tablefmt="grid"))

+----------------+-------------+----------+----------------+
| Word           |   Frequency |   Length | Type           |
| the            |          20 |        3 | article        |
+----------------+-------------+----------+----------------+
| idea           |           2 |        4 | noun           |
+----------------+-------------+----------+----------------+
| of             |          15 |        2 | noun           |
+----------------+-------------+----------+----------------+
| d              |          11 |        1 | noun           |
+----------------+-------------+----------+----------------+
| ecroissance    |          11 |       11 | noun           |
+----------------+-------------+----------+----------------+
| emerged        |           1 |        7 | verb           |
+----------------+-------------+----------+----------------+
| in             |          12 |        2 | preposition    |
+----------------+-------------+----------+----------------+
| france         |      

## 