In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from nltk.corpus import stopwords
# English stop words held in a set, so the `not in stop_words` filter in
# calculate_text_statistics is a hash lookup rather than a list scan.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. fetched once with nltk.download("stopwords")) — confirm for
# fresh environments.
stop_words = set(stopwords.words("english"))

In [16]:
# Root folder of the stop-word-filtered corpora; every dataset sits in its own
# sub-folder as a header-less, tab-separated corpus.tsv.
CORPUS_DIR = "./data/without_stopwords"


def load_corpus(dataset_folder):
    """Read ``<CORPUS_DIR>/<dataset_folder>/corpus.tsv`` into a one-column frame.

    Args:
        dataset_folder: Name of the dataset sub-folder (e.g. ``"trump"``).

    Returns:
        A DataFrame whose single column is named ``text``.

    Raises:
        ValueError: if the file does not contain exactly one column (raised by
            the column rename).
    """
    corpus = pd.read_csv(
        f"{CORPUS_DIR}/{dataset_folder}/corpus.tsv",
        sep="\t",
        header=None,
        # Keep every document as a string: with the default NA handling a
        # document consisting solely of a token such as "nan", "null" or "NA"
        # is parsed as a float NaN, which breaks later .split() / " ".join calls.
        dtype=str,
        keep_default_na=False,
    )
    corpus.columns = ["text"]
    return corpus


twenty_ng_df = load_corpus("20newsgroups")
trump_df = load_corpus("trump")
un_df = load_corpus("un")

50422


In [None]:
# Flatten the Trump corpus into a single list of whitespace-separated tokens,
# preserving document order.
list_of_words = [token for document in trump_df["text"] for token in document.split()]

In [None]:
# Display only the 20 most frequent tokens: rendering the whole Counter would
# write one entry per distinct token into the notebook output.
Counter(list_of_words).most_common(20)

In [10]:
def get_variable_name(variable):
    """Return the name of the first module-level global bound to ``variable``.

    The lookup compares by identity (``is``), so two equal but distinct objects
    are not confused. If several globals refer to the same object, the one that
    appears first in ``globals()`` wins.

    Args:
        variable: The object to look up.

    Returns:
        The global name as a string.

    Raises:
        IndexError: if no global in this module refers to ``variable``.
    """
    for name, value in globals().items():
        if value is variable:
            # Stop at the first hit instead of collecting every alias.
            return name
    # Same exception type as indexing an empty match list, but with a message
    # that says what actually went wrong.
    raise IndexError(
        f"no global variable refers to the given {type(variable).__name__} object"
    )

def calculate_text_statistics(df, column_name, excluded_words=None, top_n=10):
    """Compute corpus-level summary statistics for a text column.

    Side effect: adds (or overwrites) two columns on ``df`` itself:
    ``word_count`` (words per document) and ``word_length`` (mean characters
    per word in that document, NaN for documents without words).

    Args:
        df: DataFrame holding one document per row.
        column_name: Name of the column containing the document text.
        excluded_words: Lower-case words to leave out of the most-common list.
            Defaults to the module-level ``stop_words`` set.
        top_n: How many of the most frequent words to return.

    Returns:
        A tuple ``(number_of_documents, average_word_count,
        average_word_length, most_common_words, vocabulary_size)``.
        ``average_word_length`` ignores documents that contain no words, and
        ``vocabulary_size`` counts distinct case-sensitive tokens including
        excluded words.
    """
    if excluded_words is None:
        excluded_words = stop_words

    # Tokenise every document exactly once and reuse the result below.
    words_per_document = df[column_name].apply(_split_words)

    number_of_documents = df.shape[0]
    df["word_count"] = words_per_document.apply(len)
    df["word_length"] = words_per_document.apply(_mean_word_length)
    average_word_count = df["word_count"].mean()
    # Series.mean skips NaN, so empty documents do not distort the average.
    average_word_length = df["word_length"].mean()

    all_words = [word for words in words_per_document for word in words]
    # The exclusion test is case-insensitive; the counted tokens keep their case.
    filtered_words = [word for word in all_words if word.lower() not in excluded_words]

    common_words_counter = Counter(filtered_words)
    most_common_words = [word for word, _ in common_words_counter.most_common(top_n)]
    vocabulary_size = len(set(all_words))

    return number_of_documents, average_word_count, average_word_length, most_common_words, vocabulary_size


def _split_words(text):
    """Split ``text`` on whitespace; a non-string cell (e.g. NaN) yields no words."""
    return text.split() if isinstance(text, str) else []


def _mean_word_length(words):
    """Return the mean character length of ``words``, or NaN for an empty list."""
    return sum(map(len, words)) / len(words) if words else float("nan")

In [11]:
# Pair every corpus with its display label, compute its statistics once and
# print a per-dataset report; the raw tuples stay available in `results`.
datasets = [twenty_ng_df, trump_df, un_df]
datasets_names = ["20 News Groups", "Trump tweets", "United Nations"]
results = {}
for dataset_name, dataset in zip(datasets_names, datasets):
    # Store the statistics tuple under its label, then unpack it for the report.
    results[dataset_name] = calculate_text_statistics(dataset, "text")
    n_documents, avg_wc, avg_wl, most_common_words, vocab_size = results[dataset_name]
    print(f"""Statistics for {dataset_name} dataset: \n
          Number of documents: {n_documents} \n
          Average word count: {avg_wc} \n
          Average word length {avg_wl} \n
          Most common words: {most_common_words} \n
          Vocabulary size: {vocab_size} \n
          -----------------------------------------""")

Statistics for 20 News Groups dataset: 

          Number of documents: 16309 

          Average word count: 47.752713225826234 

          Average word length 5.563211579887476 

          Most common words: ['make', 'people', 'time', 'good', 'work', 'year', 'system', 'file', 'find', 'give'] 

          Vocabulary size: 1603 

          -----------------------------------------
Statistics for Trump tweets dataset: 

          Number of documents: 43754 

          Average word count: 8.072519083969466 

          Average word length 5.795128008019047 

          Most common words: ['great', 'trump', 'thank', 'people', 'would', 'get', 'new', 'president', 'like', 'big'] 

          Vocabulary size: 20182 

          -----------------------------------------
Statistics for United Nations dataset: 

          Number of documents: 50422 

          Average word count: 34.195827218277735 

          Average word length 7.207733668551151 

          Most common words: ['united', 'internatio