Plots the 10 words with the highest average TF-IDF score (bar chart) and a word cloud of all scored words

In [None]:
import os
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import nltk

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Dutch stop words, de-duplicated as a set; the list copy is the form that is
# later combined with manual stop words and handed to TfidfVectorizer.
stop_words_dutch = {*stopwords.words('dutch')}
stop_words_dutch_list = [*stop_words_dutch]

def get_top_tfidf_words(tfidf_scores, feature_names, top_n=10):
    """Return the ``top_n`` highest-scoring words as ``(word, score)`` pairs.

    Args:
        tfidf_scores: Array of scores (must support ``argsort``), one per feature.
        feature_names: Sequence of words, indexed in parallel with the scores.
        top_n: Maximum number of pairs to return.

    Returns:
        A list of ``(word, score)`` tuples ordered from highest to lowest score.
    """
    # argsort is ascending, so reverse it to walk from the best score down.
    ranked_positions = tfidf_scores.argsort()[::-1]

    best_pairs = []
    for position in ranked_positions[:top_n]:
        best_pairs.append((feature_names[position], tfidf_scores[position]))
    return best_pairs

def plot_top_tfidf_words(top_words):
    """Draw, save and show a bar chart of ``(word, score)`` pairs.

    Args:
        top_words: Non-empty sequence of ``(word, score)`` pairs.

    The chart is written to ``Top_words_TF-IDF.png`` in the current working
    directory before it is displayed.
    """
    # Transpose the pairs into one tuple of words and one tuple of scores.
    columns = list(zip(*top_words))
    words, scores = columns

    _, axis = plt.subplots(figsize=(10, 8))
    axis.bar(words, scores, color='blue')
    axis.set_xlabel('Words')
    axis.set_ylabel('TF-IDF Score')
    axis.set_title('Top Words in Text Files by TF-IDF Score')
    # Tilt the labels so longer words do not overlap.
    plt.xticks(rotation=45)

    plt.savefig('Top_words_TF-IDF.png')
    plt.show()

def create_wordcloud(tfidf_scores, feature_names):
    """Render, save and show a word cloud weighted by TF-IDF score.

    Args:
        tfidf_scores: Scores indexed in parallel with ``feature_names``.
        feature_names: Words to place in the cloud.

    The image is written to ``Word_Cloud.png`` in the current working
    directory before it is displayed.
    """
    # Word -> score mapping, the input format of generate_from_frequencies.
    frequencies = {}
    for position, word in enumerate(feature_names):
        frequencies[word] = tfidf_scores[position]

    cloud = WordCloud(width=800, height=400, background_color='white')
    rendered = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 8))
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Top Words by TF-IDF Score')
    plt.savefig('Word_Cloud.png')
    plt.show()

def analyze_text_files_with_tfidf(folders, top_n=10, manual_stopwords=None):
    """Plot the highest-scoring TF-IDF words of the ``.txt`` files in *folders*.

    Every ``.txt`` file located directly inside each folder (sub-folders are
    not searched) is read as UTF-8 and treated as one document. TF-IDF scores
    are averaged over all documents; the ``top_n`` best words are drawn as a
    bar chart and every scored word feeds a word cloud.

    Folders that cannot be listed and files that cannot be read or decoded are
    reported on stdout and skipped. If no document could be read at all, a
    message is printed and nothing is plotted.

    Args:
        folders: Iterable of directory paths to read.
        top_n: Number of words shown in the bar chart.
        manual_stopwords: Optional iterable of extra words to ignore in
            addition to the NLTK Dutch stop words.

    Returns:
        None.
    """
    # Work on a copy so the module-level list is never mutated; extend() also
    # lets callers pass a tuple or set rather than only a list.
    combined_stopwords = list(stop_words_dutch_list)
    if manual_stopwords is not None:
        combined_stopwords.extend(manual_stopwords)

    documents = []
    for folder in folders:
        try:
            filenames = os.listdir(folder)
        except OSError as e:
            # One missing/unreadable folder should not abort the whole run.
            print(f"Error reading folder {folder}: {e}")
            continue

        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            input_path = os.path.join(folder, filename)
            try:
                with open(input_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error processing file {filename}: {e}")
            else:
                documents.append(content)

    if not documents:
        # The vectorizer cannot be fitted without any document.
        print("No readable text files found; nothing to analyze.")
        return

    vectorizer = TfidfVectorizer(stop_words=combined_stopwords)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Column-wise mean = average score of each word over all documents,
    # flattened to a 1-D array.
    avg_tfidf_scores = tfidf_matrix.mean(axis=0).A1
    feature_names = vectorizer.get_feature_names_out()

    top_words = get_top_tfidf_words(avg_tfidf_scores, feature_names, top_n)

    plot_top_tfidf_words(top_words)
    create_wordcloud(avg_tfidf_scores, feature_names)


# Extra Dutch words to ignore on top of the NLTK stop-word list.
manual_stopwords_list = [
    'deze', 'die', 'het', 'we',
    'wij', 'onze', 'wel', 'jouw',
]

# Folders whose .txt files are analysed.
# NOTE(review): 'Part1' is capitalised unlike 'part2'..'part6' - confirm this
# matches the directory names on disk (matters on case-sensitive filesystems).
folders = [
    '../input/data_folder/Part1',
    '../input/data_folder/part2',
    '../input/data_folder/part3',
    '../input/data_folder/part4',
    '../input/data_folder/part5',
    '../input/data_folder/part6',
]

analyze_text_files_with_tfidf(
    folders=folders,
    top_n=10,
    manual_stopwords=manual_stopwords_list,
)

In [None]:
File size distribution per folder

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def get_file_sizes(folders):
    """Collect the size in bytes of every ``.txt`` file under *folders*.

    Each folder is walked recursively. Files whose size cannot be read
    (``OSError``) are skipped silently.

    Args:
        folders: Iterable of directory paths to scan.

    Returns:
        A list of ``(folder, filename, size)`` tuples, where ``folder`` is the
        top-level entry from *folders* (not the sub-directory the file was
        found in) and ``size`` is the value of ``os.path.getsize``.
    """
    collected = []

    for top_folder in folders:
        for current_dir, _subdirs, names in os.walk(top_folder):
            for name in names:
                if not name.endswith('.txt'):
                    continue
                try:
                    size_in_bytes = os.path.getsize(os.path.join(current_dir, name))
                except OSError:
                    # Best effort: an inaccessible file is simply left out.
                    continue
                collected.append((top_folder, name, size_in_bytes))

    return collected

def print_size_distribution(file_sizes):
    """Print summary statistics of the file sizes, grouped per folder.

    Args:
        file_sizes: Iterable of ``(folder, file, size)`` tuples.

    Folders are relabelled ``Folder 1``, ``Folder 2``, ... in order of first
    appearance in *file_sizes*. For each one the count, mean, median, standard
    deviation, minimum and maximum of the sizes are printed, followed by a
    blank line.
    """
    frame = pd.DataFrame(file_sizes, columns=['folder', 'file', 'size'])

    # Replace the folder paths by short, numbered display labels.
    labels = {}
    for position, path in enumerate(frame['folder'].unique(), start=1):
        labels[path] = f"Folder {position}"
    frame['folder'] = frame['folder'].map(labels)

    for label in frame['folder'].unique():
        sizes = frame.loc[frame['folder'] == label, 'size']
        statistics = (
            ('Count', sizes.count),
            ('Mean', sizes.mean),
            ('Median', sizes.median),
            ('Std', sizes.std),
            ('Min', sizes.min),
            ('Max', sizes.max),
        )
        print(f"Folder: {label}")
        for title, compute in statistics:
            print(f"  {title}: {compute()}")
        print()

def main():
    """Print and plot the size distribution of the ``.txt`` files per folder.

    Collects the file sizes with ``get_file_sizes``, prints per-folder summary
    statistics (in bytes) with ``print_size_distribution``, then draws a
    horizontal box plot of the sizes in megabytes, saves it to
    ``File_distribution.png`` and shows it. Returns early with a message when
    no file size could be collected.
    """
    folders = [
        '../input/data_folder/Part1',
        '../input/data_folder/part2',
        '../input/data_folder/part3',
        '../input/data_folder/part4',
        '../input/data_folder/part5',
        '../input/data_folder/part6'
    ]

    file_sizes = get_file_sizes(folders)

    if not file_sizes:
        print("No text files found or could not access files.")
        return

    # Print the distribution of file sizes
    print_size_distribution(file_sizes)

    # Create a DataFrame
    df = pd.DataFrame(file_sizes, columns=['folder', 'file', 'size'])

    # Map folder paths to numbered display names (numbered by position in `folders`)
    folder_mapping = {folder: f"Folder {i+1}" for i, folder in enumerate(folders)}
    df['folder'] = df['folder'].map(folder_mapping)

    # The collected sizes come from os.path.getsize, i.e. bytes, while the axis
    # is labelled in megabytes: convert so the plotted values match the label.
    bytes_per_megabyte = 1024 * 1024
    df['size_mb'] = df['size'] / bytes_per_megabyte

    sns.set(style="whitegrid")
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df, y='folder', x='size_mb', orient='h')

    # Customize the plot
    plt.ylabel("Folder")
    plt.xlabel("File Size (mega bytes)")
    plt.title('Distribution of Text File Sizes in Folders')
    plt.savefig('File_distribution.png')

    plt.show()


if __name__ == "__main__":
    main()


Language detection

In [None]:
import os
from langdetect import detect, DetectorFactory, LangDetectException
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Fixed seed so that langdetect gives the same answer on repeated runs.
DetectorFactory.seed = 0


def detect_language(file_path):
    """Return the language code langdetect assigns to the text in *file_path*.

    Args:
        file_path: Path of a text file, read as UTF-8.

    Returns:
        The value of ``detect(text)``; ``'unknown'`` when langdetect raises
        ``LangDetectException``; ``'error'`` for any other failure (for
        example when the file cannot be opened or decoded).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        return detect(contents)
    except LangDetectException:
        return 'unknown'
    except Exception:
        # Deliberately best effort: one bad file must not stop the scan.
        return 'error'

def get_languages(folders):
    """Detect the language of every ``.txt`` file under *folders*.

    Each folder is walked recursively and every matching file is passed to
    ``detect_language``.

    Args:
        folders: Iterable of directory paths to scan.

    Returns:
        A list of ``(filename, language)`` tuples in the order the files were
        visited.
    """
    detected = []

    for top_folder in folders:
        for current_dir, _subdirs, names in os.walk(top_folder):
            detected.extend(
                (name, detect_language(os.path.join(current_dir, name)))
                for name in names
                if name.endswith('.txt')
            )

    return detected

def plot_language_distribution(languages, excluded_languages=None, save_path=None):
    """Draw a horizontal bar chart of how many files were detected per language.

    Args:
        languages: Iterable of ``(file, language)`` tuples.
        excluded_languages: Optional collection of language labels to leave
            out of the chart.
        save_path: Optional path the figure is written to. The figure is saved
            *before* it is shown, because saving after ``plt.show()`` can
            write an empty image once the shown figure has been released.
            Missing parent directories are created.
    """
    df = pd.DataFrame(languages, columns=['file', 'language'])

    if excluded_languages:
        df = df[~df['language'].isin(excluded_languages)]

    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    # Bars are ordered from the most to the least frequent language.
    sns.countplot(data=df, y='language', order=df['language'].value_counts().index, palette="viridis")
    plt.xlabel("Number of Files")
    plt.ylabel("Language")
    plt.title("Distribution of Languages in Text Files")

    if save_path is not None:
        output_dir = os.path.dirname(save_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        plt.savefig(save_path)

    plt.show()


def main():
    """Detect the language of all text files and plot/save the distribution."""
    folders = [
        '../input/data_folder/Part1',
        '../input/data_folder/part2',
        '../input/data_folder/part3',
        '../input/data_folder/part4',
        '../input/data_folder/part5',
        '../input/data_folder/part6'
    ]

    languages = get_languages(folders)
    excluded_languages = ['ro','et','pl','es','hi','pt','sk','it','ca','sq','cy']  # Add languages you want to exclude here

    # The figure has to be saved inside the plotting function, before it is
    # shown; saving here afterwards would no longer see the drawn chart.
    plot_language_distribution(
        languages,
        excluded_languages,
        save_path='../img/Language_distribution.png',
    )


if __name__ == "__main__":
    main()
