In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import codecs
from tabulate import tabulate

# Task 1: Data exploration 

In [50]:
# Function to read a UTF-8 text file as a list of lines
def read_text_file(file_path):
    """Read a UTF-8 text file and return its lines without line terminators.

    Lines are split on the line-feed character only. Other Unicode line
    boundaries (for example U+2028 or U+0085) stay inside the line they occur
    in, so the number of returned items equals the number of physical lines in
    the file. That matters when two files are paired up line by line.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: One string per line, with any trailing carriage-return /
        line-feed characters removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # newline='\n' turns off universal-newline translation: only a line feed
    # ends a line, and the terminator is handed back untouched so it can be
    # stripped explicitly below.
    with open(file_path, 'r', encoding='utf-8', newline='\n') as file:
        # Strip the terminator so character counts measure the sentence only.
        return [line.rstrip('\r\n') for line in file]

# Function to calculate most frequent word
def most_frequent_word(text):
    """Return the whitespace-delimited word that occurs most often in ``text``.

    Words are compared exactly as they appear (case and punctuation are kept).

    Args:
        text: The string to analyse.

    Returns:
        The most frequent word, or ``None`` when ``text`` contains no words
        (empty or whitespace-only input).
    """
    word_freq = Counter(text.split())
    # most_common(1) is an empty list for an empty counter; guard before indexing.
    if not word_freq:
        return None
    return word_freq.most_common(1)[0][0]

# Function to calculate unique words count
def count_unique_words(text):
    """Return how many distinct whitespace-separated words ``text`` contains.

    Words are compared exactly as written, so case and punctuation matter.
    """
    distinct_tokens = {token for token in text.split()}
    return len(distinct_tokens)

# Function to calculate numeral frequencies
def count_numerals(text):
    """Tally every digit character in ``text``.

    A character is counted when ``str.isdigit()`` is true for it.

    Returns:
        collections.Counter mapping each digit character to its number of
        occurrences.
    """
    digit_tally = Counter()
    for symbol in text:
        if symbol.isdigit():
            digit_tally[symbol] += 1
    return digit_tally

# Input locations, one file per language
en_file_path = 'de-en/europarl-v7.de-en.en'  # English language file
de_file_path = 'de-en/europarl-v7.de-en.de'  # German language file

# Load both files (English first, then German)
en_sentences, de_sentences = map(read_text_file, (en_file_path, de_file_path))

# One row per line, English in column 'en' and German in column 'de'
df = pd.DataFrame(dict(en=en_sentences, de=de_sentences))

In [51]:
# Basic statistics: row count and per-row character lengths for each language
num_sentences = df.shape[0]
en_lengths, de_lengths = (df[column].str.len() for column in ('en', 'de'))
# Positive values mean the English line is longer than the German one
length_diff = en_lengths.sub(de_lengths)

# Calculate total number of words, unique words, and average word length
def calculate_word_stats(text):
    """Compute word-level statistics for ``text``.

    Words are the whitespace-separated tokens of ``text``.

    Args:
        text: The string to analyse.

    Returns:
        tuple: ``(num_words, unique_words, avg_word_length)``, where
        ``avg_word_length`` is the mean number of characters per word.
        Text without any words yields ``(0, 0, 0.0)``.
    """
    words = text.split()
    num_words = len(words)
    # Guard against dividing by zero on empty or whitespace-only text.
    if num_words == 0:
        return 0, 0, 0.0
    # Same result as count_unique_words(text), but reuses the tokens already
    # split above instead of splitting the (potentially huge) text again.
    unique_words = len(set(words))
    avg_word_length = sum(map(len, words)) / num_words
    return num_words, unique_words, avg_word_length

# Join each language's sentences into one string a single time and reuse it;
# rebuilding the corpus-sized string for every statistic is needlessly slow
# and memory-hungry.
en_text = ' '.join(df['en'])
de_text = ' '.join(df['de'])

en_num_words, unique_en_words, awl_en = calculate_word_stats(en_text)
de_num_words, unique_de_words, awl_de = calculate_word_stats(de_text)

# Calculate most frequent words
most_freq_word_en = most_frequent_word(en_text)
most_freq_word_de = most_frequent_word(de_text)

# Calculate numeral frequencies
numeral_freq_en = count_numerals(en_text)
numeral_freq_de = count_numerals(de_text)

# Summary statistics: one [label, value] row per statistic
summary_stats = [
    ['Number of sentences', num_sentences],
    ['Total words (English)', en_num_words],
    ['Total words (German)', de_num_words],
    ['Unique words (English)', unique_en_words],
    ['Unique words (German)', unique_de_words],
    ['Average word length (English)', awl_en],
    ['Average word length (German)', awl_de],
    ['Average sentence length (English)', en_lengths.mean()],
    ['Average sentence length (German)', de_lengths.mean()],
    ['Average sentence length difference (English - German)', length_diff.mean()],
    ['Most frequent word (English)', most_freq_word_en],
    ['Most frequent word (German)', most_freq_word_de],
    ['Numerals frequency (English)', numeral_freq_en],
    ['Numerals frequency (German)', numeral_freq_de]
]

# Print summary statistics
print(tabulate(summary_stats, headers=['Statistic', 'Value'], tablefmt='pretty'))

+-------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                       Statistic                       |                                                                                  Value                                                                                  |
+-------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                  Number of sentences                  |                                                                                 1920209                                                                                 |
|                 Total words (English)                 |                               

In [53]:
# Per-sentence character counts
df['en_sentence_length'] = df['en'].str.len()
df['de_sentence_length'] = df['de'].str.len()

# Tokenise once per language and reuse the tokens for every word-level feature
en_tokens = df['en'].str.split()
de_tokens = df['de'].str.split()

df['en_num_words'] = en_tokens.apply(len)
df['de_num_words'] = de_tokens.apply(len)

# Characters that belong to words only (no spaces or line terminators), so the
# per-sentence average matches the definition used by calculate_word_stats.
en_word_chars = en_tokens.apply(lambda tokens: sum(map(len, tokens)))
de_word_chars = de_tokens.apply(lambda tokens: sum(map(len, tokens)))

# Rows without any words have no average word length: store NaN rather than
# the inf/NaN a division by zero would produce.
df['en_avg_word_length'] = (en_word_chars / df['en_num_words']).where(df['en_num_words'] > 0)
df['de_avg_word_length'] = (de_word_chars / df['de_num_words']).where(df['de_num_words'] > 0)

# Plotting with Seaborn on an explicit Axes
fig, ax = plt.subplots(figsize=(4, 4))

# Distribution of Average Word Length (rows without words are left out)
sns.histplot(df['en_avg_word_length'].dropna(), kde=True, color='blue', label='English', ax=ax)
sns.histplot(df['de_avg_word_length'].dropna(), kde=True, color='green', label='German', ax=ax)
ax.set_title('Distribution of Average Word Length')
ax.set_xlabel('Average Word Length (Characters)')
ax.legend()

# Show plot
fig.tight_layout()
plt.show()


# Task 2: Pre-processing