In [1]:
#day19

In [11]:
# Day 19: 30 Days of Python - File Handling and Text Analysis

import json
import re
from collections import Counter
from difflib import SequenceMatcher

# -----------------------------
#  Exercise Level 1
# -----------------------------

# Count number of lines and words in a text file
def count_lines_and_words(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
        lines = text.splitlines()
        words = re.findall(r'\b\w+\b', text)
        return len(lines), len(words)


files = [
    'obama_speech.txt',
    'michelle_obama_speech.txt',
    'donald_speech.txt',
    'melina_trump_speech.txt'
]

for file in files:
    lines, words = count_lines_and_words(file)
    print(f"{file}: {lines} lines, {words} words")


# -----------------------------
# Most spoken languages
# -----------------------------

def most_spoken_languages(filename, top_n):
    with open(filename, 'r', encoding='utf-8') as f:
        countries = json.load(f)
    languages = []
    for country in countries:
        languages.extend(country['languages'])
    lang_counts = Counter(languages)
    return lang_counts.most_common(top_n)


print("\nMost spoken languages (top 10):")
print(most_spoken_languages('countries_data.json', 10))

print("\nMost spoken languages (top 3):")
print(most_spoken_languages('countries_data.json', 3))


# -----------------------------
# Most populated countries
# -----------------------------

def most_populated_countries(filename, top_n):
    with open(filename, 'r', encoding='utf-8') as f:
        countries = json.load(f)
    sorted_countries = sorted(countries, key=lambda x: x['population'], reverse=True)
    top_countries = [{'country': c['name'], 'population': c['population']} for c in sorted_countries[:top_n]]
    return top_countries


print("\nMost populated countries (top 10):")
print(most_populated_countries('countries_data.json', 10))

print("\nMost populated countries (top 3):")
print(most_populated_countries('countries_data.json', 3))


obama_speech.txt: 66 lines, 2401 words
michelle_obama_speech.txt: 83 lines, 2223 words
donald_speech.txt: 48 lines, 1266 words
melina_trump_speech.txt: 33 lines, 1377 words

Most spoken languages (top 10):
[('English', 91), ('French', 45), ('Arabic', 25), ('Spanish', 24), ('Portuguese', 9), ('Russian', 9), ('Dutch', 8), ('German', 7), ('Chinese', 5), ('Serbian', 4)]

Most spoken languages (top 3):
[('English', 91), ('French', 45), ('Arabic', 25)]

Most populated countries (top 10):
[{'country': 'China', 'population': 1377422166}, {'country': 'India', 'population': 1295210000}, {'country': 'United States of America', 'population': 323947000}, {'country': 'Indonesia', 'population': 258705000}, {'country': 'Brazil', 'population': 206135893}, {'country': 'Pakistan', 'population': 194125062}, {'country': 'Nigeria', 'population': 186988000}, {'country': 'Bangladesh', 'population': 161006790}, {'country': 'Russian Federation', 'population': 146599183}, {'country': 'Japan', 'population': 12696

In [17]:
# -----------------------------
# Extract all email addresses
# -----------------------------

def extract_emails(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()
    return re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text)


emails = extract_emails('email_exchanges_big.txt')
print("\nExtracted emails:", emails[:10], '...')  # show first 10
print("Total emails found:", len(emails))



Extracted emails: ['stephen.marquard@uct.ac.za', 'postmaster@collab.sakaiproject.org', '200801051412.m05ECIaH010327@nakamura.uits.iupui.edu', 'source@collab.sakaiproject.org', 'source@collab.sakaiproject.org', 'source@collab.sakaiproject.org', 'source@collab.sakaiproject.org', 'stephen.marquard@uct.ac.za', 'source@collab.sakaiproject.org', 'stephen.marquard@uct.ac.za'] ...
Total emails found: 20140


In [23]:
# -----------------------------
# Most frequent words in speeches
# -----------------------------

def find_most_common_words(filename, n):
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read().lower()
        words = re.findall(r'\b\w+\b', text)
        return Counter(words).most_common(n)


def most_frequent_words(filename, n=10):
    return find_most_common_words(filename, n)

print("\nObama’s top 10 words:", most_frequent_words('obama_speech.txt'))
print("Michelle’s top 10 words:", most_frequent_words('michelle_obama_speech.txt'))
print("Trump’s top 10 words:", most_frequent_words('donald_speech.txt'))
print("Melina’s top 10 words:", most_frequent_words('melina_trump_speech.txt'))



Obama’s top 10 words: [('the', 129), ('and', 113), ('of', 81), ('to', 70), ('our', 67), ('we', 62), ('that', 50), ('a', 48), ('is', 36), ('in', 25)]
Michelle’s top 10 words: [('and', 96), ('the', 85), ('to', 84), ('that', 50), ('of', 46), ('a', 41), ('he', 37), ('in', 36), ('my', 28), ('i', 28)]
Trump’s top 10 words: [('the', 65), ('and', 59), ('we', 44), ('will', 40), ('of', 38), ('to', 32), ('our', 30), ('is', 20), ('america', 17), ('for', 13)]
Melina’s top 10 words: [('and', 77), ('to', 55), ('the', 52), ('is', 29), ('i', 28), ('for', 27), ('of', 25), ('that', 24), ('a', 22), ('you', 21)]


In [25]:
# -----------------------------
# Text similarity
# -----------------------------

def clean_text(text):
    return re.sub(r'[^A-Za-z\s]', '', text).lower()

def check_text_similarity(file1, file2):
    with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
        text1, text2 = clean_text(f1.read()), clean_text(f2.read())
    similarity = SequenceMatcher(None, text1, text2).ratio()
    return round(similarity * 100, 2)


similarity_score = check_text_similarity('michelle_obama_speech.txt', 'melina_trump_speech.txt')
print(f"\nText similarity between Michelle and Melina speeches: {similarity_score}%")



Text similarity between Michelle and Melina speeches: 0.72%


In [29]:
# -----------------------------
# Top repeated words in Romeo and Juliet
# -----------------------------

print("\nTop 10 words in Romeo and Juliet:")
print(find_most_common_words('romeo_and_juliet.txt', 10))



Top 10 words in Romeo and Juliet:
[('the', 868), ('and', 800), ('to', 661), ('i', 658), ('of', 535), ('a', 530), ('is', 381), ('in', 378), ('that', 371), ('you', 367)]


In [31]:
# -----------------------------
# Hacker News CSV word frequency
# -----------------------------

def count_keyword_occurrences(filename, keyword):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return sum(1 for line in lines if re.search(keyword, line, re.IGNORECASE))

python_count = count_keyword_occurrences('hacker_news.csv', r'\bpython\b')
js_count = count_keyword_occurrences('hacker_news.csv', r'\bjavascript\b')
java_count = count_keyword_occurrences('hacker_news.csv', r'\bjava\b(?!script)')

print("\nLines containing 'Python':", python_count)
print("Lines containing 'JavaScript':", js_count)
print("Lines containing 'Java' but not 'JavaScript':", java_count)



Lines containing 'Python': 162
Lines containing 'JavaScript': 183
Lines containing 'Java' but not 'JavaScript': 59
