#### Exercises: Day 20

In [None]:
# Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'

import requests
from collections import Counter
from bs4 import BeautifulSoup

def get_text_from_url(url, timeout=10):
    """Download `url` and return its textual content.

    The response body is passed through BeautifulSoup's ``html.parser`` and
    the extracted text is returned.

    Args:
        url: Address of the document to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The extracted text, or ``None`` when the request fails (the error is
        printed rather than raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from URL: {e}")
        # Explicit so callers can rely on a falsy result for failures.
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

def find_most_frequent_words(text, num_words=10):
    """Return the most common words in `text`.

    The text is split on whitespace; each token has leading and trailing
    punctuation stripped and is lower-cased before counting.

    Args:
        text: The text to analyse.
        num_words: How many of the most common words to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, with at most
        `num_words` entries.
    """
    punctuation = '.,?!:;()[]{}"\''
    cleaned_words = (word.strip(punctuation).lower() for word in text.split())

    # A token made only of punctuation (e.g. "...") strips down to '' and
    # would otherwise be counted as if it were a word.
    word_counts = Counter(word for word in cleaned_words if word)

    return word_counts.most_common(num_words)

# usage: download the play and print its most frequent words.
romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'
romeo_and_juliet_text = get_text_from_url(romeo_and_juliet_url)

# get_text_from_url returns a falsy value when the download fails.
if romeo_and_juliet_text:
    # Single source of truth for both the query size and the heading.
    top_n = 10
    most_frequent_words = find_most_frequent_words(romeo_and_juliet_text, num_words=top_n)

    print(f"Top {top_n} Most Frequent Words:")
    for word, count in most_frequent_words:
        print(f"{word}: {count} occurrences")


In [2]:
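# Read the cats API (cats_api = 'https://api.thecatapi.com/v1/breeds') and find weight and lifespan statistics (min, max, mean, median, standard deviation) and a frequency table of country and breed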

import requests
import numpy as np
import pandas as pd

# URL of the cat breeds endpoint; passed to get_cats_data() in the usage section.
cats_api = 'https://api.thecatapi.com/v1/breeds'

def get_cats_data(api_url, timeout=10):
    """Fetch and decode the JSON payload served at `api_url`.

    Args:
        api_url: Address of the API endpoint to query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails (the
        error is printed rather than raised).
    """
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from Cats API: {e}")
        return None

def _leading_number(text):
    """Return the first whitespace-separated token of `text` as a float, or None if there is none."""
    tokens = (text or '').split()
    return float(tokens[0]) if tokens else None


def _summary_stats(values):
    """Return min/max/mean/median/std_dev of `values`; every entry is NaN when `values` is empty."""
    if not values:
        # np.min/np.max raise on empty input; report NaN uniformly instead.
        nan = float('nan')
        return {'min': nan, 'max': nan, 'mean': nan, 'median': nan, 'std_dev': nan}
    return {
        'min': np.min(values),
        'max': np.max(values),
        'mean': np.mean(values),
        'median': np.median(values),
        'std_dev': np.std(values),
    }


def analyze_weight_lifespan(data):
    """Summarise the weight and life span of the given breed records.

    Each record is read via ``record['weight']['metric']`` and
    ``record['life_span']``. Only the first whitespace-separated token of
    each string is used, so a range such as ``"3 - 5"`` contributes its
    lower bound (3.0). Records whose value is missing or empty are skipped.

    Args:
        data: Iterable of dict-like breed records.

    Returns:
        A ``(weight_stats, lifespan_stats)`` tuple of dicts with the keys
        ``min``, ``max``, ``mean``, ``median`` and ``std_dev``.

    Raises:
        ValueError: If the first token of a present value is not numeric.
    """
    weights = [_leading_number((cat.get('weight') or {}).get('metric')) for cat in data]
    lifespans = [_leading_number(cat.get('life_span')) for cat in data]

    # Compare against None rather than truthiness so a genuine 0.0 is kept.
    weights_numeric = [weight for weight in weights if weight is not None]
    lifespans_numeric = [lifespan for lifespan in lifespans if lifespan is not None]

    return _summary_stats(weights_numeric), _summary_stats(lifespans_numeric)

def create_frequency_table(data):
    """Count how often each (origin, name) pair occurs in `data`.

    Args:
        data: Records accepted by ``pd.DataFrame`` that provide ``origin``
            and ``name`` columns.

    Returns:
        A DataFrame with the columns ``origin``, ``name`` and ``count``.
    """
    records = pd.DataFrame(data)
    group_sizes = records.groupby(['origin', 'name']).size()
    # Turn the grouped index back into ordinary columns next to the counts.
    return group_sizes.reset_index(name='count')

# usage: fetch the breed records, then report statistics and a frequency table.
cats_data = get_cats_data(cats_api)

# get_cats_data returns None when the request fails.
if cats_data:
    weight_stats, lifespan_stats = analyze_weight_lifespan(cats_data)
    print("Weight Statistics:", weight_stats, sep="\n")
    print("\nLifespan Statistics:", lifespan_stats, sep="\n")

    # Keep only the two fields the frequency table groups on, substituting
    # 'Unknown' where a record lacks one of them.
    frequency_table_data = [
        {
            'origin': cat.get('origin', 'Unknown'),
            'name': cat.get('name', 'Unknown'),
        }
        for cat in cats_data
    ]
    frequency_table = create_frequency_table(frequency_table_data)

    print("\nFrequency Table of Country and Breed of Cats:", frequency_table, sep="\n")


Weight Statistics:
{'min': 2.0, 'max': 5.0, 'mean': 3.2238805970149254, 'median': 3.0, 'std_dev': 0.8779367862598653}

Lifespan Statistics:
{'min': 8.0, 'max': 18.0, 'mean': 12.074626865671641, 'median': 12.0, 'std_dev': 1.814645500809068}

Frequency Table of Country and Breed of Cats:
           origin              name  count
0       Australia   Australian Mist      1
1           Burma           Burmese      1
2           Burma  European Burmese      1
3          Canada            Cymric      1
4          Canada            Sphynx      1
..            ...               ...    ...
62  United States          Savannah      1
63  United States       Selkirk Rex      1
64  United States          Snowshoe      1
65  United States            Toyger      1
66  United States    York Chocolate      1

[67 rows x 3 columns]


In [None]:
# Read the countries API and find the 10 largest countries, the 10 most spoken languages, and the total number of languages

import requests

# URL passed to get_countries_data() in the usage section.
# NOTE(review): this endpoint may now require a `fields=` query parameter
# and reject requests that omit it — confirm against the current API docs.
countries_api = 'https://restcountries.com/v3.1/all'

def get_countries_data(api_url, timeout=10):
    """Fetch and decode the JSON payload served at `api_url`.

    Args:
        api_url: Address of the API endpoint to query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails (the
        error is printed rather than raised).
    """
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from Countries API: {e}")
        return None

def _country_area(country):
    """Return the numeric area of a country record, or -inf when it has none."""
    area = country.get('area')
    # Also accept the nested {'total': ...} shape the original code expected.
    if isinstance(area, dict):
        area = area.get('total')
    if isinstance(area, (int, float)):
        return area
    return float('-inf')


def find_largest_countries(data, num_countries=10):
    """Return the `num_countries` records with the largest area.

    The REST Countries v3.1 payload stores ``area`` as a plain number, so
    indexing it with ``['total']`` raises ``TypeError``. The area is read
    from either shape, and records without a usable area sort last instead
    of raising.

    Args:
        data: Iterable of country records (dicts).
        num_countries: Maximum number of records to return.

    Returns:
        A list of country records ordered by area, largest first.
    """
    return sorted(data, key=_country_area, reverse=True)[:num_countries]

def find_most_spoken_languages(data, num_languages=10):
    """Return the languages listed by the most country records.

    Each record's ``languages`` mapping contributes its values (the language
    names); records without that key contribute nothing.

    Args:
        data: Iterable of country records (dicts).
        num_languages: Maximum number of languages to return.

    Returns:
        A dict mapping language name to the number of records listing it,
        ordered from most to least frequent. Ties keep the order in which
        the languages were first encountered.
    """
    # Imported here so the function does not depend on another cell's imports.
    from collections import Counter

    # One counting pass instead of a list.count() call per distinct language.
    language_counts = Counter(
        language
        for country in data
        for language in country.get('languages', {}).values()
    )
    return dict(language_counts.most_common(num_languages))

def total_number_of_languages(data):
    """Count the distinct language names found across all country records.

    Args:
        data: Iterable of country records (dicts); a record without a
            ``languages`` mapping contributes nothing.

    Returns:
        The number of unique language names.
    """
    distinct_languages = set()
    for country in data:
        distinct_languages.update(country.get('languages', {}).values())
    return len(distinct_languages)

# usage: fetch the country records, then report sizes and languages.
countries_data = get_countries_data(countries_api)

# get_countries_data returns None when the request fails.
if countries_data:
    # Find the 10 largest countries
    largest_countries = find_largest_countries(countries_data)
    print("\n10 Largest Countries:")
    for country in largest_countries:
        # REST Countries v3.1 reports `area` as a plain number, so indexing
        # it with ['total'] raises TypeError; accept either shape here.
        area = country.get('area')
        if isinstance(area, dict):
            area = area.get('total')
        print(f"{country['name']['common']}: {area} square kilometers")

    # Find the 10 most spoken languages
    most_spoken_languages = find_most_spoken_languages(countries_data)
    print("\n10 Most Spoken Languages:")
    for language, count in most_spoken_languages.items():
        print(f"{language}: {count} countries")

    # Find the total number of languages
    total_languages = total_number_of_languages(countries_data)
    print(f"\nTotal Number of Languages: {total_languages}")
