In [1]:
import re
from collections import Counter

def analyze_readme_data(df):
    """Fetch each repository's README and summarize word usage and length.

    For every entry in ``df['Repository']`` the README is requested from the
    GitHub API. Successful responses are base64-decoded, split into lowercase
    words, and measured; repositories whose request does not return HTTP 200
    are skipped.

    Prints the ten most common words across all fetched READMEs and the
    average README length (in characters) per language, then returns both.

    Relies on names defined outside this function: ``requests``, ``headers``,
    ``languages`` and ``get_repo_language``.

    Args:
        df: Object indexable by ``'Repository'`` that yields repository
            identifiers (presumably "owner/name" strings, since they are
            interpolated into the GitHub API URL -- confirm against caller).

    Returns:
        tuple: ``(common_words, average_length_by_language)`` where
        ``common_words`` is a list of up to ten ``(word, count)`` pairs and
        ``average_length_by_language`` maps each language to the mean README
        length. Both are empty when no README could be fetched.
    """
    # Imported once per call rather than on every loop iteration.
    import base64

    # Repositories whose README contains each tracked word (keys come from `languages`).
    # NOTE(review): this mapping is populated but never returned or printed.
    word_categories = {word: [] for word in languages}

    # README lengths grouped by the repository's language.
    length_by_language = {}

    # Word counts aggregated over ALL fetched READMEs. Counting only the
    # loop's final `words` list would reflect just the last repository and
    # would raise NameError when no request succeeded.
    word_counts = Counter()

    for repo in df['Repository']:
        url = f"https://api.github.com/repos/{repo}/readme"

        # Send a GET request to the URL with authentication
        response = requests.get(url, headers=headers)

        # Skip repositories whose README could not be retrieved.
        if response.status_code != 200:
            continue

        readme_content = response.json()["content"]

        # The API delivers the file base64-encoded. Undecodable bytes are
        # replaced so one non-UTF-8 README does not abort the whole analysis.
        readme_text = base64.b64decode(readme_content).decode("utf-8", errors="replace")

        # Extract lowercase words and fold them into the global tally.
        words = re.findall(r"\b\w+\b", readme_text.lower())
        word_counts.update(words)

        # Record this README's length under the repository's language.
        language = get_repo_language(repo)
        length_by_language.setdefault(language, []).append(len(readme_text))

        # Categorize words by programming language
        for word in words:
            if word in word_categories:
                word_categories[word].append(repo)

    # Find the most common words across every README that was fetched.
    common_words = word_counts.most_common(10)

    # Display the most common words
    print("Most common words in READMEs:")
    for word, count in common_words:
        print(f"{word}: {count}")

    # Every list in length_by_language has at least one entry, so the
    # division below cannot hit a zero length.
    average_length_by_language = {
        language: sum(lengths) / len(lengths)
        for language, lengths in length_by_language.items()
    }

    # Display the average length of READMEs by programming language
    print("Average README Length by Language:")
    for language, length in average_length_by_language.items():
        print(f"{language}: {length}")

    # Return the common_words list and average_length_by_language dictionary
    return common_words, average_length_by_language

In [None]:
def plot_common_words(common_words):
    """Show a horizontal bar chart of word frequencies.

    Args:
        common_words: Sequence of ``(word, count)`` pairs. Each word becomes
            a bar label and each count the corresponding bar length.
    """
    # Split the pairs into the parallel label / value sequences barh expects.
    labels = [label for label, _ in common_words]
    frequencies = [frequency for _, frequency in common_words]

    plt.barh(labels, frequencies)

    # Annotate the chart so it can be read on its own.
    plt.title('Most Common Words in READMEs')
    plt.xlabel('Frequency')
    plt.ylabel('Words')

    plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_average_length_by_language(languages, lengths):
    """Show a line chart of average README length for each language.

    Entries whose language is ``None`` are left out of the plot, together
    with the length at the same position.

    Args:
        languages: Language labels, positionally aligned with ``lengths``.
        lengths: Average README length for each entry in ``languages``.
    """
    def has_language(name):
        return name is not None

    named_languages = list(filter(has_language, languages))
    # Pair by position so each length is dropped along with its None language.
    matching_lengths = [
        value for name, value in zip(languages, lengths) if has_language(name)
    ]

    plt.plot(named_languages, matching_lengths, marker='o')

    plt.title('Average README Length by Language')
    plt.xlabel('Programming Language')
    plt.ylabel('Average README Length')
    # Vertical tick labels keep long language names from overlapping.
    plt.xticks(rotation=90)

    plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_average_length_by_language_barh(languages, lengths):
    """Show a horizontal bar chart of average README length for each language.

    Entries whose language is ``None`` are left out of the chart, together
    with the length at the same position.

    Args:
        languages: Language labels, positionally aligned with ``lengths``.
        lengths: Average README length for each entry in ``languages``.
    """
    # Keep only the labels that are actually set.
    kept_languages = []
    for name in languages:
        if name is not None:
            kept_languages.append(name)

    # Keep the lengths whose positional partner in `languages` is set.
    kept_lengths = []
    for name, value in zip(languages, lengths):
        if name is not None:
            kept_lengths.append(value)

    plt.barh(kept_languages, kept_lengths)

    plt.title('Average README Length by Language')
    plt.xlabel('Average README Length')
    plt.ylabel('Programming Language')

    plt.show()