<a href="https://colab.research.google.com/github/edurelated2021/tds-proj1/blob/main/code/get_user_repo_details.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
GitHub User and Repository details fetcher

This script queries the GitHub GraphQL API to retrieve user details and their associated
repositories based on specified criteria. It filters users by location and a minimum
number of followers, collects relevant information, and saves the results into CSV files.

Key functionalities include:
- Querying user details based on city and follower count.
- Fetching repositories for each user.
- Formatting and normalizing data for output.
- Writing user and repository information to separate CSV files.
- Measuring and displaying the total execution time.

Usage:
1. Update the GitHub token in the HEADERS dictionary.
2. Run the script to generate 'users.csv' and 'repositories.csv' files with the scraped data.
"""

import csv
import json
import os
import time

import requests

# Change city and min_followers as per your requirements
CITY = "Chennai"
MIN_FOLLOWERS = 50

# Constants for GitHub GraphQL API
GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

# Prefer a token supplied through the GITHUB_TOKEN environment variable so a real
# credential never has to be written into this file. When the variable is unset
# or empty, fall back to the placeholder (replace it with your GitHub token).
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or "Your_token_here"
HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Content-Type": "application/json",
}

def get_user_details(city, min_followers, after_cursor=None, timeout=30):
    """
    Queries GitHub's GraphQL API for one page of users filtered by location and followers.

    The search string is passed as a GraphQL variable instead of being spliced into the
    query document, so characters such as quotes in `city` cannot corrupt the query.
    Locations containing whitespace are wrapped in double quotes so the search treats
    them as a single qualifier value.

    Args:
        city (str): The city to filter users by.
        min_followers (int): Follower threshold; only users with strictly more followers
            than this value match (the search uses `followers:>`).
        after_cursor (str, optional): Cursor for pagination.
        timeout (float, optional): Seconds to wait for the HTTP response before giving up.

    Returns:
        dict: Response data containing user details or an empty dict if the API
        reported errors in the response body.

    Raises:
        requests.exceptions.HTTPError: If the HTTP status indicates a failure.
        requests.exceptions.Timeout: If no response arrives within `timeout` seconds.
    """
    location = city.strip()
    # Multi-word locations must be quoted, otherwise only the first word is
    # attached to the `location:` qualifier.
    if any(ch.isspace() for ch in location) and not location.startswith('"'):
        location = f'"{location}"'
    search_query = f"location:{location} followers:>{min_followers}"

    query = """
    query($searchQuery: String!, $after: String) {
      search(query: $searchQuery, type: USER, first: 100, after: $after) {
        userCount
        pageInfo {
          endCursor
          hasNextPage
        }
        edges {
          node {
            ... on User {
              login
              name
              company
              location
              email
              isHireable
              bio
              publicRepositories: repositories {
                totalCount
              }
              followers {
                totalCount
              }
              following {
                totalCount
              }
              createdAt
            }
          }
        }
      }
    }
    """
    variables = {"searchQuery": search_query, "after": after_cursor}
    response = requests.post(
        GITHUB_GRAPHQL_URL,
        headers=HEADERS,
        json={'query': query, 'variables': variables},
        timeout=timeout,  # without a timeout a stalled connection would hang the whole run
    )
    response.raise_for_status()  # Raise an error for bad responses
    data = response.json()

    # GraphQL reports query-level failures in the body, even with HTTP 200.
    if 'errors' in data:
        print(f"GitHub API returned errors: {data['errors']}")
        return {}
    return data

def update_company_name(company):
    """
    Cleans and formats the company name.

    Surrounding whitespace is trimmed, leading @ symbols are removed, any whitespace
    that was sitting between the @ and the name is trimmed as well, and the result is
    converted to uppercase.

    Args:
        company (str): The company name to format. May be None or empty.

    Returns:
        str: The formatted company name, or an empty string when no company is given.
    """
    if not company:
        return ''
    # Second strip(): "@ google" would otherwise keep the space that followed the '@'.
    return company.strip().lstrip('@').strip().upper()

def fetch_users(city, min_followers):
    """
    Fetches user data based on city and minimum followers, following pagination cursors.

    Pages are requested one after another until the API reports no further page or a
    request comes back with errors (in which case the users collected so far are kept).

    Args:
        city (str): The city to filter users by.
        min_followers (int): The minimum number of followers a user must have.

    Returns:
        list: A list of user edges (each a dict with a 'node' entry) as returned by the API.
    """
    start_time = time.time()
    print("\n" + "*" * 70)
    print('Fetching User Details in chunks')

    all_users = []
    after_cursor = None

    while True:
        users_data = get_user_details(city, min_followers, after_cursor)

        # An empty dict means the API reported errors; they were already printed.
        if not users_data:
            break

        search_result = users_data['data']['search']
        users = search_result['edges']
        all_users.extend(users)
        # Report progress before the last-page check so the final page is counted too.
        print(f"Fetched {len(users)} users.")

        page_info = search_result['pageInfo']
        if not page_info['hasNextPage']:
            break
        after_cursor = page_info['endCursor']

        time.sleep(1)  # Throttle requests

    print(f"Completed fetching User Details in {time.time() - start_time:.2f} seconds")
    print("*" * 70 + "\n")
    print("Now fetching repository details...")

    return all_users

def process_user_info(all_users):
    """
    Turns raw user edges into CSV-ready user rows and repository rows.

    For every edge that carries a login, the user is normalized and that user's
    repositories are fetched and normalized. Edges without a login are reported
    and skipped.

    Args:
        all_users (list): User edges fetched from GitHub; each has a 'node' entry.

    Returns:
        tuple: (user rows, repository rows), both as lists of dicts.
    """
    users_out, repos_out = [], []

    for position, edge in enumerate(all_users, start=1):
        node = edge['node']
        if 'login' not in node:
            print(f"Login field missing for: {node}. Skipping this record from inclusion")
            continue

        login = node['login']

        # User row first, then the repositories belonging to that user.
        users_out.append(collect_user_data(node))

        print(f"[User {position}] Fetching repository metadata for user {login}")
        repos_out.extend(
            collect_repo_data(login, repo_edge['node'])
            for repo_edge in get_repository_details(login)
        )

        time.sleep(1)  # Throttle requests

    return users_out, repos_out

def get_repository_details(login, max_repos=500, timeout=30):
    """
    For a given user, fetches certain information related to their GitHub repositories.

    Repositories are requested page by page until `max_repos` have been collected,
    the API reports no further page, or a response contains errors.

    Args:
        login (str): The GitHub username.
        max_repos (int, optional): Upper bound on the number of repositories returned.
        timeout (float, optional): Seconds to wait for each HTTP response.

    Returns:
        list: Repository edges collected so far (at most `max_repos`). This is empty
        when the very first request reports errors, and may be partial when a later
        page does.

    Raises:
        requests.exceptions.HTTPError: If an HTTP status indicates a failure.
        requests.exceptions.Timeout: If a response does not arrive within `timeout` seconds.
    """
    if max_repos <= 0:
        return []

    query = get_repos_query()  # identical for every page, so build it once
    all_repos = []
    after_cursor = None

    while len(all_repos) < max_repos:
        variables = {"login": login, "after": after_cursor}
        response = requests.post(
            GITHUB_GRAPHQL_URL,
            headers=HEADERS,
            json={'query': query, 'variables': variables},
            timeout=timeout,  # a stalled connection must not hang the whole run
        )
        response.raise_for_status()
        data = response.json()

        if 'errors' in data:
            print(f"GitHub API returned errors for {login}: {data['errors']}")
            break

        # Guard against a null `user` object so a missing account does not raise TypeError.
        user = (data.get('data') or {}).get('user')
        if not user:
            print(f"No user data returned for {login}")
            break

        repositories = user['repositories']
        all_repos.extend(repositories['edges'])

        page_info = repositories['pageInfo']
        # Checking the cap here avoids a pointless sleep once enough repos are collected.
        if not page_info['hasNextPage'] or len(all_repos) >= max_repos:
            break
        after_cursor = page_info['endCursor']
        time.sleep(1)  # Throttle requests

    return all_repos[:max_repos]  # The last page may overshoot the cap

def get_repos_query():
    """
    Builds the GraphQL document used to page through a user's repositories.

    The document takes a required `$login` and an optional `$after` cursor, asks for
    100 repositories per page ordered by most recent push, and selects the fields
    needed for the repository rows plus the pagination info.

    Returns:
        str: The GraphQL query string.
    """
    repos_query = """
    query($login: String!, $after: String) {
      user(login: $login) {
        repositories(first: 100, after: $after, orderBy: {field: PUSHED_AT, direction: DESC}) {
          edges {
            node {
              fullName: nameWithOwner
              createdAt
              stargazerCount
              watchers {
                totalCount
              }
              primaryLanguage {
                name
              }
              hasProjectsEnabled
              hasWikiEnabled
              licenseInfo {
                key
              }
            }
          }
          pageInfo {
            endCursor
            hasNextPage
          }
        }
      }
    }
    """
    return repos_query

# Utility functions below

def print_user_info(user_details):
    """
    DEBUGGING PURPOSES ONLY.
    Prints one line with every normalized field of a user.

    Any failure while normalizing or printing is caught and reported on stdout
    instead of being raised.

    Args:
        user_details (dict): User details to be printed.
    """
    try:
        normalized = collect_user_data(user_details)
        rendered_fields = [f"{field}: {content}" for field, content in normalized.items()]
        summary = ", ".join(rendered_fields)
        print(f"User {normalized['login']}: {summary}")
    except Exception as exc:
        fallback_login = user_details.get('login', 'Unknown')
        print(f"Error printing info for user {fallback_login}: {exc}")

def collect_user_data(user_info):
    """
    Collects and performs basic data transformation on the user data.

    Missing keys and explicit nulls are both normalized: text fields become empty
    strings, counts become 0, and the hireable flag becomes 'false'.

    Args:
        user_info (dict): Raw user data.

    Returns:
        dict: A dictionary of normalized user data.
    """
    def text(key):
        # dict.get(key, '') only covers an absent key; `or ''` also covers a null value.
        return user_info.get(key) or ''

    def total(key):
        # Tolerate an absent or null connection object as well as a null count.
        return (user_info.get(key) or {}).get('totalCount') or 0

    return {
        "login": text('login'),
        "name": text('name'),
        "company": update_company_name(user_info.get('company')),
        "location": text('location'),
        "email": text('email'),
        # bool() keeps a null flag from being rendered as the string 'none'.
        "hireable": str(bool(user_info.get('isHireable'))).lower(),
        "bio": text('bio'),
        "public_repos": total('publicRepositories'),
        "followers": total('followers'),
        "following": total('following'),
        "created_at": text('createdAt'),
    }

def collect_repo_data(login, repo):
    """
    Collects and performs basic data transformation on the repository data for a given user.

    Nested objects (watchers, primaryLanguage, licenseInfo) may be absent or null;
    in either case the corresponding output falls back to 0 or an empty string
    instead of raising.

    Args:
        login (str): The login of the user.
        repo (dict): Raw repository data.

    Returns:
        dict: A dictionary of normalized repository data.
    """
    # `or {}` covers both a missing key and an explicit null from the API.
    watchers = repo.get('watchers') or {}
    language = repo.get('primaryLanguage') or {}
    license_info = repo.get('licenseInfo') or {}

    return {
        "login": login,
        "full_name": repo.get('fullName', ''),
        "created_at": repo.get('createdAt', ''),
        "stargazers_count": repo.get('stargazerCount', 0),
        "watchers_count": watchers.get('totalCount', 0),
        "language": language.get('name', ''),
        "has_projects": str(repo.get('hasProjectsEnabled', False)).lower(),
        "has_wiki": str(repo.get('hasWikiEnabled', False)).lower(),
        "license_name": license_info.get('key', ''),
    }

def write_to_csv(filename, fieldnames, rows):
    """
    Writes a header line followed by one line per row to a UTF-8 csv file.

    An existing file with the same name is overwritten.

    Args:
        filename (str): The name of the csv file.
        fieldnames (list): The headers for the csv file.
        rows (list): The data to write to the csv file, as dicts keyed by header.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', encoding='utf-8', newline='') as handle:
        dict_writer = csv.DictWriter(handle, fieldnames=fieldnames)
        dict_writer.writeheader()
        for record in rows:
            dict_writer.writerow(record)

def main():
    """
    Runs the whole pipeline: fetch users, fetch their repositories, write both csv files.

    Users are searched with the module-level CITY and MIN_FOLLOWERS settings. The
    results go to 'users.csv' and 'repositories.csv', and the total run time is
    printed at the end.
    """
    started = time.time()

    # Column order of the two output files.
    user_columns = [
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    ]
    repo_columns = [
        'login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count',
        'language', 'has_projects', 'has_wiki', 'license_name',
    ]

    user_rows, repo_rows = process_user_info(fetch_users(CITY, MIN_FOLLOWERS))

    print('Generating users.csv...')
    write_to_csv('users.csv', user_columns, user_rows)

    print('Generating repositories.csv...')
    write_to_csv('repositories.csv', repo_columns, repo_rows)

    elapsed = time.time() - started
    print(f"Total execution time: {elapsed:.2f} seconds")

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()



**********************************************************************
Fetching User Details in chunks
Fetched 100 users.
Fetched 100 users.
Fetched 100 users.
Fetched 100 users.
Completed fetching User Details in 12.53 seconds
**********************************************************************

Now fetching repository details...
[User 1] Fetching repository metadata for user Premalatha-success
[User 2] Fetching repository metadata for user anitaa1990
[User 3] Fetching repository metadata for user codewithMUHILAN
[User 4] Fetching repository metadata for user sygops
[User 5] Fetching repository metadata for user Spikeysanju
[User 6] Fetching repository metadata for user suriyadeepan
[User 7] Fetching repository metadata for user IAmNithi
[User 8] Fetching repository metadata for user csjaba
[User 9] Fetching repository metadata for user sudip-mondal-2002
[User 10] Fetching repository metadata for user amuthansakthivel
[User 11] Fetching repository metadata for user jaganjavid
[User