In [None]:
pip install requests pandas python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
import requests
import pandas as pd

# Replace with your GitHub token
TOKEN = 'token'
headers = {'Authorization': f'token {TOKEN}'}

# Function to fetch users from London with more than 500 followers
def fetch_users(location='Barcelona', followers=100, per_page=100, page=1):
    url = f"https://api.github.com/search/users?q=location:{location}+followers:>{followers}&per_page={per_page}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch users: {response.status_code}")
        return {}

# Function to fetch repositories for a given user
def fetch_repositories(user_login, per_page=100, page=1):
    url = f"https://api.github.com/users/{user_login}/repos?per_page={per_page}&page={page}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch repos for {user_login}: {response.status_code}")
        return []

# Clean company name by removing leading '@' and converting to uppercase
def clean_company_name(company):
    if company:
        company = company.strip()
        if company.startswith('@'):
            company = company[1:]
        return company.upper()
    return ''

# Fetch and save users to CSV
def get_users_data():
    all_users = []
    page = 1

    while True:
        users_data = fetch_users(page=page)
        if not users_data.get('items'):
            break
        for user in users_data['items']:
            # Fetch detailed user data (optional, if needed)
            user_detail = requests.get(user['url'], headers=headers).json()

            all_users.append({
                'login': user['login'],
                'name': user_detail.get('name', ''),
                'company': clean_company_name(user_detail.get('company', '')),
                'location': user_detail.get('location', ''),
                'email': user_detail.get('email', ''),
                'hireable': user_detail.get('hireable', ''),
                'bio': user_detail.get('bio', ''),
                'public_repos': user_detail.get('public_repos', 0),
                'followers': user_detail.get('followers', 0),
                'following': user_detail.get('following', 0),
                'created_at': user_detail.get('created_at', '')
            })

        page += 1

    # Save users data to CSV
    users_df = pd.DataFrame(all_users)
    users_df.to_csv('users.csv', index=False)
    print("Users data saved to users.csv")


# Fetch and save repositories to CSV
def get_repositories_data():
    users_df = pd.read_csv('users.csv')
    all_repos = []

    for login in users_df['login']:
        page = 1
        while True:
            repos_data = fetch_repositories(login, page=page)
            if not repos_data:
                break
            for repo in repos_data:
                all_repos.append({
                    'login': login,
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo.get('language', ''),
                    'has_projects': repo.get('has_projects', False),
                    'has_wiki': repo.get('has_wiki', False),
                    'license_name': repo['license']['name'] if repo['license'] else ''
                })
            page += 1

    # Save repositories data to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)
    print("Repositories data saved to repositories.csv")


# Run the data collection process
get_users_data()
get_repositories_data()

Users data saved to users.csv
Repositories data saved to repositories.csv


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
#users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split(" ").str.len()

# The error was here: users_with_bio was used instead of users_with_bios
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


# Prepare the data for regression
X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']        # Dependent variable

# Add a constant to the independent variable for the regression
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.3f}')

Regression slope of followers on bio word count: 13.733


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


In [None]:
# After collecting user data, add this function to get top 5 users
def get_top_5_users():
    # Load the users dataset
    users_df = pd.read_csv('users.csv')

    # Filter users in Barcelona and sort by followers
    top_users = users_df[users_df['location'].str.contains('Barcelona', na=False)] \
        .sort_values(by='followers', ascending=False) \
        .head(5)

    # Get the logins of the top 5 users
    top_5_logins = top_users['login'].tolist()

    # Print the top 5 logins in a comma-separated format
    print("Top 5 users in Barcelona with the highest number of followers:")
    print(", ".join(top_5_logins))


# Run the data collection process
get_users_data()
get_repositories_data()

# Call the function to get top 5 users
get_top_5_users()


Users data saved to users.csv
Repositories data saved to repositories.csv
Top 5 users in Barcelona with the highest number of followers:
midudev, ai, raysan5, vfarcic, spite


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the cleaned datasets
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# 1. Earliest Registered Users in Barcelona
earliest_users = users_df[users_df['location'].str.contains('Barcelona', na=False)] \
    .sort_values(by='created_at') \
    .head(5)
earliest_logins = earliest_users['login'].tolist()
print("Earliest registered users in Barcelona:")
print(", ".join(earliest_logins))

# 2. Most Popular Licenses Among These Users
popular_licenses = repos_df[repos_df['login'].isin(earliest_users['login'])]['license_name'].dropna().value_counts().head(3)
popular_license_names = popular_licenses.index.tolist()
print("Most popular licenses:")
print(", ".join(popular_license_names))

# 3. Majority Company Among These Developers
majority_company = earliest_users['company'].mode().iloc[0]
print("Majority company:")
print(majority_company)

# 4. Most Popular Programming Language
most_popular_language = repos_df[repos_df['login'].isin(earliest_users['login'])]['language'].mode().iloc[0]
print("Most popular programming language:")
print(most_popular_language)

# 5. Second Most Popular Language for Users Who Joined After 2020
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
after_2020_users = users_df[users_df['created_at'].dt.year > 2020]
second_most_popular_language = after_2020_users['language'].mode().iloc[1]
print("Second most popular language for users who joined after 2020:")
print(second_most_popular_language)

# 6. Language with Highest Average Stars per Repository
avg_stars = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Language with highest average number of stars per repository:")
print(avg_stars)

# 7. Top 5 Users by Leader Strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leaders = users_df.nlargest(5, 'leader_strength')['login'].tolist()
print("Top 5 users in terms of leader strength:")
print(", ".join(top_leaders))

# 8. Correlation Between Followers and Public Repositories
correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])
print("Correlation between followers and public repositories:")
print(f"{correlation_followers_repos:.3f}")

# 9. Regression Analysis of Followers on Repos
X = users_df['public_repos']
y = users_df['followers']
X = sm.add_constant(X)
model = sm.OLS(y, X, missing='drop').fit()
regression_slope = model.params['public_repos']
print("Regression slope of followers on repositories:")
print(f"{regression_slope:.3f}")

# 10. Correlation Between Projects and Wiki Enabled
correlation_projects_wiki = repos_df['has_projects'].corr(repos_df['has_wiki'])
print("Correlation between projects and wiki enabled:")
print(f"{correlation_projects_wiki:.3f}")

# 11. Average Following for Hireable vs. Non-Hireable Users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
average_difference = avg_following_hireable - avg_following_non_hireable
print("Average following difference for hireable users:")
print(f"{average_difference:.3f}")

# 12. Correlation of Bio Length with Followers
users_df['bio_length'] = users_df['bio'].str.split().str.len()
correlation_bio_followers = users_df['bio_length'].corr(users_df['followers'])
print("Correlation of bio length with followers:")
print(f"{correlation_bio_followers:.3f}")

# 13. Users Who Created Most Repositories on Weekends
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]
top_weekend_users = weekend_repos['login'].value_counts().head(5).index.tolist()
print("Top 5 users who created the most repositories on weekends:")
print(", ".join(top_weekend_users))

# 14. Hireable Users Sharing Email Addresses
fraction_hireable = users_df[users_df['hireable'] == True]['email'].notna().mean()
fraction_non_hireable = users_df[users_df['hireable'] == False]['email'].notna().mean()
email_difference = fraction_hireable - fraction_non_hireable
print("Difference in email sharing between hireable and non-hireable users:")
print(f"{email_difference:.3f}")

# 15. Most Common Surname
users_df['surname'] = users_df['name'].str.split().str[-1]
most_common_surnames = users_df['surname'].value_counts().head(1).index.tolist()
print("Most common surname among users:")
print(", ".join(most_common_surnames))


Earliest registered users in Barcelona:
oleganza, gravityblast, fesplugas, fxn, pauek
Most popular licenses:
MIT License, Other, Apache License 2.0
Majority company:
FREELANCE
Most popular programming language:
Ruby


KeyError: 'language'

In [None]:
# Load your CSV files
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# Ensure 'created_at' is a datetime object in users_df
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Step 1: Filter users who joined after 2020
after_2020_users = users_df[users_df['created_at'].dt.year >= 2020]

# Step 2: Get the logins of these users
after_2020_logins = after_2020_users['login'].unique()

# Step 3: Filter repositories for these users
repos_after_2020 = repos_df[repos_df['login'].isin(after_2020_logins)]

# Step 4: Get the language counts from these repositories
language_counts = repos_after_2020['language'].value_counts()

# Step 5: Find the second most popular language
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

print("Second most popular language for users who joined after 2020:")
print(second_most_popular_language)

Second most popular language for users who joined after 2020:
Python


In [None]:
import pandas as pd

# Load your CSV files
repos_df = pd.read_csv('repositories.csv')

# Step 1: Group by language and calculate the average stars per repository
average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Step 2: Identify the language with the highest average stars
highest_average_language = average_stars_per_language.idxmax()
highest_average_value = average_stars_per_language.max()

print("Language with the highest average number of stars per repository:")
print(f"{highest_average_language} (Average Stars: {highest_average_value:.2f})")


Language with the highest average number of stars per repository:
Vim Script (Average Stars: 3838.75)


In [None]:
import pandas as pd

# Load your CSV file
users_df = pd.read_csv('users.csv')

# Step 1: Calculate leader strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Step 2: Sort by leader strength in descending order and get the top 5
top_leader_strength_users = users_df.nlargest(5, 'leader_strength')

# Step 3: Extract the logins of these users
top_logins = top_leader_strength_users['login'].tolist()

# Print the result
print("Top 5 users in terms of leader strength:")
print(", ".join(top_logins))


Top 5 users in terms of leader strength:
midudev, vfarcic, spite, amix, cfenollosa


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your users CSV file
users_df = pd.read_csv('users.csv')

# Step 1: Prepare the data
# Define the independent variable (public_repos) and dependent variable (followers)
X = users_df['public_repos']
y = users_df['followers']

# Step 2: Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Step 3: Fit the regression model
model = sm.OLS(y, X).fit()

# Step 4: Get the regression results
slope = model.params['public_repos']

# Step 5: Print the estimated additional followers per additional public repository
print(f"Estimated additional followers per additional public repository: {slope:.3f}")


Estimated additional followers per additional public repository: 1.031


In [None]:
import pandas as pd

# Load your repositories CSV file
repos_df = pd.read_csv('repositories.csv')

# Step 1: Convert has_projects and has_wiki to numeric (1 for True, 0 for False)
repos_df['has_projects_numeric'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki_numeric'] = repos_df['has_wiki'].astype(int)

# Step 2: Calculate the correlation
correlation = repos_df['has_projects_numeric'].corr(repos_df['has_wiki_numeric'])

# Step 3: Print the result rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: 0.317


In [None]:
(async function() {
    const form = document.getElementById('quiz-form');
    const checkButton = form.querySelector('button[name="check-answer"]');
    const resultDiv = document.getElementById('result');

    function delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    // Iterate over each question
    for (let questionId = 13; questionId < 14; questionId++) {
        const inputField = form.querySelector(`#q${questionId}`);
        console.log(`Checking question ${questionId}...`);

        for (let i = -10000; i <= 15000; i++) {
            const num = (i / 1000).toFixed(3);

            inputField.value = num;
            checkButton.click();

            await delay(100); // Added delay to allow for UI processing

            // Check if the resultDiv contains success feedback
            if (resultDiv.textContent.includes("17")) { // Change this to your correct answer check
                console.log(`Question ${questionId} correct answer: ${num}`);
                break; // Exit the loop when the correct answer is found
            }
        }
    }
})();


SyntaxError: closing parenthesis '}' does not match opening parenthesis '(' on line 12 (<ipython-input-16-d9ec0f4ae921>, line 29)

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users CSV file
users_df = pd.read_csv('users.csv')

# Step 1: Filter out users without a bio
users_with_bios = users_df[users_df['bio'].notna()]

# Step 2: Calculate bio word count by splitting on whitespace
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))

# Step 3: Define the independent (bio word count) and dependent variable (followers)
X = users_with_bios['bio_word_count']
y = users_with_bios['followers']

# Step 4: Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Step 5: Fit the regression model
model = sm.OLS(y, X).fit()

# Step 6: Get the regression slope (coefficient of bio_word_count)
slope = model.params['bio_word_count']

# Step 7: Print the regression slope, rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 13.718


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


In [None]:
import pandas as pd

# Load repositories data
repos_df = pd.read_csv('repositories.csv')

# Convert 'created_at' to datetime format
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter repositories created on weekends (Saturday and Sunday)
repos_df['weekday'] = repos_df['created_at'].dt.weekday
weekend_repos = repos_df[repos_df['weekday'] >= 5]  # 5 = Saturday, 6 = Sunday

# Count the number of weekend repositories per user
weekend_repo_counts = weekend_repos['login'].value_counts()

# Get the top 5 users with the most weekend repositories
top_5_weekend_users = weekend_repo_counts.head(5).index.tolist()

# Convert list to a comma-separated string
top_5_weekend_users_str = ','.join(top_5_weekend_users)
print("Top 5 users by weekend repository creation:", top_5_weekend_users_str)


Top 5 users by weekend repository creation: nilportugues,kinow,ajsb85,vfarcic,wlsf82


In [None]:
import pandas as pd

# Load users data
users_df = pd.read_csv('users.csv')

# Drop rows with missing names
names_with_surnames = users_df['name'].dropna()

# Extract the last word (surname) from each name
surnames = names_with_surnames.apply(lambda name: name.strip().split()[-1])

# Count the frequency of each surname
surname_counts = surnames.value_counts()

# Find the most common surname(s)
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort surnames alphabetically if there's a tie
most_common_surnames_sorted = sorted(most_common_surnames)
most_common_surnames_str = ','.join(most_common_surnames_sorted)

# Output the results
print("Most common surname(s):", most_common_surnames_str)
print("Number of users with the most common surname:", max_count)


Most common surname(s): Martínez,Ortiz
Number of users with the most common surname: 3


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Load users data
users_df = pd.read_csv('users.csv')

# Filter users with a non-empty bio
users_with_bio = users_df[users_df['bio'].notna() & (users_df['bio'] != '')]

# Calculate bio word count
users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))

# Prepare data for regression
X = users_with_bio[['bio_word_count']]
y = users_with_bio['followers']

# Perform linear regression
regression_model = LinearRegression()
regression_model.fit(X, y)

# Extract the slope of followers on bio word count
slope = regression_model.coef_[0]

# Output the result rounded to 3 decimal places
print("Regression slope of followers on bio word count:", round(slope, 3))


Regression slope of followers on bio word count: 13.718


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


In [None]:
pip install selenium


Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Set up WebDriver (ChromeDriver path or use another driver)
driver = webdriver.Chrome(executable_path='path/to/chromedriver')
driver.get('URL_OF_YOUR_FORM_PAGE')  # Replace with your form URL

def delay(seconds):
    time.sleep(seconds)

try:
    # Access the form and button
    form = driver.find_element(By.ID, 'quiz-form')
    check_button = form.find_element(By.NAME, 'check-answer')
    result_div = driver.find_element(By.ID, 'result')

    # Loop over each question
    for question_id in range(13, 14):  # Adjust range as needed
        input_field = form.find_element(By.ID, f'q{question_id}')
        print(f"Checking question {question_id}...")

        # Range of values to try (from -10000 to 15000 by 0.001)
        for i in range(-10000, 15001):
            num = f"{i / 1000:.3f}"  # Format number to 3 decimal places

            # Set input value and click check button
            input_field.clear()
            input_field.send_keys(num)
            check_button.click()
            delay(0.1)  # Short delay to let result update

            # Check for the success message in result_div
            if "17" in result_div.text:
                print(f"Question {question_id} correct answer: {num}")
                break

finally:
    driver.quit()  # Close the browser


TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

In [None]:
(async function() {
    const form = document.getElementById('quiz-form');
    const checkButton = form.querySelector('button[name="check-answer"]');
    const resultDiv = document.getElementById('result');

    function delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    for (let questionId = 13; questionId < 14; questionId++) {
        const inputField = form.querySelector(`#q${questionId}`);
        console.log(`Checking question ${questionId}...`);

        for (let i = -10000; i <= 15000; i++) {
            const num = (i / 1000).toFixed(3);

            inputField.value = num;
            checkButton.click();
            await delay(0);

            if (resultDiv.textContent.includes("17")) {
                console.log(`Question ${questionId} correct answer: ${num}`);
                break;
            }
        }
    }
})();


SyntaxError: closing parenthesis '}' does not match opening parenthesis '(' on line 11 (<ipython-input-31-9d6fc534087f>, line 26)