In [1]:
import requests
import time
import pandas as pd

In [None]:
GIThub_token ="***************************"

In [None]:
GITHUB_API_URL = "https://api.github.com"
SEARCH_USERS_URL = f"{GITHUB_API_URL}/search/users"
REPOS_URL = f"{GITHUB_API_URL}/users/{{}}/repos"

In [None]:
HEADERS = {"Authorization": f"token {GIThub_token}"}

In [None]:
QUERY = "location:Stockholm followers:>100"

In [None]:
def get_stockholm_users():
    params = {'q': QUERY, 'per_page': 100, 'page': 1}
    users = []

    while True:
        response = requests.get(SEARCH_USERS_URL, headers=HEADERS, params=params)
        response_data = response.json()
        users.extend(response_data['items'])

        # Check if there are more pages to fetch
        if 'next' in response.links:
            params['page'] += 1
        else:
            break

        # Respect GitHub API rate limits
        time.sleep(1)

    return users

def clean_company_name(company):
    if company:
        return company.strip().lstrip('@').upper()
    return ''


def get_user_repos(username):
    repos = []
    params = {'per_page': 100, 'page': 1}

    while True:
        response = requests.get(REPOS_URL.format(username), headers=HEADERS, params=params)
        repos_data = response.json()

        repos.extend(repos_data)

        # Check if there are more pages to fetch
        if 'next' in response.links:
            params['page'] += 1
        else:
            break

        time.sleep(1)

    return repos


def write_users_to_csv(users):
  login=[]
  name=[]
  company=[]
  location=[]
  email=[]
  hireable=[]
  bio=[]
  public_repos=[]
  followers=[]
  following=[]
  created_at=[]
  for user in users:
    user=requests.get(user['url'],headers=HEADERS).json()
    login.append(user.get('login', ''))
    name.append(user.get('name', ''))
    company.append(clean_company_name(user.get('company', '')))
    location.append(user.get('location', ''))
    email.append(user.get('email', ''))
    hireable.append(user.get('hireable', ''))
    bio.append(user.get('bio', ''))
    public_repos.append(user.get('public_repos', 0))
    followers.append(user.get('followers', 0))
    following.append(user.get('following', 0))
    created_at.append(user.get('created_at', ''))
    time.sleep(1)
  df=pd.DataFrame({'login':login,'name':name,'company':company,'location':location,'email':email,'hireable':hireable,'bio':bio,'public_repos':public_repos,'followers':followers,'following':following,'created_at':created_at})
  return df

def get_latest_repos(user_repos_url):
    repos = []
    page = 1
    while True:
        # API call to fetch repositories with sorting by most recently pushed
        response = requests.get(user_repos_url, headers=HEADERS, params={'per_page': 100, 'page': page, 'sort': 'pushed', 'direction': 'desc'})
        user_repos = response.json()

        # Break if no more repositories
        if not user_repos:
            break

        repos.extend(user_repos)

        # If repos reach 500, stop
        if len(repos) >= 500:
            repos = repos[:500]  # Limit to 500 repositories
            break

        page += 1
        time.sleep(1)  # Sleep to respect rate limits

    return repos

def write_repos_to_csv(users):
  login=[]
  full_name=[]
  created_at=[]
  stargazers_count=[]
  watchers_count=[]
  language=[]
  has_projects=[]
  has_wiki=[]
  license_name=[]

  for user in users:
      user_repos_url = user['repos_url']
      user_repos = get_latest_repos(user_repos_url)
      for repo in user_repos:
        login.append(repo['owner']['login'])
        full_name.append(repo['full_name'] if repo['full_name'] else '')
        created_at.append(repo['created_at'] if repo['created_at'] else '')
        stargazers_count.append(repo['stargazers_count'] if repo['stargazers_count'] else '')
        watchers_count.append(repo['watchers_count'] if repo['watchers_count'] else '' )
        language.append(repo['language'] if repo['language'] else '' )
        has_projects.append(repo['has_projects'] if repo['has_projects'] else '')
        has_wiki.append(repo['has_wiki'] if repo['has_wiki'] else '')
        license_name.append(repo['license']['name'] if repo['license'] else '')
  df=pd.DataFrame({'login':login,'full_name':full_name,'created_at':created_at,'stargazers_count':stargazers_count,
                   'watchers_count':watchers_count,'language':language,'has_projects':has_projects,'has_wiki':has_wiki,
                   'license_name':license_name})
  return df


In [None]:
users = get_stockholm_users()


In [None]:
df= write_users_to_csv(users)
df.to_csv('users.csv', index=False)

In [None]:
df1 = write_repos_to_csv(users)
df1.to_csv('repositories.csv', index=False)

In [6]:
users = pd.read_csv("https://raw.githubusercontent.com/gyanesh-iitmiimb/TDSProject1/refs/heads/main/users.csv")
repositories = pd.read_csv("https://raw.githubusercontent.com/gyanesh-iitmiimb/TDSProject1/refs/heads/main/repositories.csv")

In [7]:
users.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,0,0,emmabostian,Emma Bostian,SPOTIFY,"Stockholm, Sweden",,False,Front-end Software Engineer @ Spotify\r\n,61,6473,15,2014-05-22T17:47:40Z
1,1,1,emilk,Emil Ernerfeldt,"RERUN.IO, EGUI","Stockholm, Sweden",emil.ernerfeldt@gmail.com,False,"Rust coder, creator of egui, CTO of rerun.io",71,6267,20,2011-10-24T16:40:17Z
2,2,2,mpj,Mattias Petter Johansson,FUN FUN FUNCTION,"Stockholm, Sweden",,True,,142,5708,23,2008-07-22T10:20:27Z
3,3,3,hrydgard,Henrik Rydgård,,"Stockholm, Sweden",hrydgard@gmail.com,False,,60,5545,25,2009-09-24T18:40:26Z
4,4,4,eriklindernoren,Erik Linder-Norén,,"Stockholm, Sweden",eriklindernoren@gmail.com,False,ML engineer at Apple. Excited about machine le...,24,5345,11,2014-06-24T16:31:53Z


In [8]:
repositories.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,emmabostian,emmabostian/developer-portfolios,2019-09-13T14:18:58Z,7312.0,7312.0,,True,True,
1,emmabostian,emmabostian/emmabostian,2020-07-28T09:33:51Z,5.0,5.0,,True,True,
2,emmabostian,emmabostian/fem-css-foundations,2023-04-15T06:27:55Z,81.0,81.0,HTML,True,,
3,emmabostian,emmabostian/Front-End-FAQ,2018-11-01T07:47:51Z,694.0,694.0,,True,True,Other
4,emmabostian,emmabostian/badass-conference-talks,2019-08-05T16:42:39Z,488.0,488.0,,True,True,


In [51]:
",".join(users.sort_values(by='followers',ascending=False)["login"][:5].tolist())

'emmabostian,emilk,mpj,hrydgard,eriklindernoren'

In [52]:
 users.created_at = pd.to_datetime(users.created_at)
 ",".join(users.sort_values(by='created_at',ascending=True)["login"][:5].tolist())

'Mange,kallepersson,fesplugas,etnt,pirelenito'

In [54]:
",".join(repositories.groupby('license_name').count()['full_name'].reset_index().sort_values(by='full_name',ascending=False)["license_name"][:3].tolist())

'MIT License,Apache License 2.0,Other'

In [14]:
users.groupby('company').count()['login'].reset_index().sort_values(by='login',ascending=False)["company"][:5].tolist()

['SPOTIFY', 'EMBARKSTUDIOS', 'GOOGLE', 'MOJANG', 'KLARNA']

In [15]:
repositories.groupby('language').count()['full_name'].reset_index().sort_values(by='full_name',ascending=False)["language"][:5].tolist()

['JavaScript', 'Python', 'Go', 'Java', 'TypeScript']

In [55]:
users_joined_after2020 = users[users['created_at'] > '2020-01-01']
users_list = users_joined_after2020['login'].tolist()

In [56]:
repositories[repositories['login'].isin(users_list)].groupby('language').count()['full_name'].reset_index().sort_values(by='full_name',ascending=False)["language"][:5].tolist()

['JavaScript', 'TypeScript', 'HTML', 'C#', 'Java']

In [27]:
repositories.groupby('language')['stargazers_count'].mean().reset_index().sort_values(by='stargazers_count',ascending=False)["language"][:5].tolist()

['RAML', 'jq', 'Mathematica', 'FreeBasic', 'MDX']

In [57]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top_5_leaders = users.sort_values('leader_strength', ascending=False).head(5)['login'].tolist()
",".join(top_5_leaders)

'spotify,Mojang,fornwall,joearms,EmbarkStudios'

In [29]:
users[["public_repos","followers"]].corr()

Unnamed: 0,public_repos,followers
public_repos,1.0,0.033217
followers,0.033217,1.0


In [30]:
import statsmodels.formula.api as sm

# Calculate the correlation between public_repos and followers
correlation = users[["public_repos", "followers"]].corr()

# Perform linear regression to estimate the relationship
model = sm.ols("followers ~ public_repos", data=users).fit()

# Print the regression results
print(model.summary())

# Extract the coefficient for public_repos, which represents the estimated increase in followers per additional repository
additional_followers_per_repo = model.params['public_repos']

print(f"\nEstimated additional followers per public repository: {additional_followers_per_repo:.2f}")

                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.4474
Date:                Wed, 30 Oct 2024   Prob (F-statistic):              0.504
Time:                        17:37:41   Log-Likelihood:                -3272.6
No. Observations:                 407   AIC:                             6549.
Df Residuals:                     405   BIC:                             6557.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      372.6574     47.576      7.833   

In [44]:
correl = repositories[["has_projects", "has_wiki"]]
correl['has_wiki'] = correl['has_wiki'].fillna(False).astype(bool)
correl['has_projects'] = correl['has_projects'].fillna(False).astype(bool)
correlation_projects_wiki = correl.corr()
correlation_projects_wiki

  correl['has_wiki'] = correl['has_wiki'].fillna(False).astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correl['has_wiki'] = correl['has_wiki'].fillna(False).astype(bool)
  correl['has_projects'] = correl['has_projects'].fillna(False).astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correl['has_projects'] = correl['has_projects'].fillna(False).astype(bool)


Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.374632
has_wiki,0.374632,1.0


In [45]:
avg_following_hireable = users[users['hireable'] == True]['following'].mean()
avg_following_not_hireable = users[users['hireable'] != True]['following'].mean()
difference = avg_following_hireable - avg_following_not_hireable
print(f"{difference:.3f}")

48.674


In [46]:
users['bio_word_count'] = users['bio'].fillna('').apply(lambda x: len(x.split()) if x else 0)
users_with_bios = users[users['bio_word_count'] > 0]
model = sm.ols("followers ~ bio_word_count", data=users_with_bios).fit()
slope = model.params['bio_word_count']
print(f"{slope:.3f}")

6.553


In [58]:
repositories['created_at'] = pd.to_datetime(repositories['created_at'])
repositories['weekday'] = repositories['created_at'].dt.weekday
repositories['is_weekend'] = repositories['weekday'].apply(lambda x: x in [5, 6])  # 5: Saturday, 6: Sunday
weekend_repo_counts = repositories[repositories['is_weekend']].groupby('login')['full_name'].count().sort_values(ascending=False)
top_5_users = weekend_repo_counts.head(5).index.tolist()
",".join(top_5_users)

'HaraldNordgren,Nyholm,lydell,linhduongtuan,LinusU'

In [49]:
hireable_with_email = users[(users['hireable'] == True) & (users['email'].notna())].shape[0]
hireable_total = users[users['hireable'] == True].shape[0]
fraction_hireable_with_email = hireable_with_email / hireable_total if hireable_total > 0 else 0

not_hireable_with_email = users[(users['hireable'] != True) & (users['email'].notna())].shape[0]
not_hireable_total = users[users['hireable'] != True].shape[0]
fraction_not_hireable_with_email = not_hireable_with_email / not_hireable_total if not_hireable_total > 0 else 0

difference = fraction_hireable_with_email - fraction_not_hireable_with_email
print(f"{difference:.3f}")

0.176


In [50]:
surname_counts = {}
for name in users['name'].dropna():
  name_parts = name.strip().split()
  if name_parts:
    surname = name_parts[-1]
    surname_counts[surname] = surname_counts.get(surname, 0) + 1

max_count = 0
most_common_surnames = []
for surname, count in surname_counts.items():
  if count > max_count:
    max_count = count
    most_common_surnames = [surname]
  elif count == max_count:
    most_common_surnames.append(surname)

most_common_surnames.sort()
print(','.join(most_common_surnames))

Gustafsson,Persson
