In [2]:
from dotenv import load_dotenv
load_dotenv()

%load_ext google.cloud.bigquery

In [3]:
%%bigquery top_1000_github_users
SELECT u.login, u.company, u.state, u.city, count(f.user_id) as follower_count
FROM `ghtorrent-bq.ght_2018_04_01.followers` f
JOIN `ghtorrent-bq.ght_2018_04_01.users` u ON f.follower_id = u.id
GROUP BY f.follower_id, u.login, u.company, u.fake, u.state, u.city
order by follower_count desc
LIMIT 1000

Unnamed: 0,follower_id,login,company,fake,state,city,follower_count
0,5203,torvalds,Linux Foundation,False,OR,Portland,80184
1,896,JakeWharton,"Google, Inc.",False,PA,Pittsburgh,48120
2,616741,ruanyf,,False,,Shanghai,39102
3,376498,Tj,Apex,False,BC,Victoria,37402
4,6240,addyosmani,Google,False,CA,Mountain View,32666
5,1779,paulirish,"Google Chrome, ♥z",False,CA,Palo Alto,29690
6,18556,yyx990803,,False,,,29200
7,417948,gaearon,@facebook,False,England,London,27415
8,3871,sindresorhus,@avajs @chalk @yeoman,False,,,25701
9,9236,mojombo,,False,CA,San Francisco,25112


In [79]:
import requests
import json
import os

GITHUB_URL = "https://api.github.com/"

"""
Function to get JSON response from a URL
:params:
    url     string
:return:
            JSON
"""
def __get_json_response(url):
    headers = {'Authorization': 'token ' + os.environ['GITHUB_API_KEY']}
    response = requests.get(url, headers=headers)
    return json.loads(response.text)

"""
Function to add email to a set of emails and set a loop break flag
:params:
    email_set       set         set of all the emails for the user
    email           string      new email to be added
    max_len         integer     maximum number of emails to be extracted
:return:
    email_set       set         set of all the emails for the user
    break_flag      boolean     if max_limit is reached, break_flag is set to True
"""
def __add_email(email_set, email, max_len):
    email_set = email_set | set([email])
    break_flag = (len(email_set) >= max_len)

    return email_set, break_flag

"""
Function to get user emails using GitHub APIs
:params:
    user        string      a valid GitHub username
    max_limit   integer     maximum number of email ID to be fetched
:return:
    user_email  set         a set of all emails extracted
    message     string      if any error occurs, this holds the respective error message
"""
def __get_github_emails(user, max_limit):
    user_email = set([])
    break_flag = False
    try:
        users_profile_url = GITHUB_URL + "users/{0}".format(user)
        response = __get_json_response(users_profile_url)

        # some error encountered
        if 'message' in response:
            if response['message'] == 'Not Found':
                return u'You need to enter a valid GitHub Username'
            else:
                return response['message']
        name = response['name']
        user_name = response['name']
        company = response['company']
        # if user has a public email, add that to the set of emails
        if response['email']:
            user_email, break_flag = __add_email(user_email, response['email'], max_limit)

        if not break_flag:
            users_repository_url = GITHUB_URL + "users/{0}/repos?type=owner&sort=updated".format(user)
            response = __get_json_response(users_repository_url)

            for repo in response:
                if not repo['fork']:
                    users_repository_name = repo['full_name']
                    repos_commit_url = GITHUB_URL + "repos/{0}/commits".format(users_repository_name)
                    commit_reponse = __get_json_response(repos_commit_url)
                    
                    possible_positions = ['committer', 'author']

                    for commit in commit_reponse:
                        for i in possible_positions:
                            if commit['commit'][i]['name'] == user_name:
                                email_string = commit['commit'][i]['email']
                                if "noreply" not in email_string:
                                    user_email, break_flag = __add_email(user_email, email_string, max_limit)

                        if break_flag:
                            break

                if break_flag:
                    break

        if len(user_email) > 0:
            return user_email, name, company
        else:
            return u'No emails found', u'No first name found', u'No company found'

    except requests.exceptions.ConnectionError:
        return u'Proper internet connection not found', u'Proper internet connection not found', u'Proper internet connection not found'

"""
Function to get the emails associated to a username on GitHub
:params:
    username    string      a valid GitHub username
    num         integer     maximum number of email ID to be fetched, default 1
:return:
    response    JSON response
        success     boolean     flag to determine other key in JSON
        email       list        if 'success' is True, list of all the emails fetched
        message     string      if 'success' is False, returns the error message
"""
def get(username, num=1):
    email, name, co = __get_github_emails(username, num)
    if type(email) == set:
        response = {
            'success' : True,
            'data': [email.pop(), name, co]
        }
    else:
        response = {
            'success' : False,
            'message' : [email, name, co]
        }

    return response

In [81]:
import pandas as pd
result = pd.DataFrame()
for index, row in top_1000_github_users.iterrows():
    print(index)
    login            = row['login']
    company          = row['company']
    state            = row['state']
    city             = row['city']
    follower_count   = row['follower_count'] 
    try:
        resp = get(login)
        if 'data' in resp:
            email, name, co  =resp['data']
            row['email'] = email
            row['name'] = name
            row['company'] = co
            result = result.append(row, ignore_index=True)
    except Exception as e:
        print(e)
    


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
too many values to unpack
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
string indices must be integers
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262


In [None]:

result.to_csv('top_1000_github_users.csv', encoding = 'utf-8')

In [42]:
!open top_1000_github_users.csv

In [39]:
result[result.columns]

Unnamed: 0,city,company,email,fake,follower_count,follower_id,login,name,state
0,Portland,Linux Foundation,torvalds@linux-foundation.org,0.0,80184.0,5203.0,torvalds,Linus Torvalds,OR
1,Pittsburgh,"Google, Inc.",jakewharton@gmail.com,0.0,48120.0,896.0,JakeWharton,Jake Wharton,PA
2,Shanghai,,yifeng.ruan@gmail.com,0.0,39102.0,616741.0,ruanyf,Ruan YiFeng,
3,Victoria,Apex,tj@apex.sh,0.0,37402.0,376498.0,Tj,TJ Holowaychuk,BC
4,Mountain View,Google,addyosmani@gmail.com,0.0,32666.0,6240.0,addyosmani,Addy Osmani,CA
5,Palo Alto,"Google Chrome, ♥z",paul.irish@gmail.com,0.0,29690.0,1779.0,paulirish,Paul Irish,CA
6,,,yyx990803@gmail.com,0.0,29200.0,18556.0,yyx990803,Evan You,
7,London,@facebook,dan.abramov@me.com,0.0,27415.0,417948.0,gaearon,Dan Abramov,England
8,,@avajs @xojs @chalk,sindresorhus@gmail.com,0.0,25701.0,3871.0,sindresorhus,Sindre Sorhus,
9,San Francisco,,tom@mojombo.com,0.0,25112.0,9236.0,mojombo,Tom Preston-Werner,CA
