scrape_stats.py (forked from olaadapo/medium-stats)
import time
import datetime
import calendar
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# login info
from user import u as usernameStr
from user import p as passwordStr
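# NOTE: the two imports above assume a user.py next to this script that
# defines the credential strings, e.g. (hypothetical values):
#   u = 'you@example.com'  # Google account email
#   p = 'hunter2'          # Google account password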
# set up the Firefox options
options = webdriver.FirefoxOptions()
# disable push/popups
options.set_preference("dom.push.enabled", False)
# initialize Selenium webdriver
browser = webdriver.Firefox(options=options)
# navigate to the page listing the sign-in options for Medium
browser.get('https://medium.com/m/signin?redirect=https%3A%2F%2Fmedium.com%2F&operation=login')
# pause code execution for a few seconds;
# very important - give the page time to load before searching for elements;
# adjust (increase/decrease) depending on your internet speed
time.sleep(5)
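# a more robust alternative to fixed sleeps (a sketch, not part of the
# original script) is Selenium's explicit waits, e.g.:
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(browser, 10).until(
#       EC.presence_of_element_located((By.XPATH, '//button[1]')))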
# sign in with Google - the 1st button on the page
browser.find_element(By.XPATH, '//button[1]').click()
time.sleep(2)
# enter your Google email
browser.find_element(By.ID, 'identifierId').send_keys(usernameStr)
# click the "Next" button
browser.find_element(By.ID, 'identifierNext').click()
time.sleep(2)
# enter your account password
browser.find_element(By.XPATH, "//input[@name='password'][@type='password']").send_keys(passwordStr)
# sign in to your Medium account with Google auth!
browser.find_element(By.ID, 'passwordNext').click()
time.sleep(4)
# go to stats page - you are now logged in to your Medium account
browser.get('https://medium.com/me/stats')
time.sleep(2)
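# NOTE: everything below assumes Medium's stats page markup at the time of
# writing (SVG <rect> bars carrying data-tooltip text, tabs and buttons with
# data-action attributes); if Medium changes its front end, these selectors
# will break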
# number of months to get stats for - adjust as you please
months = 5
# number of views
viewstats = []
# number of reads
readstats = []
# number of fans
fanstats = []
i = 0
while i < months:
    # default data is on views
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    viewstats.append(soup.find_all('rect'))
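    # each <rect> bar exposes its numbers via a data-tooltip attribute,
    # apparently of the form '12 views on\xa0January\xa05'
    # (format inferred from the parsing farther down)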
    # get stats on article reads as well - this will be in reverse order from views
    # line below clicks on the "Reads" tab on the page - graph now shows read history
    browser.find_element(By.XPATH, "//li[@data-action='switch-graph'][@data-action-value='reads']").click()
    time.sleep(2)
    # parse page, and get data
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    readstats.append(soup.find_all('rect'))
    # get stats on article fans as well - this will be in reverse order from views
    # line below clicks on the "Fans" tab on the page - graph now shows fan history
    browser.find_element(By.XPATH, "//li[@data-action='switch-graph'][@data-action-value='fans']").click()
    time.sleep(2)
    # parse page, and get data
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    fanstats.append(soup.find_all('rect'))
    # increase counter
    i += 1
    # go to previous month
    browser.find_element(By.XPATH, "//button[@data-action='show-graph-previous']").click()
    time.sleep(2)
    # switch back to views so the next iteration starts on the views graph
    browser.find_element(By.XPATH, "//li[@data-action='switch-graph'][@data-action-value='views']").click()
    time.sleep(2)
# shut down the webdriver (the Firefox window and the driver process)
browser.quit()
# compile the data
statInfo = []
# for each 30-day period...
for j in range(len(viewstats)):
    # for each day in a selected 30-day period...
    for k in range(len(viewstats[j])):
        # add the year to the view date; this assumes the scrape is run early
        # in 2019, so January dates belong to 2019 and earlier months to 2018
        viewDate = viewstats[j][k]['data-tooltip'].split('on')[-1].strip()
        if viewDate.split('\xa0')[0].strip() == 'January':
            viewDate = viewDate + ' 2019'
        else:
            viewDate = viewDate + ' 2018'
        # extract numbers (stripping thousands separators such as '1,234' first)
        numViews = int(viewstats[j][k]['data-tooltip'].split(' ')[0].replace(',', ''))
        numReads = int(readstats[j][k]['data-tooltip'].split(' ')[0].replace(',', ''))
        numFans = int(fanstats[j][k]['data-tooltip'].split(' ')[0].replace(',', ''))
        # add collected numbers to pool
        statInfo.append([viewDate, numViews, numReads, numFans])
# dataframe storing the number of views, reads, and fans for each day
statData = pd.DataFrame(statInfo, columns=['date', 'numberOfViews', 'numberOfReads', 'numberOfFans'])
# create a string showing the full date and convert it to datetime (Python time format)
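# e.g. 'January\xa05 2019' -> 'January\5\2019' -> datetime.datetime(2019, 1, 5)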
statData['newDate'] = statData['date'].map(
    lambda x: x.split('\xa0')[0].strip() + '\\'
    + x.split('\xa0')[1].split(' ')[0].strip() + '\\'
    + x.split('\xa0')[1].split(' ')[1].strip())
statData['newDate'] = statData['newDate'].map(lambda x: datetime.datetime.strptime(x, '%B\\%d\\%Y'))
statData['Weekday'] = statData.newDate.map(lambda x: calendar.day_name[x.weekday()])
# re-order by the date
statData.sort_values(by='newDate', inplace=True)
# rename columns, drop redundant columns, and re-order columns
statData.columns = ['oldDate', 'Views', 'Reads', 'Fans', 'Date', 'Weekday']
statData.reset_index(inplace=True)
statData.drop(labels=['oldDate', 'index'], axis=1, inplace=True)
statData = statData[['Date', 'Weekday', 'Views', 'Reads', 'Fans']]
# save to CSV if you want
statData.to_csv('medium_stats.csv', index=False)
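# optional sanity check once the script finishes:
#   df = pd.read_csv('medium_stats.csv')
#   print(df.head())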