# Building our GitHub web scraper!

In [1]:
#Import modules and libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create a variable containing the URL to be pulled.

url = "https://github.com/search?q=python"

## We use the requests module to send the HTTP request and store the
# response in the variable r. The timeout (in seconds) stops the cell from
# hanging forever if the server never answers.

r = requests.get(url, timeout=30)

## Stop here with an HTTPError if the server answered with an error status
# (for example when it rate-limits us); otherwise we would silently parse an
# error page and end up with an empty result set further down.

r.raise_for_status()

## Next we take the HTML that we pulled, use BeautifulSoup to parse it
# for us, and store the parsed document in the variable soup.

soup = BeautifulSoup(r.content, "html.parser")

## We can print our results.

## I won't print since there is a lot of data, but feel free to uncomment
# the line below to see for yourself. Note that prettify is a method, so it
# needs the parentheses.
#print(soup.prettify())

In [30]:
## The parsed page contains the repositories we are after, but they are
# buried in a lot of extra markup and metadata.

## find_all collects every <div> whose class attribute is exactly
# "f4 text-normal"; the matches are kept in the variable "results".

results = soup.find_all("div", attrs={"class": "f4 text-normal"})
print(results)

[<div class="f4 text-normal">
<a class="v-align-middle" data-hydro-click='{"event_type":"search_result.click","payload":{"page_number":1,"per_page":10,"query":"python","result_position":1,"click_id":63476337,"result":{"id":63476337,"global_relay_id":"MDEwOlJlcG9zaXRvcnk2MzQ3NjMzNw==","model_name":"Repository","url":"https://github.com/TheAlgorithms/Python"},"originating_url":"https://github.com/search?q=python","user_id":null}}' data-hydro-click-hmac="8329a78703751e0c925c973cbb96a84b9f4d12177c2ea545c32496f47ab1e512" href="/TheAlgorithms/Python">TheAlgorithms/<em>Python</em></a>
</div>, <div class="f4 text-normal">
<a class="v-align-middle" data-hydro-click='{"event_type":"search_result.click","payload":{"page_number":1,"per_page":10,"query":"python","result_position":2,"click_id":2881789,"result":{"id":2881789,"global_relay_id":"MDEwOlJlcG9zaXRvcnkyODgxNzg5","model_name":"Repository","url":"https://github.com/geekcomputers/Python"},"originating_url":"https://github.com/search?q=python"

In [31]:
## It's looking better, but there is still a lot going on.

# base_url holds the GitHub site root. The href values on the search page
# are site-relative (e.g. "/TheAlgorithms/Python"), so we prepend it to get
# a full link.

base_url = "https://github.com"

### Two empty lists collect the results that will later feed our DataFrame:
## "Repo" for the repository names and "Url" for the full links.

Repo = []
Url = []

## Walk over every result <div> and every "a" tag inside it that actually
# carries an "href" attribute (href=True skips anchors without one, which
# would otherwise raise a KeyError on link["href"]).
# The appends and the print live INSIDE the inner loop so that each link is
# recorded exactly once. If they sat after the inner loop, only the last link
# of a div would be kept, and a div without links would re-use the previous
# div's link (or raise a NameError on the very first div).

for i in results:
    links = i.find_all("a", href=True)
    for link in links:
        repo_name = link.text.strip()
        link_url = base_url + link["href"]
        Repo.append(repo_name)
        Url.append(link_url)

        ## Print each repository name (whitespace stripped) next to its URL.
        print(repo_name, link_url)

TheAlgorithms/Python https://github.com/TheAlgorithms/Python
geekcomputers/Python https://github.com/geekcomputers/Python
walter201230/Python https://github.com/walter201230/Python
injetlee/Python https://github.com/injetlee/Python
kubernetes-client/python https://github.com/kubernetes-client/python
Show-Me-the-Code/python https://github.com/Show-Me-the-Code/python
xxg1413/python https://github.com/xxg1413/python
jakevdp/PythonDataScienceHandbook https://github.com/jakevdp/PythonDataScienceHandbook
joeyajames/Python https://github.com/joeyajames/Python
docker-library/python https://github.com/docker-library/python


# Building our DataFrame!

In [32]:
### With the repo names and their URLs collected, the next step is to put
## them into a DataFrame that we can analyse.

## Column labels for the DataFrame, kept in the variable "col".

col = ["Repo", "Url"]

## Pair every repo name with its URL: zip yields one (name, url) tuple per
# repository and the comprehension gathers them into the list "data".

data = [pair for pair in zip(Repo, Url)]

## Hand the rows and the column labels to pandas to build the DataFrame.

df = pd.DataFrame(data, columns=col)

In [33]:
## Peek at the dataset: head(n=5) returns the first 5 rows
# (5 is also the default when n is omitted).

df.head(n=5)

Unnamed: 0,Repo,Url
0,TheAlgorithms/Python,https://github.com/TheAlgorithms/Python
1,geekcomputers/Python,https://github.com/geekcomputers/Python
2,walter201230/Python,https://github.com/walter201230/Python
3,injetlee/Python,https://github.com/injetlee/Python
4,kubernetes-client/python,https://github.com/kubernetes-client/python


In [34]:
## tail(n=5) is the counterpart of head: it returns the last 5 rows
# (again, 5 is the default when n is omitted).

df.tail(n=5)

Unnamed: 0,Repo,Url
5,Show-Me-the-Code/python,https://github.com/Show-Me-the-Code/python
6,xxg1413/python,https://github.com/xxg1413/python
7,jakevdp/PythonDataScienceHandbook,https://github.com/jakevdp/PythonDataScienceHa...
8,joeyajames/Python,https://github.com/joeyajames/Python
9,docker-library/python,https://github.com/docker-library/python


In [35]:
## df.shape gives the size of our dataset as a (rows, columns) tuple.

df.shape

(10, 2)

In [26]:
### We can see that our dataset only contains 10 rows and 2 columns, which is
## very small, but you can see how this would be useful on a larger dataset.
# We will display all the data since we know that it is a small one.

In [36]:
# We can simply type our DataFrame name: as the last expression of the cell
# it is displayed in full.

df

Unnamed: 0,Repo,Url
0,TheAlgorithms/Python,https://github.com/TheAlgorithms/Python
1,geekcomputers/Python,https://github.com/geekcomputers/Python
2,walter201230/Python,https://github.com/walter201230/Python
3,injetlee/Python,https://github.com/injetlee/Python
4,kubernetes-client/python,https://github.com/kubernetes-client/python
5,Show-Me-the-Code/python,https://github.com/Show-Me-the-Code/python
6,xxg1413/python,https://github.com/xxg1413/python
7,jakevdp/PythonDataScienceHandbook,https://github.com/jakevdp/PythonDataScienceHa...
8,joeyajames/Python,https://github.com/joeyajames/Python
9,docker-library/python,https://github.com/docker-library/python


In [43]:
## Let's get some more info about our dataset with df.info(), which prints
# the index range, each column's non-null count and dtype, and the memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Repo    10 non-null     object
 1   Url     10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [118]:
## We can see it displays a lot of useful information about our dataset.
# It displays the total entries, dtypes, total columns, etc.

In [122]:
## Let's view just one column and its data. Selecting with a list of
# labels (here a single one) keeps the result a DataFrame.

df.loc[:, ["Repo"]]

Unnamed: 0,Repo
0,TheAlgorithms/Python
1,geekcomputers/Python
2,walter201230/Python
3,injetlee/Python
4,kubernetes-client/python
5,Show-Me-the-Code/python
6,xxg1413/python
7,jakevdp/PythonDataScienceHandbook
8,joeyajames/Python
9,docker-library/python


In [123]:
## And now the other column, selected the same way.

df.loc[:, ["Url"]]

Unnamed: 0,Url
0,https://github.com/TheAlgorithms/Python
1,https://github.com/geekcomputers/Python
2,https://github.com/walter201230/Python
3,https://github.com/injetlee/Python
4,https://github.com/kubernetes-client/python
5,https://github.com/Show-Me-the-Code/python
6,https://github.com/xxg1413/python
7,https://github.com/jakevdp/PythonDataScienceHa...
8,https://github.com/joeyajames/Python
9,https://github.com/docker-library/python


In [62]:
## There isn't much going on in this dataset, and it appears we only have
# categorical features to work with.

In [66]:
## If we had some numerical data we could use the describe() function
# to view the statistics and do some more analysis. For now we will keep moving along.

In [135]:
## Build a new DataFrame that holds only the "Url" column, call it df2,
# and display it.

df2 = df.loc[:, ["Url"]]
df2

Unnamed: 0,Url
0,https://github.com/TheAlgorithms/Python
1,https://github.com/geekcomputers/Python
2,https://github.com/walter201230/Python
3,https://github.com/injetlee/Python
4,https://github.com/kubernetes-client/python
5,https://github.com/Show-Me-the-Code/python
6,https://github.com/xxg1413/python
7,https://github.com/jakevdp/PythonDataScienceHa...
8,https://github.com/joeyajames/Python
9,https://github.com/docker-library/python


In [136]:
## Do the same with the "Repo" column: the single-column DataFrame is
# called df3 and displayed.

df3 = df.loc[:, ["Repo"]]
df3

Unnamed: 0,Repo
0,TheAlgorithms/Python
1,geekcomputers/Python
2,walter201230/Python
3,injetlee/Python
4,kubernetes-client/python
5,Show-Me-the-Code/python
6,xxg1413/python
7,jakevdp/PythonDataScienceHandbook
8,joeyajames/Python
9,docker-library/python


In [137]:
## Now let's select rows with the .loc indexer and a slice.
# .loc slices by label and includes the end label, so the rows labelled
# 0 through 3 (four rows) are kept. This cell builds df4; df5 follows below.

df4 = df3.loc[:3, :]

In [138]:
## Same label-based slice on df2: keep the rows labelled 0 through 3 as df5.

df5 = df2.loc[:3, :]

In [139]:
## View the new DataFrames, starting with df4.

df4

Unnamed: 0,Repo
0,TheAlgorithms/Python
1,geekcomputers/Python
2,walter201230/Python
3,injetlee/Python


In [140]:
## ...and now df5.
df5

Unnamed: 0,Url
0,https://github.com/TheAlgorithms/Python
1,https://github.com/geekcomputers/Python
2,https://github.com/walter201230/Python
3,https://github.com/injetlee/Python
