In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

**To scrape the data we will use BeautifulSoup**

In [None]:
# Root of the website; the links found in the pages are appended to it
BASE_URL = 'https://www.androidrank.org'

#Let's start on the main page of the website
URL = 'https://www.androidrank.org/android-most-popular-google-play-apps'

# Columns of the final dataframe (URL is our unique ID)
new_columns = ["URL", 'Rank','Name', 'Nb_Rating', 'Installs', 'Avg_Rating', "Ev_30", "Ev_60", "Price", "Category"]


def fetch_soup(url):
    """Download `url` and return its HTML parsed with BeautifulSoup.

    Raises `requests.HTTPError` when the server answers with an error status,
    so that a refused request is reported as such instead of surfacing later
    as a missing table.
    """
    r = requests.get(url)
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')


def parse_app_row(app, category):
    """Turn one `<tr>` of the ranking table into a row ordered like `new_columns`.

    Ev_30 and Ev_60 correspond to the growth of the number of ratings over
    the last 30 and 60 (presumably days) respectively.
    """
    p = app.find_all("td")
    link = p[1].find('a', href=True)
    # Cell 2 of the row is not used
    return [link["href"], p[0].text, link.text, p[3].text, p[4].text,
            p[5].text, p[6].text, p[7].text, p[8].text, category]


def parse_ranklist(soup, category):
    """Return one row per app listed in the `ranklist` table of `soup`."""
    table = soup.find("table", id="ranklist")
    if table is None:
        raise RuntimeError("No 'ranklist' table found on the page")
    # The first <tr> is skipped (the original code treats it as the header)
    return [parse_app_row(app, category) for app in table.find_all("tr")[1:]]


def find_next_link(soup):
    """Return the "Next >" link of the page, or None when there is none."""
    smalls = soup.find_all('small')
    if len(smalls) < 2:
        return None
    return smalls[1].find("a", text="Next >")


# Rows are collected in a plain list and converted to a dataframe only once
records = []
try:
    soup = fetch_soup(URL)

    # We go through all the categories and take all the given apps and their ranking in that category.
    # Only the categories listed AFTER the starting one are visited.
    nav = soup.find("nav")
    start = nav.find("b", text="Comics") #1
    for b in start.find_next_siblings("b"):
        a = b.find("a")
        category = a.get_text()

        # Now we move on the webpage of the desired category and follow the
        # "Next >" links until there is no next page in that category anymore
        page_url = BASE_URL + '/' + a["href"]
        while page_url:
            soup = fetch_soup(page_url)
            records.extend(parse_ranklist(soup, category))

            # Checked once per page (not once per row), and a missing link
            # simply ends the pagination instead of crashing
            nextpage = find_next_link(soup)
            if nextpage is not None and nextpage.get('href'):
                page_url = BASE_URL + nextpage['href']
            else:
                page_url = None
finally:
    # Built even if the scraping is interrupted, so the rows already
    # collected are not lost
    app_pd = pd.DataFrame(records, columns=new_columns)

**Because of the excessive number of requests, we had to relaunch the previous cell and modify the line marked #1 to restart from the desired category**

In [None]:
# Load the 8 partial exports (data/1.csv ... data/8.csv), using their first column as index
csv_parts = [pd.read_csv('data/' + str(part) + '.csv', index_col=[0]) for part in range(1, 9)]

# Stack them on top of each other, keeping only the columns shared by every file
df_app = pd.concat(csv_parts, ignore_index=True, join='inner')

# Dropping duplicates since the parsing was sometimes stopped during the analysis of one category of apps
df_app = df_app.drop_duplicates()

**At the end we have the following dataset**

In [None]:
# Preview the first ten rows of the merged dataset
df_app.head(n=10)

In [None]:
# Report, column by column, whether at least one value is missing
for column in ["URL", "Rank", "Name", "Nb_Rating", "Installs",
               "Avg_Rating", "Ev_30", "Ev_60", "Price", "Category"]:
    has_null = df_app[column].isnull().values.any()
    print(column + ': ' + str(has_null))

**The only column with null values is Category. These rows correspond to the global ranking of all apps, which was also parsed and for which no category was mentioned. Since these apps already appear in their own category (if they are in the global top 500, they are also in their own category top 500), they are duplicates. So let's drop them.**

In [None]:
# Remove every row holding a missing value, then confirm that none is left
df_app = df_app.dropna()
df_app.isna().values.any()

**Isn't it weird that we don't have 24 500 rows since we have a top 500 in each category and 49 categories?**
**Let's have a closer look**

In [None]:
# The ten least represented categories (value_counts sorts by decreasing count)
df_app["Category"].value_counts().tail(10)

**By looking at the website, we see that this is not a scraping problem: some categories simply have fewer apps in their Top. This is already a good way to highlight the difference in diversity between categories.**

**A little bit of cleaning: let's convert the numbers back from strings to numeric types**

In [None]:
# Nb_Rating: drop the thousands separators, then cast to int
df_app["Nb_Rating"] = df_app["Nb_Rating"].str.replace(',', '').astype(int)

# Installs: values are a number optionally followed by a 'k' or 'M' suffix.
# Split them into the numeric part and the multiplier carried by the suffix.
installs_text = df_app["Installs"].str.replace(' ', '')
installs_number = installs_text.str.replace(r'[kM]+$', '', regex=True).astype(float)
installs_factor = (
    installs_text.str.extract(r'[\d\.]+([kM]+)', expand=False)
    .fillna(1)  # no suffix -> multiply by 1
    .replace(['k','M'], [10**3, 10**6])
    .astype(int)
)
df_app["Installs"] = installs_number * installs_factor

df_app["Avg_Rating"] = df_app["Avg_Rating"].astype(float)

# Ev_30 / Ev_60: strip the percent sign and cast to float
for growth_column in ("Ev_30", "Ev_60"):
    df_app[growth_column] = df_app[growth_column].str.replace('%', '').astype(float)

df_app.head(5)

**We save the cleaned dataset**

In [None]:
# Persist the cleaned dataset; the index is written too (pandas default, made explicit here)
df_app.to_csv("data/final_dataset.csv", index=True)