## This is a Task given by the HR of the SOIL company

### Task: Pick any educational website, scrape its data, and convert the scraped data into CSV and JSON formats

### website URL https://testbook.com/objective-questions/mcq-on-valuation--5eea6a0c39140f30f369e0bc

### Importing the necessary libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def request(url, timeout=10):
    """
    Fetch `url` with an HTTP GET and classify the outcome by status code.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout `requests.get` can block indefinitely.

    Returns
    -------
    requests.Response or str
        The response object when the status code is exactly 200.
        The string "Informational responses " for any 1xx status code.
        A generic error string for every other status code.
        Callers must therefore check the type of the result before using it.

    Raises
    ------
    Exception
        If the request itself fails (connection error, timeout, ...); the
        original error is chained as the cause.

    For more on status codes see
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
    """
    try:
        # Named `response` so the local does not shadow this function.
        response = requests.get(url, timeout=timeout)
        status_code = response.status_code
        # 1xx covers 100-199 inclusive, so the boundary is 200 (not 199).
        if status_code < 200:
            return "Informational responses "
        if status_code == 200:
            return response
        return "Something went wrong while getting the data from the given website"

    except Exception as e:
        raise Exception("(request): Something went wrong  \n" + str(e)) from e

In [3]:
def get_soup(url):
    """
    Build a BeautifulSoup tree from a fetched page.

    `url` must expose the page markup through a `.text` attribute (the
    notebook passes in the value returned by `request`). The markup is parsed
    with the 'html.parser' backend. Any failure is re-raised as a generic
    Exception whose message starts with "(get_soup)".
    """
    try:
        markup = url.text
        return BeautifulSoup(markup, 'html.parser')
    except Exception as error:
        message = f"(get_soup): Something went wrong  \n" + str(error)
        raise Exception(message)

In [4]:
# Page to scrape.
website_url = 'https://testbook.com/objective-questions/mcq-on-valuation--5eea6a0c39140f30f369e0bc'
# NOTE: despite its name, `url` holds whatever request() returned for the page
# (the response object on HTTP 200, otherwise a message string), not a URL.
url = request(website_url)

In [5]:
# Parse the fetched page; get_soup reads the markup from `url.text`.
soup = get_soup(url)

### Parsing the soup and extracting the questions from the page

In [6]:
def questions(soup):
    """
    Extract the body of every question on the page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing `<div class="card question-card">` elements.

    Returns
    -------
    list of str
        One entry per question card, in page order. Each entry is the inner
        HTML of the card's `<h1 class="questionBody">` element, so inline
        markup inside the question is preserved.

    Raises
    ------
    Exception
        If a question card has no question heading or parsing fails; the
        original error is chained as the cause.
    """
    try:
        question_cards = soup.find_all('div', {'class': 'card question-card'})
        question_bodies = []  # named so it does not shadow this function
        for card in question_cards:
            heading = card.find('h1', {'class': 'questionBody'})
            if heading is None:
                raise ValueError('question card has no <h1 class="questionBody"> element')
            # Read the inner HTML directly instead of string-splitting the
            # repr of a result list, which broke whenever the tag carried
            # any extra class or attribute.
            question_bodies.append(heading.decode_contents())
        return question_bodies
    except Exception as e:
        raise Exception("(questions): Something went wrong  \n" + str(e)) from e

In [7]:
# One question body per question card, in page order.
questions_list = questions(soup)

In [8]:
def answers(soup):
    """
    Extract the answer text of every answer card on the page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing `<div class="card answer-card">` elements.

    Returns
    -------
    list of str
        One entry per answer card, in page order. Each entry is the inner
        HTML of the card's first `<div class="">` element, so inline markup
        inside the answer is preserved.

    Raises
    ------
    Exception
        If an answer card has no answer div or parsing fails; the original
        error is chained as the cause.
    """
    try:
        answer_cards = soup.find_all('div', {'class': 'card answer-card'})
        answers_list = []
        # Walk every card found rather than a fixed count of 20, so pages
        # with fewer or more questions are handled too.
        for card in answer_cards:
            answer_div = card.find('div', {'class': ""})
            if answer_div is None:
                raise ValueError('answer card has no <div class=""> element')
            # Inner HTML of the element, read directly instead of
            # string-splitting the repr of a result list.
            answers_list.append(answer_div.decode_contents())
        return answers_list
    except Exception as e:
        raise Exception("(answers): Something went wrong  \n" + str(e)) from e

In [9]:
# Answer text for each answer card, in page order.
answers_list = answers(soup)

In [10]:
def options(soup):
    """
    Collect the text of every answer option on the page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing `<div class="card question-card">` elements,
        each holding an `<ol class="options-list">` of `<li>` options.

    Returns
    -------
    list of str
        A flat list of option texts in page order: all options of the first
        question, then all options of the second, and so on.

    Raises
    ------
    Exception
        If parsing fails; the original error is chained as the cause.
    """
    try:
        options_list = []
        # Walk every card found rather than a fixed count of 20.
        for card in soup.find_all('div', {'class': 'card question-card'}):
            # Search the existing tree directly; converting the matches to a
            # string and re-parsing them (with no parser specified) was
            # redundant and made the result depend on the installed parser.
            for option_block in card.find_all('ol', {'class': 'options-list'}):
                for item in option_block.find_all('li'):
                    options_list.append(item.text)
        return options_list
    except Exception as e:
        raise Exception("(options): Something went wrong  \n" + str(e)) from e

In [11]:
# Flat list of option texts in page order (question after question).
options_list=  options(soup)

### Splitting the Options based on their respective Question Numbers 

In [12]:
# options_list is flat and this split assumes exactly four options per
# question, so the k-th option of every question is every fourth element
# starting at offset k. Slicing adapts to however many questions were scraped
# instead of assuming exactly 80 options.
assert len(options_list) % 4 == 0, "expected exactly four options per question"
first_Option = options_list[0::4]
second_Option = options_list[1::4]
third_Option = options_list[2::4]
fourth_Option = options_list[3::4]

### Creating A DataFrame

In [13]:
# Start the table with a single 'Question' column, one row per question.
df = pd.DataFrame(questions_list,columns =['Question'])

### Adding a 1-based Question_No column and displaying the DataFrame indexed by it

In [14]:
# Number the questions 1..N from the actual row count rather than a fixed 20,
# so the assignment cannot fail on a length mismatch.
df['Question_No'] = list(range(1, len(df) + 1))
# Display only: set_index returns a new frame and the result is not assigned,
# so df itself keeps its default integer index and the Question_No column.
df.set_index('Question_No')

Unnamed: 0_level_0,Question
Question_No,Unnamed: 1_level_1
1,Calculate the years purchase for a property of...
2,The total area of floor in-between walls and c...
3,Determine the present value of a building that...
4,A construction equipment has an initial cost o...
5,The value at the end of the utility period wit...
6,Which among the following methods of calculati...
7,"For Mortgage purposes, the mortgage value of a..."
8,For estimation of painting area of corrugated ...
9,The value of the property shown in the account...
10,"A test facility setup coast Rs. 10,00,000 at t..."


###  Assigning the Options to the Dataframe

In [15]:
# Attach the four option columns and the answers column, in this order.
new_columns = (
    ('first_Option', first_Option),
    ('Second_Option', second_Option),
    ('Third_Option', third_Option),
    ('Fourth_Option', fourth_Option),
    ('Answers_list', answers_list),
)
for column_label, column_values in new_columns:
    df[column_label] = column_values

### Checking the Dataframe

In [16]:
# Show the assembled table (question, number, four options, answer).
df

Unnamed: 0,Question,Question_No,first_Option,Second_Option,Third_Option,Fourth_Option,Answers_list
0,Calculate the years purchase for a property of...,1,12.5,14,17,22,Option 2 : 14
1,The total area of floor in-between walls and c...,2,circulation area,plinth area,floor area,carpet area,Option 3 : floor area
2,Determine the present value of a building that...,3,"Rs. 24,000","Rs. 25,000","Rs. 23,000","Rs. 15,000","Option 3 : Rs. 23,000"
3,A construction equipment has an initial cost o...,4,"0.1 and Rs. 1,50,000","0.2 and Rs. 1,50,000","0.1 and Rs. 1,00,000","0.2 and Rs. 1,00,000","Option 2 : 0.2 and Rs. 1,50,000"
4,The value at the end of the utility period wit...,5,Market value,Salvage value,Scrap value,Book value,Option 2 : Salvage value
5,Which among the following methods of calculati...,6,Straight-line method,Constant percentage method,Sinking fund method,Quantity survey method,Option 4 : Quantity survey method
6,"For Mortgage purposes, the mortgage value of a...",7,\(\frac{1}{2}to\;\frac{2}{3}\;of\;the\;capital...,\(\frac{1}{3}to\;\frac{3}{4}\;of\;the\;capital...,\(\frac{1}{8}to\;\frac{1}{{16}}\;of\;the\;capi...,\(\frac{1}{5}to\;\frac{1}{{10}}\;of\;the\;capi...,"Option 1 : <span class=""math-tex"">\(\frac{1}{2..."
7,For estimation of painting area of corrugated ...,8,10%,14%,20%,25%,Option 2 : 14%
8,The value of the property shown in the account...,9,Scrap value,Salvage value,Book value,Market value,Option 3 : Book value
9,"A test facility setup coast Rs. 10,00,000 at t...",10,"Rs. 5,75,000","Rs. 4,75,000","Rs. 5,00,000","Rs. 5,25,000","Option 4 : Rs. 5,25,000"


### Converting the DataFrame to a comma-separated values (CSV) file using pandas

In [17]:
# The CSV export is currently disabled; uncomment the line below to write the
# table to "final.csv".
# df.to_csv("final.csv")

### Converting the DataFrame to a JSON string

In [18]:
# to_json() is called without a path, so it returns the JSON text as a string;
# nothing is written to disk here.
# NOTE(review): the name `json` would shadow the standard-library `json`
# module if that were ever imported in this notebook.
json = df.to_json()

In [19]:
# Show the JSON string produced above.
json

'{"Question":{"0":"Calculate the years purchase for a property of useful life of 30 years and the rate of interest of 5% per annum. The rate of interest for the sinking fund is 3%.","1":"The total area of floor in-between walls and consists of floor of all rooms, verandahs passages, corridors, stair case, entrance halls, kitchen, stores, bath and latrines is known as ________.","2":"Determine the present value of a building that was constructed 30 years ago at Rs. 50,000. The estimated life of the building is 50 years, at the end of which it will have 10% scrap value of its cost of construction. Depreciation is to be calculated by straight line method.","3":"A construction equipment has an initial cost of Rs. 2,00,000 and salvage value of Rs. 50,000 at the end of an economic life of 5 years. The rate of straight-line depreciation and total depreciation will be","4":"The value at the end of the utility period without being dismantled is termed as:","5":"Which among the following methods

['12.5', 'circulation area', 'Rs. 24,000', '0.1 and Rs. 1,50,000', 'Market value', 'Straight-line method', '\\(\\frac{1}{2}to\\;\\frac{2}{3}\\;of\\;the\\;capitalized\\;value\\)', '10%', 'Scrap value', 'Rs. 5,75,000', 'Tender', 'Rs. 400.20', 'V2 only', 'Rs. 1,22,500', '10%', '2.5 square meter', 'value at the end of utility period', '30,00,000', 'Capital value method', '2%']
['14', 'plinth area', 'Rs. 25,000', '0.2 and Rs. 1,50,000', 'Salvage value', 'Constant percentage method', '\\(\\frac{1}{3}to\\;\\frac{3}{4}\\;of\\;the\\;capitalized\\;value\\)', '14%', 'Salvage value', 'Rs. 4,75,000', 'Schedule of rates', 'Rs. 200.70', 'V1 only', 'Rs. 1,32,500', '15%', '5.0 square meter', 'original cost minus the amount of depreciation till date', '31,50,000', 'Profit-based method', '5%']
