In [1]:
# Import required modules
import ast
import re
import time
from concurrent.futures import ProcessPoolExecutor
from itertools import chain

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

In [2]:
# Suppress urllib3 warnings (every request in this notebook is sent with verify=False)
urllib3.disable_warnings()

# Define request headers and the run date
# HEADERS is passed to every requests.get call below as a browser-like User-Agent.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
# Run date formatted as day_monthAbbrev_twoDigitYear (e.g. 16_Apr_21); used for the "date" column and the CSV file name.
today = pd.to_datetime("today").strftime("%d_%b_%y")


# Generate cover pages links
def generateCoverPageLink(url, pageSize=100, timeout=30):
    """Build the paginated listing ("cover page") links for one provider.

    url = provider page on which the total course number is shown,
    pageSize = number of courses requested per cover page (default 100),
    timeout = seconds to wait for the server before giving up,
    return = list of cover page links (empty when the provider lists 0 courses)

    Raises requests.HTTPError on a non-2xx response and ValueError when
    pageSize is not positive or the course counter is missing / not a number.
    """
    if pageSize < 1:
        raise ValueError(f"pageSize must be a positive integer, got {pageSize!r}")

    # Making request; fail fast on an HTTP error instead of parsing an error page
    r = requests.get(url, headers=HEADERS, verify=False, timeout=timeout)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # Scrape total course number (shown in <span class="h1">, may contain thousands separators)
    counter = s.find("span", class_="h1")
    if counter is None:
        raise ValueError(f"Total course count not found on {url}")
    counterText = counter.text.strip().replace(",", "")
    try:
        totalCourse = int(counterText)
    except ValueError as exc:
        raise ValueError(f"Unexpected course count {counterText!r} on {url}") from exc

    # Number of cover pages = ceil(totalCourse / pageSize), using integer arithmetic only
    stopPage = -(-totalCourse // pageSize)

    # One link per page, numbered from 1
    return [
        url + f"?pageno={page}&sortby=MostPopular&pagesize={pageSize}"
        for page in range(1, stopPage + 1)
    ]



# This function scrapes individual course links from every cover page links
def scrapeIndividualCourseLink(url, timeout=30):
    """Scrapes individual course links from one cover page.

    url = cover page link,
    timeout = seconds to wait for the server before giving up,
    return = list of absolute individual course links

    Listing cards without an <h2><a href=...> link are skipped, so one
    malformed card does not discard the rest of the page.
    """
    # Making request (no status check on purpose: a page that yields no cards
    # simply contributes no links)
    r = requests.get(url, headers=HEADERS, verify=False, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Scrape course links and store
    courseLink = []
    for card in s.find_all("div", class_="course-overview"):
        heading = card.find("h2")
        anchor = heading.find("a") if heading is not None else None
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Card without a usable link; nothing to record for it
            continue
        # hrefs are site-relative, so prefix the host
        courseLink.append("https://www.reed.co.uk" + href)
    return courseLink




# Pattern of the CPD points text on a course page (raw string: \d and \s are regex escapes)
_CPD_POINTS_PATTERN = re.compile(r"\d{1,3}\sCPD hours / points")


def _firstText(default, *finders):
    """Return the stripped text of the first finder() that yields a node, else default.

    Each finder is a zero-argument callable returning a BeautifulSoup node.
    A finder that raises (typically AttributeError because an intermediate
    find() returned None) is skipped and the next one is tried.
    """
    for finder in finders:
        try:
            return finder().text.strip()
        except Exception:
            continue
    return default


def _badgeFlag(soup, cssClass):
    """Return 1 when a <div> with exactly this class string is found, else 0."""
    try:
        return 1 if soup.find("div", class_=cssClass) else 0
    except Exception:
        return 0


# This function scrapes individual course info from individual course link
def scrapeCourseInfo(url, timeout=30):
    """url = individual course link,
    timeout = seconds to wait for the server before giving up,
    return = scraped course info as a one-row dataframe (every column cast to str)

    Missing text fields are recorded as "na"; unitSold and cpdPoint fall back
    to 0. Request errors (including timeouts) are not caught here.
    """
    # Making request
    r = requests.get(url, headers=HEADERS, verify=False, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Scrape course name and subtitle
    courseTitle = _firstText("na", lambda: s.find("div", class_="course-title").find("h1"))
    subtitle = _firstText("na", lambda: s.find("div", class_="course-title").find("h2"))

    # Scrape offer price and original price (raw text; cleaned later)
    offerPrice = _firstText("na", lambda: s.find("span", class_="current-price"))
    originalPrice = _firstText("na", lambda: s.find("small", class_="vat-status"))

    # Scrape course provider: linked provider name first, thumbnail text as fallback
    courseProvider = _firstText(
        "na",
        lambda: s.find("a", class_="provider-link"),
        lambda: s.find("span", class_="thumbnail"),
    )

    # Scrape unit sale
    unitSold = _firstText(0, lambda: s.find(id="number-enquiries-purchases"))

    # Scrape category: one list of <li> texts per breadcrumb trail.
    # A page without breadcrumbs yields an empty list; "na" is only appended on an error.
    category = []
    try:
        for crumb in s.find_all("ol", class_="breadcrumb pb-0"):
            category.append([item.text.strip() for item in crumb.find_all("li")])
    except Exception:
        category.append("na")

    # Scrape savings
    savings = _firstText("na", lambda: s.find("span", class_="icon-savings-tag price-saving"))

    # Does the course have CPD?
    haveCpd = _badgeFlag(s, "badge badge-dark badge-cpd mt-2")

    # Scrape awarding body: "Endorsed by" layout first, "Awarded by" layout as fallback
    awardingBody = _firstText(
        "na",
        lambda: s.find("div", class_="col").find("a"),
        lambda: s.find("div", class_="small").find("div").find("a"),
    )

    # Scrape qualification name
    qualName = _firstText("na", lambda: s.find("div", class_="small").find("h3", class_="h4"))

    # Scrape cpd point (first text node matching the pattern)
    try:
        cpdPoint = s.body.find_all(string=_CPD_POINTS_PATTERN)[0].strip()
    except Exception:
        cpdPoint = 0

    # Is the course regulated? Assign 1 if regulated, otherwise 0
    isRegulated = _badgeFlag(s, "badge badge-dark badge-regulated mt-2")

    # Does the course offer professional certification?
    hasProfCert = _badgeFlag(s, "badge badge-dark badge-professional mt-2")

    # Check if the course is sold or enquired:
    # 2 = both purchase and enquiry buttons, 1 = purchase button only, 0 = otherwise
    try:
        canBuy = s.find(id="addToBasket")
        canEnquire = s.find(id="enquireNow")
        soldOrEnq = 2 if (canBuy and canEnquire) else 1 if canBuy else 0
    except Exception:
        soldOrEnq = "na"

    # Create a df off scraped variables (insertion order = column order)
    record = {
        "courseTitle": courseTitle,
        "courseLink": url,
        "subtitle": subtitle,
        "courseProvider": courseProvider,
        "offerPrice": offerPrice,
        "originalPrice": originalPrice,
        "unitSold": unitSold,
        "category": category,  # list of lists, stored in a single cell
        "haveCpd": haveCpd,
        "cpdPoint": cpdPoint,
        "awardingBody": [awardingBody],  # kept as a one-element list; the cleaner strips the "['...']" wrapper
        "qualName": qualName,
        "isRegulated": isRegulated,
        "hasProfCert": hasProfCert,
        "savings": savings,
        "soldOrEnq": soldOrEnq,
    }
    # Wrap every value in a list so each becomes exactly one cell of a one-row frame
    df = pd.DataFrame({column: [value] for column, value in record.items()})
    return df.astype(str)



def _categoryPart(categories, pathIdx, levelIdx):
    """Return categories[pathIdx][levelIdx], or NaN when that position does not exist."""
    try:
        return categories[pathIdx][levelIdx]
    except (IndexError, TypeError):
        return np.nan


# This function cleans scraped data.
def cleanAndExtractFeature(df, date=None):
    """df = dataFrame to clean (as returned by scrapeCourseInfo: every column stored as str),
    date = value written to the "date" column; defaults to the module-level `today`,
    return = final cleaned data (one row per courseId, original index kept)

    Adds: date, courseId, savingsPercent, broadCategory1/2, subCategory1/2.
    Drops: savings. offerPrice stays text; originalPrice becomes numeric.
    """
    # Copy the input data
    finalDf = df.copy()

    # Create course id (6th "/"-separated part of the link, without "#") and insert to the df
    courseId = finalDf["courseLink"].str.split("/").str.get(5).str.replace("#", "", regex=False)
    finalDf.insert(loc=0, column="courseId", value=courseId)

    # Insert date
    finalDf.insert(loc=0, column="date", value=today if date is None else date)

    # Clean unit sold: keep the digits only, anything unparsable counts as 0
    soldDigits = finalDf["unitSold"].astype(str).str.replace(r"\D", "", regex=True)
    finalDf["unitSold"] = pd.to_numeric(soldDigits, errors="coerce").fillna(0).astype("int")

    # Clean saving percent: text after "Save" without the percent sign
    savingsText = (
        finalDf["savings"].str.split("Save").str[-1]
        .str.replace("%", "", regex=False)
        .str.strip()
    )
    finalDf["savingsPercent"] = pd.to_numeric(savingsText, errors="coerce").fillna(0).astype("int")

    # Clean offer price (kept as text, e.g. "10" or "na")
    finalDf["offerPrice"] = finalDf["offerPrice"].str.split("£").str[-1].str.replace(",", "", regex=False)

    # Clean original price; regex=False because ")" alone is not a valid regular expression
    originalText = (
        finalDf["originalPrice"].str.split("£").str[-1]
        .str.replace(",", "", regex=False)
        .str.replace(")", "", regex=False)
        .str.strip()
    )
    originalPrice = pd.to_numeric(originalText, errors="coerce")

    # If savings is 0, a missing original price falls back to the offer price.
    # The fallback is converted to a number first so the column stays numeric.
    offerNumeric = pd.to_numeric(finalDf["offerPrice"], errors="coerce")
    finalDf["originalPrice"] = np.where(
        finalDf["savingsPercent"] == 0,
        originalPrice.fillna(offerNumeric),
        originalPrice,
    )

    # Extract CPD point: the number in front of "CPD"
    cpdText = finalDf["cpdPoint"].astype(str).str.split("CPD").str[0].str.strip()
    finalDf["cpdPoint"] = pd.to_numeric(cpdText, errors="coerce").fillna(0).astype(int)

    # Clean CPD provider, only when the input has that column
    # (scrapeCourseInfo does not create it, so an unconditional access raises AttributeError)
    if "cpdProvider" in finalDf.columns:
        finalDf["cpdProvider"] = (
            finalDf["cpdProvider"].str.join("")
            .str.replace("Accredited by", "", regex=False)
            .str.strip()
        )

    # Turn the stored text back into nested lists. literal_eval only accepts Python
    # literals, so scraped page text can never be executed as code (eval() could).
    finalDf["category"] = finalDf["category"].apply(ast.literal_eval)

    # Create broad/sub categories: first and last item of the first and last breadcrumb trail
    finalDf["broadCategory1"] = finalDf["category"].apply(_categoryPart, args=(0, 0))
    finalDf["broadCategory2"] = finalDf["category"].apply(_categoryPart, args=(-1, 0))
    finalDf["subCategory1"] = finalDf["category"].apply(_categoryPart, args=(0, -1))
    finalDf["subCategory2"] = finalDf["category"].apply(_categoryPart, args=(-1, -1))

    # Extract qualification name: values mentioning "CPD" are not qualification names
    finalDf["qualName"] = np.where(finalDf["qualName"].str.contains("CPD"), "na", finalDf["qualName"])

    # Clean awarding body: strip the "['" ... "']" wrapper, blank becomes "na"
    finalDf["awardingBody"] = (
        finalDf["awardingBody"].str[1:-1].str[1:-1]
        .str.strip()
        .replace(r"^\s*$", np.nan, regex=True)
        .fillna("na")
    )

    # Drop "savings" (replaced by savingsPercent)
    finalDf = finalDf.drop(columns="savings")

    # Drop duplicates by "courseId"
    finalDf = finalDf.drop_duplicates("courseId")
    return finalDf

In [3]:
# Wrap all the functions inside main
def main(url, maxWorkers=6, linkPasses=4):
    """Scrape and clean every course of one provider.

    url = provider url used for the 1st request to generate cover pages,
    maxWorkers = number of worker processes used for scraping,
    linkPasses = how many times the cover pages are scraped for course links
                 (repeated passes pick up links that an earlier pass missed),
    return = final cleaned dataframe

    Raises ValueError when no course link could be collected.
    """
    # Record start time
    startTime = time.time()

    # Generates cover pages links
    coverPageLink = generateCoverPageLink(url)

    # First link seen per course id, in scraping order. The id is the 6th
    # "/"-separated part of the link; a link too short to have one is keyed by itself.
    linkById = {}

    with ProcessPoolExecutor(max_workers=maxWorkers) as ex:
        # This loop ensures maximum no of course links scraped
        for _ in range(linkPasses):
            for pageLinks in ex.map(scrapeIndividualCourseLink, coverPageLink):
                for link in pageLinks:
                    parts = link.split("/")
                    courseKey = parts[5] if len(parts) > 5 else link
                    linkById.setdefault(courseKey, link)

        # Keep only the unique links
        courseLink = list(linkById.values())
        if not courseLink:
            raise ValueError(f"No course links found for {url}")

        # Scrapes course info from course link (one single-row frame per course)
        courseInfo = pd.concat(list(ex.map(scrapeCourseInfo, courseLink))).reset_index(drop=True)

    # Cleans and engineers new features from the scraped dataframe and returns the final dataframe
    finalDf = cleanAndExtractFeature(courseInfo).reset_index(drop=True)

    # Measure execution time and return the final df
    endTime = time.time()
    durationInMins = round((endTime-startTime)/60, 2)
    print(f"{url.split('/')[4].capitalize()} ==> {len(courseLink)} Records ==> {durationInMins} Minutes")
    return finalDf

In [4]:
# These are the provider URLs that scraping starts from.
providersUrl = {"oneEdu":"https://www.reed.co.uk/courses/one-education/p1812",
    "courseGate":"https://www.reed.co.uk/courses/course-gate/p1834",
    "janets":"https://www.reed.co.uk/courses/janets/p1778",
    "eustonCollege":"https://www.reed.co.uk/courses/euston-college/p2128",
    "tx":"https://www.reed.co.uk/courses/training-express-ltd/p2079",
    "hfOnline":"https://www.reed.co.uk/courses/academy-for-health-fitness/p2261",
    "beaco":"https://www.reed.co.uk/courses/be-acouk/p545",
    "cpdCourses":"https://www.reed.co.uk/courses/cpd-courses/p1534",
    "brentwood":"https://www.reed.co.uk/courses/brentwood-open-learning-college/p438",
    "oplex":"https://www.reed.co.uk/courses/oplex-careers/p630",
    "oxford":"https://www.reed.co.uk/courses/oxford-home-study-college/p1245",
    "trainingTerminal":"https://www.reed.co.uk/courses/the-training-terminal/p1064",
    "excelWithBusiness":"https://www.reed.co.uk/courses/excel-with-business/p930",
    "ofCourse":"https://www.reed.co.uk/courses/ofcourse/p675",
    "trendimi":"https://www.reed.co.uk/courses/trendimi/p964",
    "centreOfExcellence":"https://www.reed.co.uk/courses/centre-of-excellence-online/p652",
    "leadAcademy":"https://www.reed.co.uk/courses/lead-academy/p2144",
    "bekeCollege":"https://www.reed.co.uk/courses/beke-college-cic/p2140",
    "protrainings":"https://www.reed.co.uk/courses/protrainings-europe-limited/p981",
    "mandatoryCompliance":"https://www.reed.co.uk/courses/mandatory-compliance/p1514",
    "intOpenAcademy":"https://www.reed.co.uk/courses/international-open-academy/p967",
    "skillUp":"https://www.reed.co.uk/courses/skill-up/p2339", 
    "simplyCert":"https://www.reed.co.uk/courses/simply-cert/p669",
    "courseCloud":"https://www.reed.co.uk/courses/course-cloud/p2413",
    "teachersTraining":"https://www.reed.co.uk/courses/the-teachers-training/p2334",
    "1training":"https://www.reed.co.uk/courses/1-training/p1312",
    "instituteOfBeauty":"https://www.reed.co.uk/courses/institute-of-beauty-and-makeup/p2509",
    "animalCare":"https://www.reed.co.uk/courses/the-animal-care/p2520",
    "skillExpress":"https://www.reed.co.uk/courses/skill-express/p2510",
    "complianceCentral":"https://www.reed.co.uk/courses/compliance-central/p2584",
    "skillSuccess":"https://www.reed.co.uk/courses/skills-success/p1341",
    "globalEdulink":"https://www.reed.co.uk/courses/globaledulink/p533",
    "ukProfessionalDev":"https://www.reed.co.uk/courses/uk-professional-development-academy-ltd/p1749",
    "study365":"https://www.reed.co.uk/courses/study365/p1060",
    "staffTrainingSolution":"https://www.reed.co.uk/courses/staff-training-solutions/p1477",
    "inspireLondonCollege":"https://www.reed.co.uk/courses/inspire-london-college-ltd/p1746",
    "simplivllc":"https://www.reed.co.uk/courses/simpliv-llc/p1999",
    "echo3Education":"https://www.reed.co.uk/courses/echo3-education-limited/p1619",
    "lifeSavingTraining":"https://www.reed.co.uk/courses/life-saving-training-ltd/p2218",
    "southLondonCollege":"https://www.reed.co.uk/courses/south-london-college/p1405",
    "knowledgeDoor":"https://www.reed.co.uk/courses/knowledge-door/p2538",
    "hollyAndHugo":"https://www.reed.co.uk/courses/holly-and-hugo/p965",
    "coursePride":"https://www.reed.co.uk/courses/course-pride/p1706",
    "activeRecruitment":"https://www.reed.co.uk/courses/active-recruitment-ltd/p2041",
    "eCareersLifestyle":"https://www.reed.co.uk/courses/e-careerslifestyle/p1733",
    "eventtrix":"https://www.reed.co.uk/courses/eventtrix/p966",
    "apexLearning":"https://www.reed.co.uk/courses/apex-learning/p2601",
    "studyHub":"https://www.reed.co.uk/courses/studyhub/p2675",
    "nextLearn":"https://www.reed.co.uk/courses/next-level-academy/p1727"
    }

In [5]:
# Run the scrape-and-clean pipeline for the "skillSuccess" provider URL
skillSuccess = main(providersUrl["skillSuccess"])



Skills-success ==> 2290 Records ==> 11.34 Minutes


In [6]:
# Run the scrape-and-clean pipeline for the "globalEdulink" provider URL
globalEdulink = main(providersUrl["globalEdulink"])

Globaledulink ==> 955 Records ==> 5.66 Minutes




In [7]:
# Run the scrape-and-clean pipeline for the "ukProfessionalDev" provider URL
ukProfessionalDev = main(providersUrl["ukProfessionalDev"])

Uk-professional-development-academy-ltd ==> 646 Records ==> 5.76 Minutes




In [8]:
# Run the scrape-and-clean pipeline for the "study365" provider URL
study365 = main(providersUrl["study365"])



Study365 ==> 1108 Records ==> 6.19 Minutes


In [9]:
# Run the scrape-and-clean pipeline for the "staffTrainingSolution" provider URL
staffTrainingSolution = main(providersUrl["staffTrainingSolution"])

Staff-training-solutions ==> 264 Records ==> 1.58 Minutes




In [10]:
# Run the scrape-and-clean pipeline for the "inspireLondonCollege" provider URL
inspireLondonCollege = main(providersUrl["inspireLondonCollege"])

Inspire-london-college-ltd ==> 208 Records ==> 1.3 Minutes




In [11]:
# Run the scrape-and-clean pipeline for the "simplivllc" provider URL
simplivllc = main(providersUrl["simplivllc"])



Simpliv-llc ==> 4437 Records ==> 21.08 Minutes


In [12]:
# Run the scrape-and-clean pipeline for the "echo3Education" provider URL
echo3Education = main(providersUrl["echo3Education"])

Echo3-education-limited ==> 75 Records ==> 0.47 Minutes




In [13]:
# Run the scrape-and-clean pipeline for the "lifeSavingTraining" provider URL
lifeSavingTraining = main(providersUrl["lifeSavingTraining"])

Life-saving-training-ltd ==> 34 Records ==> 0.24 Minutes




In [14]:
# Run the scrape-and-clean pipeline for the "southLondonCollege" provider URL
southLondonCollege = main(providersUrl["southLondonCollege"])

South-london-college ==> 299 Records ==> 1.84 Minutes




In [55]:
# Run the scrape-and-clean pipeline for the "knowledgeDoor" provider URL
knowledgeDoor = main(providersUrl["knowledgeDoor"])



Knowledge-door ==> 2336 Records ==> 13.08 Minutes


In [17]:
# Run the scrape-and-clean pipeline for the "hollyAndHugo" provider URL
hollyAndHugo = main(providersUrl["hollyAndHugo"])

Holly-and-hugo ==> 17 Records ==> 0.24 Minutes




In [18]:
# Run the scrape-and-clean pipeline for the "coursePride" provider URL
coursePride = main(providersUrl["coursePride"])

Course-pride ==> 387 Records ==> 2.29 Minutes




In [19]:
# Run the scrape-and-clean pipeline for the "activeRecruitment" provider URL
activeRecruitment = main(providersUrl["activeRecruitment"])

Active-recruitment-ltd ==> 160 Records ==> 0.88 Minutes




In [20]:
# Run the scrape-and-clean pipeline for the "eCareersLifestyle" provider URL
eCareersLifestyle = main(providersUrl["eCareersLifestyle"])

E-careerslifestyle ==> 224 Records ==> 1.26 Minutes




In [21]:
# Run the scrape-and-clean pipeline for the "eventtrix" provider URL
eventtrix = main(providersUrl["eventtrix"])

Eventtrix ==> 18 Records ==> 0.19 Minutes




In [22]:
# Run the scrape-and-clean pipeline for the "trendimi" provider URL
trendimi = main(providersUrl["trendimi"])

Trendimi ==> 47 Records ==> 0.38 Minutes




In [24]:
# Run the scrape-and-clean pipeline for the "oneEdu" provider URL
oneEdu = main(providersUrl["oneEdu"])

One-education ==> 2242 Records ==> 13.16 Minutes




In [25]:
# Run the scrape-and-clean pipeline for the "janets" provider URL
janets = main(providersUrl["janets"])

Janets ==> 1480 Records ==> 8.53 Minutes




In [26]:
# Run the scrape-and-clean pipeline for the "tx" provider URL
tx = main(providersUrl["tx"])

Training-express-ltd ==> 592 Records ==> 3.5 Minutes




In [27]:
# Run the scrape-and-clean pipeline for the "hfOnline" provider URL
hfOnline = main(providersUrl["hfOnline"])

Academy-for-health-fitness ==> 898 Records ==> 5.25 Minutes




In [28]:
# Run the scrape-and-clean pipeline for the "beaco" provider URL
beaco = main(providersUrl["beaco"])

Be-acouk ==> 334 Records ==> 1.89 Minutes




In [29]:
# Run the scrape-and-clean pipeline for the "cpdCourses" provider URL
cpdCourses = main(providersUrl["cpdCourses"])

Cpd-courses ==> 816 Records ==> 5.19 Minutes




In [30]:
# Run the scrape-and-clean pipeline for the "brentwood" provider URL
brentwood = main(providersUrl["brentwood"])

Brentwood-open-learning-college ==> 283 Records ==> 1.79 Minutes




In [31]:
# Run the scrape-and-clean pipeline for the "oplex" provider URL
oplex = main(providersUrl["oplex"])

Oplex-careers ==> 415 Records ==> 2.25 Minutes




In [32]:
# Run the scrape-and-clean pipeline for the "oxford" provider URL
oxford = main(providersUrl["oxford"])

Oxford-home-study-college ==> 249 Records ==> 1.57 Minutes




In [33]:
# Run the scrape-and-clean pipeline for the "trainingTerminal" provider URL
trainingTerminal = main(providersUrl["trainingTerminal"])

The-training-terminal ==> 107 Records ==> 0.69 Minutes




In [34]:
# Run the scrape-and-clean pipeline for the "excelWithBusiness" provider URL
excelWithBusiness = main(providersUrl["excelWithBusiness"])

Excel-with-business ==> 51 Records ==> 0.36 Minutes




In [35]:
# Run the scrape-and-clean pipeline for the "ofCourse" provider URL
ofCourse = main(providersUrl["ofCourse"])

Ofcourse ==> 617 Records ==> 3.07 Minutes




In [36]:
# Run the scrape-and-clean pipeline for the "centreOfExcellence" provider URL
centreOfExcellence = main(providersUrl["centreOfExcellence"])

Centre-of-excellence-online ==> 427 Records ==> 2.28 Minutes




In [37]:
# Run the scrape-and-clean pipeline for the "leadAcademy" provider URL
leadAcademy = main(providersUrl["leadAcademy"])

Lead-academy ==> 2048 Records ==> 11.51 Minutes




In [38]:
# Run the scrape-and-clean pipeline for the "bekeCollege" provider URL
bekeCollege = main(providersUrl["bekeCollege"])

Beke-college-cic ==> 40 Records ==> 0.27 Minutes




In [39]:
# Run the scrape-and-clean pipeline for the "protrainings" provider URL
protrainings = main(providersUrl["protrainings"])

Protrainings-europe-limited ==> 116 Records ==> 0.74 Minutes




In [40]:
# Run the scrape-and-clean pipeline for the "mandatoryCompliance" provider URL
mandatoryCompliance = main(providersUrl["mandatoryCompliance"])

Mandatory-compliance ==> 326 Records ==> 1.94 Minutes




In [41]:
# Run the scrape-and-clean pipeline for the "intOpenAcademy" provider URL
intOpenAcademy = main(providersUrl["intOpenAcademy"])

International-open-academy ==> 157 Records ==> 1.02 Minutes




In [42]:
# Run the scrape-and-clean pipeline for the "skillUp" provider URL
skillUp = main(providersUrl["skillUp"])

Skill-up ==> 506 Records ==> 2.95 Minutes




In [43]:
# Run the scrape-and-clean pipeline for the "simplyCert" provider URL
simplyCert = main(providersUrl["simplyCert"])

Simply-cert ==> 226 Records ==> 1.24 Minutes




In [44]:
# Run the scrape-and-clean pipeline for the "courseCloud" provider URL
courseCloud = main(providersUrl["courseCloud"])

Course-cloud ==> 1202 Records ==> 6.95 Minutes




In [45]:
# Run the scrape-and-clean pipeline for the "teachersTraining" provider URL
teachersTraining = main(providersUrl["teachersTraining"])

The-teachers-training ==> 417 Records ==> 2.49 Minutes




In [46]:
# Run the scrape-and-clean pipeline for the "1training" provider URL
# (the variable is named oneTraining because an identifier cannot start with a digit)
oneTraining = main(providersUrl["1training"])

1-training ==> 760 Records ==> 4.44 Minutes




In [47]:
# Run the scrape-and-clean pipeline for the "instituteOfBeauty" provider URL
instituteOfBeauty = main(providersUrl["instituteOfBeauty"])

Institute-of-beauty-and-makeup ==> 168 Records ==> 1.04 Minutes




In [48]:
# Run the scrape-and-clean pipeline for the "animalCare" provider URL
animalCare = main(providersUrl["animalCare"])

The-animal-care ==> 116 Records ==> 0.76 Minutes




In [49]:
# Run the scrape-and-clean pipeline for the "skillExpress" provider URL
skillExpress = main(providersUrl["skillExpress"])

Skill-express ==> 302 Records ==> 1.82 Minutes




In [50]:
# Run the scrape-and-clean pipeline for the "complianceCentral" provider URL
complianceCentral = main(providersUrl["complianceCentral"])

Compliance-central ==> 313 Records ==> 1.92 Minutes




In [51]:
# Run the scrape-and-clean pipeline for the "apexLearning" provider URL
apexLearning = main(providersUrl["apexLearning"])

Apex-learning ==> 500 Records ==> 2.99 Minutes




In [52]:
# Run the scrape-and-clean pipeline for the "studyHub" provider URL
studyHub = main(providersUrl["studyHub"])

Studyhub ==> 501 Records ==> 2.94 Minutes




In [53]:
# Run the scrape-and-clean pipeline for the "nextLearn" provider URL
nextLearn = main(providersUrl["nextLearn"])

Next-level-academy ==> 616 Records ==> 3.52 Minutes




In [56]:
# Stack the per-provider dataframes (in this order) into one master dataframe
providerFrames = [
    oneEdu, janets, tx, hfOnline, beaco,
    cpdCourses, brentwood, oplex, oxford, trainingTerminal,
    excelWithBusiness, ofCourse, trendimi, centreOfExcellence, leadAcademy,
    bekeCollege, protrainings, mandatoryCompliance, intOpenAcademy, skillUp,
    simplyCert, courseCloud, teachersTraining, oneTraining, instituteOfBeauty,
    animalCare, skillExpress, complianceCentral, skillSuccess, globalEdulink,
    ukProfessionalDev, study365, staffTrainingSolution, inspireLondonCollege, simplivllc,
    echo3Education, lifeSavingTraining, southLondonCollege, knowledgeDoor, hollyAndHugo,
    coursePride, activeRecruitment, eCareersLifestyle, eventtrix, apexLearning,
    studyHub, nextLearn,
]
# ignore_index gives the combined frame a fresh 0..n-1 index
masterDf = pd.concat(providerFrames, ignore_index=True)

In [58]:
# Number of course titles per provider, largest first
courseCountByProvider = masterDf.groupby("courseProvider")["courseTitle"].agg(["count"])
courseCountByProvider.sort_values(by="count", ascending=False)

Unnamed: 0_level_0,count
courseProvider,Unnamed: 1_level_1
Simpliv LLC,4437
Knowledge Door,2336
Skill Success,2290
One Education,2242
Lead Academy,2048
Janets,1480
Course Cloud,1202
Study365,1108
Global Edulink,955
Academy for Health & Fitness,898


In [59]:
# (number of distinct providers, number of rows)
(masterDf["courseProvider"].nunique(), len(masterDf))

(47, 30330)

In [60]:
# Show the first five rows of the combined data
masterDf.head(n=5)

Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,awardingBody,qualName,isRegulated,hasProfCert,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2
0,16_Apr_21,234929,Car Mechanic Training,https://www.reed.co.uk/courses/car-mechanic-tr...,Spring Shopping Spree!! | Accredited by IAO & ...,One Education,10,425,4535,"[[Driving, Car maintenance], [Driving, Mechani...",...,na,na,0,0,1,97,Driving,Engineering,Car maintenance,Mechanical engineering
1,16_Apr_21,282714,Mental Health,https://www.reed.co.uk/courses/mental-health/2...,Level 5 Endorsed Mental Health Awareness Diplo...,One Education,10,425,445,"[[Law, Mental health law], [Health & care, Soc...",...,The Quality Licence Scheme,na,0,0,1,97,Law,Health & care,Mental health law,Mental health nursing
2,16_Apr_21,233614,Estate Agent Diploma,https://www.reed.co.uk/courses/estate-agent-di...,Spring Shopping Spree !! | Level 5 Endorsed Di...,One Education,18,425,2569,"[[Sales], [Construction, Estate agent], [Surve...",...,The Quality Licence Scheme,na,0,0,1,95,Sales,Surveying,Sales,Estate agent
3,16_Apr_21,239173,HR (Human Resources) and Payroll Administrator,https://www.reed.co.uk/courses/hr-human-resour...,Spring Shopping Spree !! | Level 7 Endorsed Ad...,One Education,10,425,2792,"[[Accounting and finance, Payroll], [HR, Manag...",...,The Quality Licence Scheme,na,0,0,1,97,Accounting and finance,HR,Payroll,HR Management
4,16_Apr_21,239198,Property Development,https://www.reed.co.uk/courses/property-develo...,Level 5 Endorsed Diploma | 150 CPD Points | Ad...,One Education,10,425,485,"[[Surveying, Estate management], [Surveying, P...",...,The Quality Licence Scheme,na,0,0,1,97,Surveying,Surveying,Estate management,Property development


In [61]:
# Write the combined data to a dated CSV file, without the index column
# NOTE(review): absolute, user-specific path; consider a configurable data directory
masterCsvPath = f"/home/faysal/Desktop/masterData/15Competitor/{today}_15_providers.csv"
masterDf.to_csv(masterCsvPath, index=None)

In [62]:
# Load the dated CSV file back from disk
# NOTE(review): absolute, user-specific path; consider a configurable data directory
savedCsvPath = f"/home/faysal/Desktop/masterData/15Competitor/{today}_15_providers.csv"
df = pd.read_csv(savedCsvPath)

In [64]:
# Split the rows on whether the provider name is the "na" placeholder
providerMissing = df["courseProvider"] == "na"
isNa = df[providerMissing]
notNa = df[~providerMissing]
isNa

Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,awardingBody,qualName,isRegulated,hasProfCert,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2
