In [434]:
import re
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import requests

# you must populate this dict with the schools required -> try talking to the teaching team about this

schools = {   
    'flatiron-school' : 10748,
    'general-assembly' :10761,
    'data-science-dojo' : 10685,
    'ironhack' : 10828,
    'le-wagon' : 10868,
    'wild-code-school' : 11169,
    'the-dev-masters' : 11077,
    'byte-academy' : 10574,
    'the-data-incubator' : 11074,
    "brainstation" : 10571,
    }

def get_comments_school(school):
    TAG_RE = re.compile(r'<[^>]+>')
    # defines url to make api call to data -> dynamic with school if you want to scrape competition
    url = "https://www.switchup.org/chimera/v1/school-review-list?mainTemplate=school-review-list&path=%2Fbootcamps%2F" + school + "&isDataTarget=false&page=3&perPage=10000&simpleHtml=true&truncationLength=250"
    #makes get request and converts answer to json
    # url defines the page of all the information, request is made, and information is returned to data variable
    data = requests.get(url).json()
    #converts json to dataframe
    reviews =  pd.DataFrame(data['content']['reviews'])

  
    #aux function to apply regex and remove tags
    def remove_tags(x):
        return TAG_RE.sub('',x)
    reviews['review_body'] = reviews['body'].apply(remove_tags)
    reviews['school'] = school
    return reviews   
    

In [435]:
# could you write this as a list comprehension?
# comments = [comments.append(get_comments_school(school)) for school in schools.keys()]

comments = []

for school in schools.keys():
    print(school)
    comments.append(get_comments_school(school))

comments = pd.concat(comments)

flatiron-school
general-assembly
data-science-dojo
ironhack
le-wagon
wild-code-school
the-dev-masters
byte-academy
the-data-incubator
brainstation


In [436]:
comments.head(2)

Unnamed: 0,id,name,anonymous,hostProgramName,graduatingYear,isAlumni,jobTitle,tagline,body,rawBody,...,queryDate,program,user,overallScore,comments,overall,curriculum,jobSupport,review_body,school
0,297686,Anonymous,True,,2021.0,False,Software Engineer,Waste of time. They want your money.,"<span class=""truncatable""><p></p><p>I found my...",<p>I found my therapist through Flatiron. That...,...,2023-02-07,Online Software Engineering,{'image': None},1.3,[],1.0,2.0,1.0,I found my therapist through Flatiron. That's ...,flatiron-school
1,290538,Anonymous,True,,2020.0,False,,Negative experience,"<span class=""truncatable""><p></p><p>I have a m...",<p>I have a money-back guarantee with them for...,...,2022-07-09,,{'image': None},1.7,[],2.0,2.0,1.0,I have a money-back guarantee with them for jo...,flatiron-school


In [438]:
comments.drop(['anonymous', 'hostProgramName', 'body', 'rawBody', 'user', 'comments', 'overallScore','queryDate' ], axis=1, inplace = True, errors='ignore')

In [439]:
comments.head()

Unnamed: 0,id,name,graduatingYear,isAlumni,jobTitle,tagline,createdAt,program,overall,curriculum,jobSupport,review_body,school
0,297686,Anonymous,2021.0,False,Software Engineer,Waste of time. They want your money.,2/7/2023,Online Software Engineering,1.0,2.0,1.0,I found my therapist through Flatiron. That's ...,flatiron-school
1,290538,Anonymous,2020.0,False,,Negative experience,7/9/2022,,2.0,2.0,1.0,I have a money-back guarantee with them for jo...,flatiron-school
2,288383,Anonymous,2021.0,False,Data Scientist,Great learning Experience,5/2/2022,Online Data Science,5.0,5.0,5.0,I had a great learning experience at Flatiron ...,flatiron-school
3,286189,Yeva Usatova,2022.0,False,Software engineer part time,The worst place ever,2/24/2022,,1.0,1.0,1.0,If you are working full time and looking for a...,flatiron-school
4,285483,Lisa Arends,2021.0,False,Data Analyst II,Empowering and Life-Changing,1/31/2022,Online Data Science,5.0,5.0,5.0,"After teaching math for 20 years, I decided th...",flatiron-school


In [440]:
comments.rename(columns  = {'id' : 'comment_id', 'name' : 'student_name', 'graduatingYear' : 'graduation_year', 'isAlumni': 'alumni', 'jobTitle' : 'job_title', 'tagline' : 'review_tag', 'createdAt' : 'review_date', 'overall':'overall_rating', 'curriculum': 'course_rating', 'jobSupport': 'support_rating', 'review_body' : 'comment'}, inplace = True)

In [442]:

comments['graduation_year'] = comments.graduation_year.apply(np.round).astype("Int64")

In [447]:
comments.head(2)

Unnamed: 0,comment_id,student_name,graduation_year,alumni,job_title,review_tag,review_date,program,overall_rating,course_rating,support_rating,comment,school
0,297686,Anonymous,2021,False,Software Engineer,Waste of time. They want your money.,2/7/2023,Online Software Engineering,1.0,2.0,1.0,I found my therapist through Flatiron. That's ...,flatiron-school
1,290538,Anonymous,2020,False,,Negative experience,7/9/2022,,2.0,2.0,1.0,I have a money-back guarantee with them for jo...,flatiron-school


In [448]:
import mysql.connector

In [449]:
# Import module
import pymysql
import getpass  # to view password print the password
password= getpass.getpass()

In [450]:
#  establishing a connection to a database --> group of tables
cnx = mysql.connector.connect(user ='root',
                             password = password,
                             host = 'localhost',
                             database = 'project',
                             auth_plugin = 'mysql_native_password')

In [451]:
# check if the connection was successful
cnx.is_connected()

True

In [498]:
# import the module
from sqlalchemy import create_engine

# create sqlalchemy engine
engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
                       .format(user="root",
                               pw= password,
                               db="project"))

In [499]:
cursor = cnx.cursor()
cursor

<mysql.connector.cursor_cext.CMySQLCursor at 0x1aff7e90fa0>

In [500]:
query = ('''CREATE DATABASE project IF NOT EXISTS;''')

In [460]:
# export DataFrame comments to SQL
comments.to_csv("comments.csv")
comments.to_sql("comments", engine, if_exists = 'replace', index = False)

5798

In [475]:
from pandas import json_normalize

def get_school_info(school, school_id):
    url = 'https://www.switchup.org/chimera/v1/bootcamp-data?mainTemplate=bootcamp-data%2Fdescription&path=%2Fbootcamps%2F'+ str(school) + '&isDataTarget=false&bootcampId='+ str(school_id) + '&logoTag=logo&truncationLength=250&readMoreOmission=...&readMoreText=Read%20More&readLessText=Read%20Less'

    data = requests.get(url).json()

    data.keys()

    courses = data['content']['courses']
    courses_df = pd.DataFrame(courses, columns= ['courses'])

    locations = data['content']['locations']
    locations_df = json_normalize(locations)

    badges_df = pd.DataFrame(data['content']['meritBadges'])
    
    website = data['content']['webaddr']
    description = data['content']['description']
    logoUrl = data['content']['logoUrl']
    school_df = pd.DataFrame([website,description,logoUrl]).T
    school_df.columns =  ['website','description','LogoUrl']

    locations_df['school'] = school
    courses_df['school'] = school
    badges_df['school'] = school
    school_df['school'] = school
    

    locations_df['school_id'] = school_id
    courses_df['school_id'] = school_id
    badges_df['school_id'] = school_id
    school_df['school_id'] = school_id

    return locations_df, courses_df, badges_df, school_df

locations_list = []
courses_list = []
badges_list = []
schools_list = []

for school, id in schools.items():
    print(school)
    a,b,c,d = get_school_info(school,id)
    
    locations_list.append(a)
    courses_list.append(b)
    badges_list.append(c)
    schools_list.append(d)



flatiron-school
general-assembly
data-science-dojo
ironhack
le-wagon
wild-code-school
the-dev-masters
byte-academy
the-data-incubator
brainstation


In [476]:
locations_list

[      id        description  country.id    country.name country.abbrev  \
 0  15743  New York City, NY         1.0   United States             US   
 1  16257             Online         NaN             NaN            NaN   
 2  16761         Denver, CO         1.0   United States             US   
 3  16958    London, England        11.0  United Kingdom             GB   
 
    state.id state.name state.abbrev state.keyword  city.id city.name  \
 0      34.0   New York           NY      new-york   1507.0       NYC   
 1       1.0     Online       Online        online      NaN       NaN   
 2       7.0   Colorado           CO      colorado     27.0    Denver   
 3       NaN        NaN          NaN           NaN  31176.0    London   
 
     city.keyword           school  school_id  
 0  new-york-city  flatiron-school      10748  
 1            NaN  flatiron-school      10748  
 2         denver  flatiron-school      10748  
 3         london  flatiron-school      10748  ,
       id      

In [477]:
locations = pd.concat(locations_list)
locations.head(2)

Unnamed: 0,id,description,country.id,country.name,country.abbrev,state.id,state.name,state.abbrev,state.keyword,city.id,city.name,city.keyword,school,school_id
0,15743,"New York City, NY",1.0,United States,US,34.0,New York,NY,new-york,1507.0,NYC,new-york-city,flatiron-school,10748
1,16257,Online,,,,1.0,Online,Online,online,,,,flatiron-school,10748


In [478]:
locations.drop(['description', 'country.abbrev', 'state.abbrev', 'state.keyword', 'city.keyword'], axis = 1, inplace = True, errors = "ignore")


In [479]:
locations.head(2)

Unnamed: 0,id,country.id,country.name,state.id,state.name,city.id,city.name,school,school_id
0,15743,1.0,United States,34.0,New York,1507.0,NYC,flatiron-school,10748
1,16257,,,1.0,Online,,,flatiron-school,10748


In [481]:
locations.rename(columns = {'id' : 'location_id', 'country_id' : 'country_id', "country.name" : "country_name", 'state.id' : 'state_id', 'state.name' : 'state_name', 'city.id' : 'city_id', 'city.name' : 'city_name'}, inplace = True)

In [482]:
locations.to_csv('locations.csv')
locations.to_sql('locations', engine, if_exists = 'replace', index = False)
locations

Unnamed: 0,location_id,country.id,country_name,state_id,state_name,city_id,city_name,school,school_id
0,15743,1.0,United States,34.0,New York,1507.0,NYC,flatiron-school,10748
1,16257,,,1.0,Online,,,flatiron-school,10748
2,16761,1.0,United States,7.0,Colorado,27.0,Denver,flatiron-school,10748
3,16958,11.0,United Kingdom,,,31176.0,London,flatiron-school,10748
0,16360,1.0,United States,34.0,New York,1507.0,NYC,general-assembly,10761
...,...,...,...,...,...,...,...,...,...
3,16909,1.0,United States,23.0,Massachusetts,47.0,Boston,brainstation,10571
4,16910,1.0,United States,15.0,Illinois,39.0,Chicago,brainstation,10571
5,16940,,,1.0,Online,,,brainstation,10571
6,17706,1.0,United States,11.0,Florida,31.0,Miami,brainstation,10571


In [484]:
courses = pd.concat(courses_list)
courses.head(10)



Unnamed: 0,courses,school,school_id
0,Cybersecurity Engineering,flatiron-school,10748
1,Data Science,flatiron-school,10748
2,Product Design (UX/UI),flatiron-school,10748
3,Software Engineering,flatiron-school,10748
4,Flatiron School’s ‘Intro to Product Design’ le...,flatiron-school,10748
5,Flatiron School’s Learn Cybersecurity for Free...,flatiron-school,10748
6,Flatiron School’s Learn Data Science for Free....,flatiron-school,10748
7,Flatiron School’s Learn to Code for Free. Sign up,flatiron-school,10748
8,Online Cybersecurity Engineering,flatiron-school,10748
9,Online Data Science,flatiron-school,10748


In [486]:
badges = pd.concat(badges_list)


In [496]:
badges.head()

Unnamed: 0,name,keyword,description,school,school_id
0,Available Online,available_online,<p>School offers fully online courses</p>,flatiron-school,10748
1,Verified Outcomes,verified_outcomes,<p>School publishes a third-party verified out...,flatiron-school,10748
2,Flexible Classes,flexible_classes,<p>School offers part-time and evening classes...,flatiron-school,10748
3,Accepts GI Bill,accepts_gi_bill,<p>School accepts the GI Bill</p>,flatiron-school,10748
0,Available Online,available_online,<p>School offers fully online courses</p>,general-assembly,10761


In [497]:
badges['detail'] =  badges['description'].apply(remove_tags)

NameError: name 'remove_tags' is not defined

In [290]:
# any data cleaning still missing here? take a look at the description
schools = pd.concat(schools_list)
schools.head(15)

Unnamed: 0,website,description,LogoUrl,school,school_id
0,flatironschool.com/?utm_campaign=FIS%20Partner...,"<span class=""truncatable""><p>Founded in 2012 a...",https://d92mrp7hetgfk.cloudfront.net/images/si...,flatiron-school,10748
0,generalassemb.ly?utm_source=switchup&utm_mediu...,"<span class=""truncatable""><p>General Assembly ...",https://d92mrp7hetgfk.cloudfront.net/images/si...,general-assembly,10761
0,datasciencedojo.com/data-science-bootcamp,"<span class=""truncatable""><p>Data Science Dojo...",https://d92mrp7hetgfk.cloudfront.net/images/si...,data-science-dojo,10685
0,www.ironhack.com/en,"<span class=""truncatable""><p>Ironhack is a glo...",https://d92mrp7hetgfk.cloudfront.net/images/si...,ironhack,10828
0,www.lewagon.com,"<span class=""truncatable""><p>Le Wagon is a glo...",https://d92mrp7hetgfk.cloudfront.net/images/si...,le-wagon,10868
0,wildcodeschool.com,"<span class=""truncatable""><p>Wild Code School ...",https://d92mrp7hetgfk.cloudfront.net/images/si...,wild-code-school,11169
0,www.thedevmasters.com,"<span class=""truncatable""><p>The goal of theDe...",https://d92mrp7hetgfk.cloudfront.net/images/si...,the-dev-masters,11077
0,byteacademy.co,"<span class=""truncatable""><p>Byte Academy is a...",https://d92mrp7hetgfk.cloudfront.net/images/si...,byte-academy,10574
0,www.thedataincubator.com,"<span class=""truncatable""><p>The Data Incubato...",https://d92mrp7hetgfk.cloudfront.net/images/si...,the-data-incubator,11074
0,brainstation.io,"<span class=""truncatable""><p>BrainStation is t...",https://d92mrp7hetgfk.cloudfront.net/images/si...,brainstation,10571


In [300]:
courses.to_csv("course.csv")

In [302]:
course = 'course'
courses.to_sql(course, engine, if_exists = 'replace', index = False)

84

In [303]:
badges.to_csv("badges.csv")


In [304]:
badges.to_sql('badges', engine, if_exists = 'replace', index = False)

25

In [305]:
schools.to_csv("schools.csv")

In [306]:
schools.to_sql('schools', engine, if_exists = 'replace', index = False)

10