# <center>Web Scraping</center>

### 1. Loading Dependencies

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Have the notebook display the value of every expression statement in a cell,
# not only the last one (this is why cells below show several outputs).
InteractiveShell.ast_node_interactivity = "all"
import requests
from bs4 import BeautifulSoup 
import numpy as np
import pandas as pd
import re
from decimal import Decimal
import csv
import os
import string
import nltk
import ast
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
#import math
#import matplotlib.pyplot as plt

### 2. Searching the drugs by category
We use a WebMD condition page to get the top 5 drugs for the specified category. Here we have used "Chronic Pain" as our category and have obtained the top 5 drugs after applying some optimization techniques.

In [2]:
# Download the WebMD "drugs by condition" listing for chronic pain and parse its HTML.
page = requests.get(
    "https://www.webmd.com/drugs/2/condition-3090/chronic%20pain"
)
soup = BeautifulSoup(page.content, "html.parser")
#print(soup.prettify())

In [3]:
#print(containers[0])
# Locate the drug listing tables (CSS class "drugs-treatments-table"); the first
# match is the one that the drug names and review links are read from.
containers = soup.find_all("table", attrs={"class": "drugs-treatments-table"})
c = containers[0]

### 3. Collecting the links to obtain drug IDs, names, and the number of reviews.

In [4]:
# Keep only the anchors whose href contains "drugreview" (the review-page links);
# filtering on that substring avoids collecting repeated links.
links = c.find_all("a", href=re.compile("drugreview"))
links[0]  # shown by the notebook as a sample of the matched anchors
for link in links:
    href = link.get('href')
    print(href)

<a href="/drugs/drugreview-4398-tramadol-hcl.aspx?drugid=4398&amp;drugname=tramadol-hcl">2905 Reviews</a>

/drugs/drugreview-4398-tramadol-hcl.aspx?drugid=4398&drugname=tramadol-hcl
/drugs/drugreview-4398-tramadol-hcl-er.aspx?drugid=4398&drugname=tramadol-hcl-er
/drugs/drugreview-1025-oxycodone-hcl.aspx?drugid=1025&drugname=oxycodone-hcl
/drugs/drugreview-1025-oxycodone-hcl-concentrate.aspx?drugid=1025&drugname=oxycodone-hcl-concentrate
/drugs/drugreview-1025-oxycodone-hcl-er.aspx?drugid=1025&drugname=oxycodone-hcl-er
/drugs/drugreview-1025-oxycodone-tablet-only.aspx?drugid=1025&drugname=oxycodone-tablet-only
/drugs/drugreview-2798-oxycontin.aspx?drugid=2798&drugname=oxycontin
/drugs/drugreview-64741-suboxone-film-medicated.aspx?drugid=64741&drugname=suboxone-film-medicated
/drugs/drugreview-64741-suboxone-tablet.aspx?drugid=64741&drugname=suboxone-tablet
/drugs/drugreview-2671-methadone-hcl.aspx?drugid=2671&drugname=methadone-hcl
/drugs/drugreview-2671-methadone-hcl-tablet.aspx?drugid=2671&drugname=methadone-hcl-tablet
/drugs/drugreview-2671-methadone-hcl-tablet-soluble.aspx?drugid=2671&dr

In [5]:
# Collect the drug id carried in the query string of every review link.
# The id is the text between "drugid=" and the next "&"; partition() also copes
# with "drugid=" being the last parameter (no trailing "&") and yields "" when
# the parameter is absent, so `did` always stays aligned with `links`.
did = [
    link.get('href').partition("drugid=")[2].partition("&")[0]
    for link in links
]

print(did)

['4398', '4398', '1025', '1025', '1025', '1025', '2798', '64741', '64741', '2671', '2671', '2671', '144617', '327', '327', '327', '327', '327', '1507', '1509', '32971', '94892', '75100', '155153', '156955', '64748', '152917', '92576', '144614', '144614', '171739', '167438', '9730', '9730', '9730', '64740', '64740', '1508', '4101', '152125', '165699', '11996', '164754', '152562', '152562', '5360', '170329', '387', '166704', '18450', '4105', '155016', '63814', '9143', '156872', '92515', '9480', '9480', '4149', '165341', '165341', '61838', '156918', '4876', '152909', '152569', '406', '170175', '13733', '61833', '163158', '57795', '171738', '4100', '166431', '94727', '170241', '78328', '155959', '148740', '13732', '13735', '977', '977', '61834', '61835', '1925', '171613', '154348', '148312', '170232', '94724', '22001', '9383', '165098', '166836', '173366', '170300', '10683']


In [6]:
#c.findAll('a')
# Drug names are the text of the anchors whose href contains "details";
# only the first 12 entries are kept.
drug = c.find_all("a", href=re.compile("details"))
dlist = [drug[i].text for i in range(12)]
#print(dlist)

In [7]:
# Number of reviews for each of the first 12 drugs.
# The link text has the form "2905 Reviews", so the first space-separated token
# is the count (kept as a string).  The anchors are queried once up front rather
# than on every loop iteration.
review = c.find_all("a", href=re.compile("drugreview"))
rlist = [review[i].text.split(" ")[0] for i in range(12)]
#print(rlist)


### 4. Creating a dataframe of top 5 drugs by eliminating duplicates

In [8]:
# Table of drug name, review count and drug id for the first 12 listed entries.
df = pd.DataFrame({'Drug': dlist, 'Reviews': rlist, 'Drug_ID': did[:12]})

# Rows sharing the same review count are treated as duplicates of one drug;
# only the first occurrence of each count is kept.
drugsv = df.drop_duplicates(subset='Reviews', keep="first")

In [10]:
# Save the de-duplicated drug table as CSV.
# NOTE(review): the destination is an absolute, machine-specific path; adjust it
# before running this notebook on another machine.
drugsv.to_csv('/Users/grv/Downloads/result1.csv')

### 5. Function to Iterate over the strings to be replaced

In [11]:
def replaceMultiple(mainString, toBeReplaces, newString):
    """Return mainString with every listed substring replaced by newString.

    The substrings in toBeReplaces are applied one after another, in order,
    each on the result of the previous replacement.
    """
    result = mainString
    for target in toBeReplaces:
        # Nothing to do for substrings that do not occur in the current text.
        if target not in result:
            continue
        result = result.replace(target, newString)
    return result

### 6. Scraping
The data for the drug name is scraped from the webpage. It includes the reviews, ratings in 3 different categories and comments. The scraped data is written to a CSV file that is saved on the local machine. We are currently working on a sample of the first 1000 records for computational convenience.


In [14]:
# Scrape 200 result pages of tramadol reviews from WebMD and store one row per
# review in reviews6.csv.  Rows are assembled by hand with "," as the separator,
# so commas inside the reviewer details and comments (and "\r\n" inside
# comments) are replaced with "|" to keep every row on one line with five fields.
pages = [str(i) for i in range(200)]
filename = "reviews6.csv"
headers = "Reviewer Details, Effectiveness Rating, Ease_of_Use Rating, Satisfaction Rating, Comment\n"

# One handle for the whole run; the context manager closes it even when a
# request or a parsing step raises.  UTF-8 is set explicitly because review
# text can contain characters outside the platform's default encoding.
with open(filename, "w", encoding="utf-8") as f:
    f.write(headers)
    for page_index in pages:
        url1 = f'https://www.webmd.com/drugs/drugreview-4398-tramadol+oral.aspx?drugid=4398&drugname=tramadol+oral&pageIndex={page_index}&sortby=3&conditionFilter=-1'
        page = requests.get(url1)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Every "userPost" div is one review; a page without posts simply
        # contributes no rows.
        con1 = soup.find_all("div", {"class": "userPost"})
        for con2 in con1:
            r_details = con2.p.text
            ratings = con2.find_all("p", {"class": "inlineRating starRating"})
            r_effec = ratings[0].text
            r_ease = ratings[1].text
            r_satis = ratings[2].text
            comment = con2.find_all("p", {"class": "comment"})
            # The second "comment" paragraph is the one stored as the comment text.
            r_comment = comment[1].text

            # Only the last character of each rating text is written
            # (presumably the numeric score; verify against the page markup).
            f.write(r_details.replace(",", "|") + "," + r_effec[-1] + "," + r_ease[-1] + "," + r_satis[-1] + "," +
                    replaceMultiple(r_comment, ['\r\n', ','], "|") + "\n")

89

1289

872

139

449

242

803

208

1605

111

374

153

417

261

427

513

290

385

1080

152

642

226

277

207

428

157

243

275

392

1194

249

377

172

153

782

335

249

160

390

226

120

115

337

1413

346

1409

98

143

244

65

981

360

102

303

467

342

238

1223

106

247

294

247

563

633

54

699

287

130

317

115

330

982

100

170

353

229

97

736

234

223

259

788

246

267

331

261

247

246

346

324

330

113

517

423

195

750

206

475

256

196

640

338

282

119

81

298

367

812

323

494

769

111

101

410

616

190

362

185

145

442

122

301

515

522

457

318

305

224

558

280

380

1073

456

205

608

106

200

131

120

1328

421

284

429

241

294

565

540

231

407

708

694

174

629

225

1312

266

177

332

115

691

186

625

355

527

187

431

405

813

122

250

344

433

208

1035

212

236

190

238

154

189

135

184

112

785

198

102

892

305

270

432

107

408

964

1214

792

383

454

306

244

294

631

696

407

528

96

249

568

313

398

687

248

734

359

174

435

240

475

248

479

336

290

259

476

175

740

121

843

370

880

340

310

172

102

394

437

355

191

355

206

149

567

213

203

1419

308

188

118

90

110

476

319

108

355

98

249

137

166

207

107

98

329

391

229

128

307

399

277

350

124

181

186

280

251

382

340

247

282

235

119

205

232

589

165

207

107

210

197

187

99

156

268

175

194

128

120

621

199

718

504

426

466

386

253

458

201

291

441

490

411

515

434

102

265

186

261

118

283

123

279

322

114

71

668

108

1092

416

247

539

420

366

145

107

662

98

215

145

157

157

1368

67

342

422

457

214

325

169

654

305

212

672

433

235

256

161

247

112

303

416

1031

1223

139

288

143

104

169

2113

374

405

359

223

535

134

339

259

330

187

249

247

513

1059

144

108

357

470

324

285

103

280

111

313

411

102

264

359

236

184

305

311

107

227

101

840

185

227

342

130

464

202

194

115

43

164

43

226

527

150

201

102

203

316

235

429

679

112

242

177

179

435

91

355

127

317

338

126

429

314

137

421

165

237

121

136

396

424

106

179

278

278

285

559

654

122

226

190

367

352

189

116

263

321

142

599

157

722

164

110

313

380

160

509

121

222

465

138

220

167

169

340

120

644

391

116

619

380

94

153

227

225

128

91

105

193

255

170

431

96

297

219

234

429

306

301

123

98

98

758

114

224

414

245

324

173

868

98

111

890

385

212

309

491

524

304

353

67

473

134

718

312

184

173

523

150

121

157

295

161

92

102

228

188

133

415

226

284

98

216

189

120

108

211

189

1021

142

211

308

336

113

109

699

147

100

575

238

105

193

210

243

135

110

203

208

219

372

181

401

140

455

235

145

478

54

268

114

108

148

1734

67

251

102

601

406

118

411

291

275

183

128

105

647

206

182

98

332

322

388

261

846

105

340

373

115

593

275

335

506

123

111

288

127

300

297

163

252

96

239

249

222

118

656

115

284

153

135

135

120

432

111

121

176

157

236

415

226

204

608

317

918

327

182

419

182

111

650

177

241

117

291

106

419

278

100

323

246

154

505

505

120

213

552

513

160

423

107

166

102

241

194

218

144

199

211

359

180

102

294

403

547

321

299

118

193

292

102

135

682

159

351

272

157

444

357

202

479

410

429

348

255

153

143

382

128

198

380

445

445

132

112

398

258

166

100

220

186

259

67

1053

368

356

364

239

489

184

404

250

107

246

54

105

678

461

115

305

107

343

525

500

352

113

495

392

889

620

60

561

251

463

120

500

538

193

534

108

242

324

910

106

633

111

107

456

229

275

43

104

116

335

552

140

192

111

258

287

352

227

443

335

135

209

140

174

417

114

398

368

772

216

584

342

307

330

235

479

303

291

105

43

112

126

192

185

378

254

268

782

248

212

647

498

656

144

387

160

275

179

142

119

112

96

100

307

930

297

682

212

151

595

235

163

362

183

105

477

188

101

459

102

205

350

390

478

264

203

184

1539

492

309

117

105

143

807

176

423

423

423

466

267

608

1623

452

182

108

498

175

178

168

875

140

420

196

108

719

424

967

218

186

314

139

191

119

152

118

376

110

267

209

155

752

203

476

102

158

572

449

126

398

575

322

394

537

217

114

128

359

352

185

1387

1387

366

1167

157

43

154

359

242

432

199

158

188

254

277

276

1197

166

81

99

202

329

180

176

591

119

501

381

265

107

217

145

357

292

193

292

1501

413

199

776

475

475

98

356

187

248

153

157

166

671

764

105

423

923

162

300

287

527

112

469

102

102

368

173

492

358

246

159

456

368

141

99

111

331

341

220

356

274

200

188

98

102

154

579

117

291