## Pull in packages

In [2]:
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

## Pull in raw datasets

In [20]:
# read in Reddit datasets (questions and answers)
ques_reddit = pd.read_csv('../../data-science-for-good-careervillage/data/reddit_careeradvice_questions.csv')
ans_reddit = pd.read_csv('../../data-science-for-good-careervillage/data/reddit_careeradvice_answers.csv')

In [21]:
# read in CareerVillage datasets (questions and answers)
a = pd.read_csv('../../data-science-for-good-careervillage/data/answers.csv')
a_scores = pd.read_csv('../../data-science-for-good-careervillage/data/answer_scores.csv')
q = pd.read_csv('../../data-science-for-good-careervillage/data/questions.csv')
q_scores = pd.read_csv('../../data-science-for-good-careervillage/data/question_scores.csv')

## Combine question datasets and answer datasets, preserving as many common columns as possible

In [22]:
ques_reddit.head()

Unnamed: 0,title,score,id,body,timestamp
0,"Is adult life after college just sort of, gues...",344,bowpxj,"I graduated college at 22, I'm 25 now, and not...",2019-05-15 14:24:22
1,“Most people overestimate what they can do in ...,287,9bkhce,When I hear people on this sub telling others:...,2018-08-30 18:14:12
2,I want to work at an Oreo factory?,281,9n66cn,Today I was eating Oreos and I realized that O...,2018-10-11 05:55:51
3,Update: got a job offer but failed drug test d...,264,9nj22q,So some suggested I just go and tell the truth...,2018-10-12 13:20:16
4,What do people do to make over 200k a year?,227,b1wixu,So i’m in school now (in Finland) and my grade...,2019-03-16 22:53:06


In [23]:
# combine the body and the id columns from CareerVillage and Reddit
combined_ques_body = q['questions_body'].append(ques_reddit['body'])
combined_ques_id = q['questions_id'].append(ques_reddit['id'])

# create the final combined dataframes
combined_ques = pd.DataFrame({'questions_body': combined_ques_body, 'questions_id': combined_ques_id})
combined_ans = a['answers_body'].append(ans_reddit['body'])

In [24]:
# check question dataset dimensions
print('CareerVillage: ', q.shape)
print('Reddit: ', ques_reddit.shape)
print('Total: ', combined_ques.shape)

CareerVillage:  (23931, 5)
Reddit:  (2410, 5)
Total:  (26341, 2)


In [25]:
# get sample of questions dataset
combined_ques.head()

Unnamed: 0,questions_body,questions_id
0,What is a maths teacher? what is a ma...,332a511f1569444485cf7a7a556a5e54
1,I am Priyanka from Bangalore . Now am in 10th ...,eb80205482e4424cad8f16bc25aa2d9c
2,I'm planning on going abroad for my first job....,4ec31632938a40b98909416bdd0decff
3,i hear business management is a hard way to ge...,2f6a9a99d9b24e5baa50d40d0ba50a75
4,I'm trying to find scholarships for first year...,5af8880460c141dbb02971a1a8369529


In [26]:
# check answer dataset dimensions
print('CareerVillage: ', a.shape)
print('Reddit: ', ans_reddit.shape)
print('Total: ', combined_ans.shape)

CareerVillage:  (51123, 5)
Reddit:  (2410, 5)
Total:  (53533,)


In [27]:
# get sample of answers dataset
combined_ans.head()

0    <p>Hi!</p>\n<p>You are asking a very interesti...
1    <p>Hi. I joined the Army after I attended coll...
2    <p>Dear Priyanka,</p><p>Greetings! I have answ...
3    <p>I work for a global company who values high...
4    I agree with Denise. Every single job I've had...
dtype: object

## Clean the combined question and answer datasets

In [54]:
# extract just the question bodies and remove nulls
questions = combined_ques['questions_body'].dropna()
# set up an empty list to populate the parsed data
questions_text = []
ques_clean = []
# for each line in answers, remove HTML encoding
for q in questions:
    soup = BeautifulSoup(q, 'html.parser')
    # find all the paragraph portions of the answer
    # extract the text
    if soup.find_all(['p','n']):
        for section in soup.find_all('p'):
        # combine each section's text into "clean" list
            ques_clean.append(section.get_text())
    else:
        ques_clean.append(q.replace('\r',' ').replace('\n',' '))
    # concatenate all the sections and add to answers_preprocessed
    questions_text.append(' '.join(map(str,ques_clean)))
    # reset ans_clean for next row
    ques_clean = []

In [65]:
# add the cleaned questions column to the original dataframe
combined_ques_clean = combined_ques[combined_ques.questions_body.notnull()]
combined_ques_clean['questions_body_clean'] = questions_text
combined_ques_clean.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,questions_body,questions_id,questions_body_clean
0,What is a maths teacher? what is a ma...,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...
1,I am Priyanka from Bangalore . Now am in 10th ...,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...
2,I'm planning on going abroad for my first job....,4ec31632938a40b98909416bdd0decff,I'm planning on going abroad for my first job....
3,i hear business management is a hard way to ge...,2f6a9a99d9b24e5baa50d40d0ba50a75,i hear business management is a hard way to ge...
4,I'm trying to find scholarships for first year...,5af8880460c141dbb02971a1a8369529,I'm trying to find scholarships for first year...


In [68]:
# only select the clean and id columns for the final dataset
questions_clean = combined_ques_clean['questions_id'] + combined_ques_clean['questions_body_clean']

In [47]:
# extract just the answer bodies and remove nulls
answers = combined_ans.dropna()
# set up an empty list to populate the parsed data
answers_text = []
ans_clean = []
# for each line in answers, remove HTML encoding
for ans in answers:
    soup = BeautifulSoup(ans, 'html.parser')
    # find all the paragraph portions of the answer
    # extract the text
    if soup.find_all('p'):
        for section in soup.find_all('p'):
        # combine each section's text into "clean" list
            ans_clean.append(section.get_text())
    else:
        ans_clean.append(ans)
    # concatenate all the sections and add to answers_preprocessed
    answers_text.append(' '.join(map(str,ans_clean)))
    # reset ans_clean for next row
    ans_clean = []

In [49]:
answers_clean = pd.DataFrame(answers_text, columns=['answer_body']).dropna()

In [50]:
answers_clean.head()

Unnamed: 0,answer_body
0,Hi! You are asking a very interesting question...
1,Hi. I joined the Army after I attended college...
2,"Dear Priyanka, Greetings! I have answered this..."
3,I work for a global company who values highly ...
4,I agree with Denise. Every single job I've had...


In [69]:
# write both to csv
questions_clean.to_csv('../../data-science-for-good-careervillage/data/questions_final.csv', index=False) 
answers_clean.to_csv('../../data-science-for-good-careervillage/data/answers_final.csv', index=False) 

  
