## Pull in packages

In [1]:
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

## Pull in raw datasets

In [30]:
# read in Reddit datasets (questions and answers)
ques_reddit = pd.read_csv('../../data-science-for-good-careervillage/data/reddit_careeradvice_questions.csv')
ans_reddit = pd.read_csv('../../data-science-for-good-careervillage/data/reddit_careeradvice_answers.csv')

In [18]:
# read in CareerVillage datasets (questions and answers)
a = pd.read_csv('../../data-science-for-good-careervillage/data/answers.csv')
a_scores = pd.read_csv('../../data-science-for-good-careervillage/data/answer_scores.csv')
q = pd.read_csv('../../data-science-for-good-careervillage/data/questions.csv')
q_scores = pd.read_csv('../../data-science-for-good-careervillage/data/question_scores.csv')

## Combine question datasets and answer datasets, preserving as many common columns as possible

In [31]:
ques_reddit.head()

Unnamed: 0,title,score,id,body,timestamp
0,Is Business and technology degree worth it?,1,cios52,Due to personal matters I can't attend to univ...,2019-07-28 02:49:22
1,"Only female in new team, Advice?",0,ciopmn,,2019-07-28 02:42:44
2,"Don't know what to do with my life, should I r...",1,cion8y,"\- Ireland, not America, no undecided major, n...",2019-07-28 02:36:21
3,I’m bored out of my mind at my job. Do I tell ...,1,cio6x0,This job is pretty okay. I went to college for...,2019-07-28 01:55:02
4,Should I drop out of college and try to become...,1,cinsg7,Title. How much training/classes would I need?...,2019-07-28 01:18:19


In [32]:
q.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26 UTC+0000,Teacher career question,What is a maths teacher? what is a ma...
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25 UTC+0000,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38 UTC+0000,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32 UTC+0000,To become a specialist in business management...,i hear business management is a hard way to ge...
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54 UTC+0000,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...


In [33]:
# merge CareerVillage question df with scores df
ques_cv = pd.merge(q, q_scores, left_on = 'questions_id', right_on = 'id', how='left')
ques_cv.head()

Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body,id,score
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26 UTC+0000,Teacher career question,What is a maths teacher? what is a ma...,332a511f1569444485cf7a7a556a5e54,1.0
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25 UTC+0000,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...,eb80205482e4424cad8f16bc25aa2d9c,5.0
2,4ec31632938a40b98909416bdd0decff,f2c179a563024ccc927399ce529094b5,2017-02-08 19:13:38 UTC+0000,Will going abroad for your first job increase ...,I'm planning on going abroad for my first job....,4ec31632938a40b98909416bdd0decff,2.0
3,2f6a9a99d9b24e5baa50d40d0ba50a75,2c30ffba444e40eabb4583b55233a5a4,2017-09-01 14:05:32 UTC+0000,To become a specialist in business management...,i hear business management is a hard way to ge...,2f6a9a99d9b24e5baa50d40d0ba50a75,2.0
4,5af8880460c141dbb02971a1a8369529,aa9eb1a2ab184ebbb00dc01ab663428a,2017-09-01 02:36:54 UTC+0000,Are there any scholarships out there for stude...,I'm trying to find scholarships for first year...,5af8880460c141dbb02971a1a8369529,2.0


In [34]:
# combine the body, id and score columns from CareerVillage and Reddit
combined_ques_body = ques_cv['questions_body'].append(ques_reddit['body'])
combined_ques_id = ques_cv['questions_id'].append(ques_reddit['id'])
combined_ques_score = ques_cv['score'].append(ques_reddit['score'])

# create the final combined dataframes
combined_ques = pd.DataFrame({'questions_body': combined_ques_body, 'questions_id': combined_ques_id, 
                              'questions_score':combined_ques_score})

In [35]:
# check question dataset dimensions
print('CareerVillage: ', q.shape)
print('Reddit: ', ques_reddit.shape)
print('Total: ', combined_ques.shape)

CareerVillage:  (23931, 5)
Reddit:  (997, 5)
Total:  (24928, 3)


In [37]:
# get sample of questions dataset
combined_ques.head()

Unnamed: 0,questions_body,questions_id,questions_score
0,What is a maths teacher? what is a ma...,332a511f1569444485cf7a7a556a5e54,1.0
1,I am Priyanka from Bangalore . Now am in 10th ...,eb80205482e4424cad8f16bc25aa2d9c,5.0
2,I'm planning on going abroad for my first job....,4ec31632938a40b98909416bdd0decff,2.0
3,i hear business management is a hard way to ge...,2f6a9a99d9b24e5baa50d40d0ba50a75,2.0
4,I'm trying to find scholarships for first year...,5af8880460c141dbb02971a1a8369529,2.0


In [38]:
# merge CareerVillage answers with answer score
ans_cv = pd.merge(a, a_scores, left_on = 'answers_id', right_on = 'id', how='left')
ans_cv.head()

Unnamed: 0,answers_id,answers_author_id,answers_question_id,answers_date_added,answers_body,id,score
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,2016-04-29 19:40:14 UTC+0000,<p>Hi!</p>\n<p>You are asking a very interesti...,4e5f01128cae4f6d8fd697cec5dca60c,0.0
1,ada720538c014e9b8a6dceed09385ee3,2aa47af241bf42a4b874c453f0381bd4,eb80205482e4424cad8f16bc25aa2d9c,2018-05-01 14:19:08 UTC+0000,<p>Hi. I joined the Army after I attended coll...,ada720538c014e9b8a6dceed09385ee3,0.0
2,eaa66ef919bc408ab5296237440e323f,cbd8f30613a849bf918aed5c010340be,eb80205482e4424cad8f16bc25aa2d9c,2018-05-02 02:41:02 UTC+0000,"<p>Dear Priyanka,</p><p>Greetings! I have answ...",eaa66ef919bc408ab5296237440e323f,0.0
3,1a6b3749d391486c9e371fbd1e605014,7e72a630c303442ba92ff00e8ea451df,4ec31632938a40b98909416bdd0decff,2017-05-10 19:00:47 UTC+0000,<p>I work for a global company who values high...,1a6b3749d391486c9e371fbd1e605014,0.0
4,5229c514000446d582050f89ebd4e184,17802d94699140b0a0d2995f30c034c6,2f6a9a99d9b24e5baa50d40d0ba50a75,2017-10-13 22:07:33 UTC+0000,I agree with Denise. Every single job I've had...,5229c514000446d582050f89ebd4e184,0.0


In [39]:
ans_reddit.head()

Unnamed: 0,post_id,score,id,created,body
0,cinsg7,5,ev814zf,1564296000.0,"If you're so interested in pursuing this path,..."
1,cimtmn,2,ev7l8r5,1564290000.0,If you want to work in film it’s super easy to...
2,cilx2y,2,ev7aiyw,1564286000.0,What type of dentistry? As in practicing denti...
3,cikxrr,2,ev6yw2r,1564282000.0,Tell company one you have an offer on the tabl...
4,cikmyy,3,ev6qyle,1564279000.0,Just apply. Worst they can say is no


In [47]:
# now combine answer datasets
# combine the body, id and score columns from CareerVillage and Reddit
combined_ans_id = ans_cv['answers_id'].append(ans_reddit['id'])
combined_ans_body = ans_cv['answers_body'].append(ans_reddit['body'])
combined_ans_ques_id = ans_cv['answers_question_id'].append(ans_reddit['post_id'])
combined_ans_score = ans_cv['score'].append(ans_reddit['score'])


# create the final combined dataframes
combined_ans = pd.DataFrame({'answers_id': combined_ans_id, 'answers_body': combined_ans_body, 
                             'ans_question_id': combined_ans_ques_id, 'answers_score':combined_ans_score})

In [48]:
# check answer dataset dimensions
print('CareerVillage: ', a.shape)
print('Reddit: ', ans_reddit.shape)
print('Total: ', combined_ans.shape)

CareerVillage:  (51123, 5)
Reddit:  (1255, 5)
Total:  (52378, 4)


In [58]:
# get sample of answers dataset
combined_ans.head()

Unnamed: 0,answers_id,answers_body,ans_question_id,answers_score
0,4e5f01128cae4f6d8fd697cec5dca60c,<p>Hi!</p>\n<p>You are asking a very interesti...,332a511f1569444485cf7a7a556a5e54,0.0
1,ada720538c014e9b8a6dceed09385ee3,<p>Hi. I joined the Army after I attended coll...,eb80205482e4424cad8f16bc25aa2d9c,0.0
2,eaa66ef919bc408ab5296237440e323f,"<p>Dear Priyanka,</p><p>Greetings! I have answ...",eb80205482e4424cad8f16bc25aa2d9c,0.0
3,1a6b3749d391486c9e371fbd1e605014,<p>I work for a global company who values high...,4ec31632938a40b98909416bdd0decff,0.0
4,5229c514000446d582050f89ebd4e184,I agree with Denise. Every single job I've had...,2f6a9a99d9b24e5baa50d40d0ba50a75,0.0


## Clean the combined question and answer datasets

In [50]:
# extract just the question bodies and remove nulls
questions = combined_ques['questions_body'].dropna()
# set up an empty list to populate the parsed data
questions_text = []
ques_clean = []
# for each line in answers, remove HTML encoding
for q in questions:
    soup = BeautifulSoup(q, 'html.parser')
    # find all the paragraph portions of the answer
    # extract the text
    if soup.find_all(['p','n']):
        for section in soup.find_all('p'):
        # combine each section's text into "clean" list
            ques_clean.append(section.get_text())
    else:
        ques_clean.append(q.replace('\r',' ').replace('\n',' '))
    # concatenate all the sections and add to answers_preprocessed
    questions_text.append(' '.join(map(str,ques_clean)))
    # reset ans_clean for next row
    ques_clean = []

In [51]:
# add the cleaned questions column to the original dataframe
combined_ques_clean = combined_ques[combined_ques.questions_body.notnull()]
combined_ques_clean['questions_body_clean'] = questions_text
combined_ques_clean.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,questions_body,questions_id,questions_score,questions_body_clean
0,What is a maths teacher? what is a ma...,332a511f1569444485cf7a7a556a5e54,1.0,What is a maths teacher? what is a ma...
1,I am Priyanka from Bangalore . Now am in 10th ...,eb80205482e4424cad8f16bc25aa2d9c,5.0,I am Priyanka from Bangalore . Now am in 10th ...
2,I'm planning on going abroad for my first job....,4ec31632938a40b98909416bdd0decff,2.0,I'm planning on going abroad for my first job....
3,i hear business management is a hard way to ge...,2f6a9a99d9b24e5baa50d40d0ba50a75,2.0,i hear business management is a hard way to ge...
4,I'm trying to find scholarships for first year...,5af8880460c141dbb02971a1a8369529,2.0,I'm trying to find scholarships for first year...


In [63]:
# extract just the answer bodies and remove nulls
answers = combined_ans['answers_body'].dropna()
# set up an empty list to populate the parsed data
answers_text = []
ans_clean = []
# for each line in answers, remove HTML encoding
for ans in answers:
    soup = BeautifulSoup(ans, 'html.parser')
    # find all the paragraph portions of the answer
    # extract the text
    if soup.find_all('p'):
        for section in soup.find_all('p'):
        # combine each section's text into "clean" list
            ans_clean.append(section.get_text())
    else:
        ans_clean.append(ans)
    # concatenate all the sections and add to answers_preprocessed
    answers_text.append(' '.join(map(str,ans_clean)))
    # reset ans_clean for next row
    ans_clean = []

In [66]:
# add the cleaned questions column to the original dataframe
combined_ans_clean = combined_ans[combined_ans.answers_body.notnull()]
combined_ans_clean['answers_body_clean'] = answers_text
combined_ans_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,answers_id,answers_body,ans_question_id,answers_score,answers_body_clean
0,4e5f01128cae4f6d8fd697cec5dca60c,<p>Hi!</p>\n<p>You are asking a very interesti...,332a511f1569444485cf7a7a556a5e54,0.0,Hi! You are asking a very interesting question...
1,ada720538c014e9b8a6dceed09385ee3,<p>Hi. I joined the Army after I attended coll...,eb80205482e4424cad8f16bc25aa2d9c,0.0,Hi. I joined the Army after I attended college...
2,eaa66ef919bc408ab5296237440e323f,"<p>Dear Priyanka,</p><p>Greetings! I have answ...",eb80205482e4424cad8f16bc25aa2d9c,0.0,"Dear Priyanka, Greetings! I have answered this..."
3,1a6b3749d391486c9e371fbd1e605014,<p>I work for a global company who values high...,4ec31632938a40b98909416bdd0decff,0.0,I work for a global company who values highly ...
4,5229c514000446d582050f89ebd4e184,I agree with Denise. Every single job I've had...,2f6a9a99d9b24e5baa50d40d0ba50a75,0.0,I agree with Denise. Every single job I've had...


In [67]:
# Finally, merge the question and answer datasets and subset the final columns
reddit_cv_df = pd.merge(combined_ans_clean, combined_ques_clean, left_on = 'ans_question_id', 
                        right_on='questions_id', how='left')

In [69]:
# subset only the columns we need for the final dataset being written to CSV
reddit_cv_csv = reddit_cv_df[['answers_id','answers_body_clean','answers_score','questions_id','questions_body_clean','questions_score']]

In [70]:
reddit_cv_csv.head()

Unnamed: 0,answers_id,answers_body_clean,answers_score,questions_id,questions_body_clean,questions_score
0,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,0.0,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...,1.0
1,ada720538c014e9b8a6dceed09385ee3,Hi. I joined the Army after I attended college...,0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
2,eaa66ef919bc408ab5296237440e323f,"Dear Priyanka, Greetings! I have answered this...",0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
3,1a6b3749d391486c9e371fbd1e605014,I work for a global company who values highly ...,0.0,4ec31632938a40b98909416bdd0decff,I'm planning on going abroad for my first job....,2.0
4,5229c514000446d582050f89ebd4e184,I agree with Denise. Every single job I've had...,0.0,2f6a9a99d9b24e5baa50d40d0ba50a75,i hear business management is a hard way to ge...,2.0


In [71]:
# write final df to csv
reddit_cv_csv.to_csv('../../data-science-for-good-careervillage/data/reddit_careervillage_combined.csv', index=False) 