In this notebook we combine the resume data with the already combined Reddit/CareerVillage dataset.

In [2]:
import pandas as pd

In [7]:
# read the preprocessed resume lines and the combined Reddit/CareerVillage data from disk
resume_csv_path = '../../data-science-for-good-careervillage/data/resume_dataset_preprocessed.csv'
reddit_cv_csv_path = '../../data-science-for-good-careervillage/data/reddit_careervillage_combined.csv'
resume_df = pd.read_csv(resume_csv_path)
reddit_cv_df = pd.read_csv(reddit_cv_csv_path)

In [5]:
# preview the first five rows of the resume data
resume_df.head(n=5)

Unnamed: 0,category,name,resume_line,resume_line_pp
0,['1Amy.docx'],['1Amy.docx'],Fund accountant with nearly years of experienc...,fund account near year expery hedg fund admin ...
1,['1Amy.docx'],['1Amy.docx'],Citco Fund Services Singapore Pte Ltd Jan Pres...,citco fund serv singap pte ltd jan pres fund a...
2,['1Amy.docx'],['1Amy.docx'],Calculation of estimate and final NAVs on a da...,calc estim fin nav dai week month bas
3,['1Amy.docx'],['1Amy.docx'],Preparation of cash and position reconciliatio...,prep cash posit recont report
4,['1Amy.docx'],['1Amy.docx'],Daily preproduction tasks such as price checks...,dai preproduc task pric check interest accr fe...


In [8]:
# preview the first five rows of the combined Reddit/CareerVillage data
reddit_cv_df.head(n=5)

Unnamed: 0,answers_id,answers_body_clean,answers_score,questions_id,questions_body_clean,questions_score
0,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,0.0,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...,1.0
1,ada720538c014e9b8a6dceed09385ee3,Hi. I joined the Army after I attended college...,0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
2,eaa66ef919bc408ab5296237440e323f,"Dear Priyanka, Greetings! I have answered this...",0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
3,1a6b3749d391486c9e371fbd1e605014,I work for a global company who values highly ...,0.0,4ec31632938a40b98909416bdd0decff,I'm planning on going abroad for my first job....,2.0
4,5229c514000446d582050f89ebd4e184,I agree with Denise. Every single job I've had...,0.0,2f6a9a99d9b24e5baa50d40d0ba50a75,i hear business management is a hard way to ge...,2.0


We need to reshape the resume dataset so that it can be appended directly to the Reddit/CareerVillage dataset. To give both frames the same headers, we rename the existing resume columns and add a dummy question, an id, and scores for every resume line.

In [19]:
# Reshape resume_df so its columns line up with the Reddit/CareerVillage frame.
# rename/assign build a new frame (no inplace=True), which is then bound back to
# the same name so the cells below keep working unchanged.
resume_df = (
    resume_df
    # reuse the existing columns: each resume line plays the role of an answer,
    # and the resume name stands in for the question id
    .rename(columns={'resume_line': 'answers_body_clean', 'name': 'questions_id'})
    .assign(
        # resumes carry no answer id of their own, so fall back to the row index
        answers_id=lambda df: df.index,
        # assign a score of 0 to each question and answer in the resume dataset
        answers_score=0,
        questions_score=0,
        # dummy question for each resume line; the space before 'do?' keeps the
        # category from running straight into the verb ("What does a Xdo?")
        # NOTE(review): in the displayed sample, category holds a file-name list
        # such as ['1Amy.docx'] rather than a job title -- confirm upstream.
        questions_body_clean=lambda df: 'What does a ' + df['category'] + ' do?',
    )
)

In [20]:
# preview the first five rows of the resume frame in its current state
resume_df.head(n=5)

Unnamed: 0,category,questions_id,answers_body_clean,resume_line_pp,answers_id,answers_score,questions_score,questions_body_clean
0,['1Amy.docx'],['1Amy.docx'],Fund accountant with nearly years of experienc...,fund account near year expery hedg fund admin ...,0,0,0,What does a ['1Amy.docx']do?
1,['1Amy.docx'],['1Amy.docx'],Citco Fund Services Singapore Pte Ltd Jan Pres...,citco fund serv singap pte ltd jan pres fund a...,1,0,0,What does a ['1Amy.docx']do?
2,['1Amy.docx'],['1Amy.docx'],Calculation of estimate and final NAVs on a da...,calc estim fin nav dai week month bas,2,0,0,What does a ['1Amy.docx']do?
3,['1Amy.docx'],['1Amy.docx'],Preparation of cash and position reconciliatio...,prep cash posit recont report,3,0,0,What does a ['1Amy.docx']do?
4,['1Amy.docx'],['1Amy.docx'],Daily preproduction tasks such as price checks...,dai preproduc task pric check interest accr fe...,4,0,0,What does a ['1Amy.docx']do?


In [22]:
# subset the resume columns to the Reddit/CareerVillage headers (same names, same order)
# and stack the two frames row-wise
qa_columns = ['answers_id', 'answers_body_clean', 'answers_score',
              'questions_id', 'questions_body_clean', 'questions_score']
resume_final = resume_df[qa_columns]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. ignore_index=True renumbers the rows so the combined
# frame does not carry duplicate index labels from its two sources.
dataset_final = pd.concat([reddit_cv_df, resume_final], ignore_index=True)

# sanity check: the concatenation must neither drop nor duplicate rows
assert len(dataset_final) == len(reddit_cv_df) + len(resume_final)

In [23]:
# preview the first five rows of the appended dataset
dataset_final.head(n=5)

Unnamed: 0,answers_id,answers_body_clean,answers_score,questions_id,questions_body_clean,questions_score
0,4e5f01128cae4f6d8fd697cec5dca60c,Hi! You are asking a very interesting question...,0.0,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...,1.0
1,ada720538c014e9b8a6dceed09385ee3,Hi. I joined the Army after I attended college...,0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
2,eaa66ef919bc408ab5296237440e323f,"Dear Priyanka, Greetings! I have answered this...",0.0,eb80205482e4424cad8f16bc25aa2d9c,I am Priyanka from Bangalore . Now am in 10th ...,5.0
3,1a6b3749d391486c9e371fbd1e605014,I work for a global company who values highly ...,0.0,4ec31632938a40b98909416bdd0decff,I'm planning on going abroad for my first job....,2.0
4,5229c514000446d582050f89ebd4e184,I agree with Denise. Every single job I've had...,0.0,2f6a9a99d9b24e5baa50d40d0ba50a75,i hear business management is a hard way to ge...,2.0


In [24]:
# report the (rows, columns) shape of each input frame and of the combined frame
for label, frame in (
    ("resume shape: ", resume_final),
    ("reddit/careervilage shape: ", reddit_cv_df),
    ("combined shape: ", dataset_final),
):
    print(label, frame.shape)

resume shape:  (26754, 6)
reddit/careervilage shape:  (52377, 6)
combined shape:  (79131, 6)


In [26]:
# persist the combined frame; index=False leaves the row index out of the file
output_csv_path = '../../data-science-for-good-careervillage/data/qa_dataset_final.csv'
dataset_final.to_csv(output_csv_path, index=False)