In [53]:
#import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pymongo
import datetime
from flask import Flask
from sqlalchemy import create_engine
import json

In [54]:
#import Redfin state data
raw_state_df = pd.read_csv("Resources/state_market_tracker.tsv000", sep='\t')
raw_state_df

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2019-10-01,2019-10-31,30,state,4,23,f,Oklahoma,,Oklahoma,...,0.098560,0.209877,0.083561,0.053355,0.301282,-0.240768,-0.122115,South Region,,2022-01-09 14:29:56
1,2021-07-01,2021-07-31,30,state,4,40,f,Vermont,,Vermont,...,0.251473,0.133696,0.016402,-0.000131,0.424404,-0.064422,0.076156,Northeast Region,,2022-01-09 14:29:56
2,2016-08-01,2016-08-31,30,state,4,10,f,New Hampshire,,New Hampshire,...,0.089756,0.149826,0.009592,0.002687,0.206101,0.061385,0.016779,Northeast Region,,2022-01-09 14:29:56
3,2013-04-01,2013-04-30,30,state,4,51,f,Mississippi,,Mississippi,...,-0.008566,,,,0.044777,-0.014869,-0.005006,South Region,,2022-01-09 14:29:56
4,2019-12-01,2019-12-31,30,state,4,2,f,Missouri,,Missouri,...,-0.017031,0.205339,-0.123381,-0.000410,0.252082,-0.116423,0.095007,Midwest Region,,2022-01-09 14:29:56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27073,2012-03-01,2012-03-31,30,state,4,6,f,Nebraska,,Nebraska,...,0.021426,,,,0.422780,0.042801,0.030384,Midwest Region,,2022-01-09 14:29:56
27074,2016-07-01,2016-07-31,30,state,4,27,f,Idaho,,Idaho,...,0.182390,,,,0.534900,-0.117877,-0.004820,West Region,,2022-01-09 14:29:56
27075,2020-11-01,2020-11-30,30,state,4,42,f,Virginia,,Virginia,...,0.218939,0.126582,-0.048418,-0.006751,0.380567,-0.152381,0.194669,South Region,,2022-01-09 14:29:56
27076,2021-08-01,2021-08-31,30,state,4,40,f,Vermont,,Vermont,...,0.234272,0.158853,0.018284,0.022594,0.381659,-0.018233,0.098021,Northeast Region,,2022-01-09 14:29:56


In [82]:
#checking for outliers
raw_state_df['property_type'].value_counts()

All Residential              5632
Single Family Residential    5632
Condo/Co-op                  5496
Multi-Family (2-4 Unit)      5406
Townhouse                    4912
Name: property_type, dtype: int64

In [86]:
#checking DC for outliers
dc_df = raw_state_df[raw_state_df['state_code'] == 'DC']
dc_df['property_type'].value_counts()

Single Family Residential    120
Multi-Family (2-4 Unit)      120
Townhouse                    120
All Residential              120
Condo/Co-op                  120
Name: property_type, dtype: int64

In [55]:
#drop unnecessary columns
state_df = raw_state_df[["period_end", "state_code", "median_sale_price", "median_ppsf"]]
state_df

Unnamed: 0,period_end,state_code,median_sale_price,median_ppsf
0,2019-10-31,OK,162200,77.0
1,2021-07-31,VT,317900,177.0
2,2016-08-31,NH,200100,155.0
3,2013-04-30,MS,129500,69.0
4,2019-12-31,MO,152000,135.0
...,...,...,...,...
27073,2012-03-31,NE,133000,74.0
27074,2016-07-31,ID,178400,135.0
27075,2020-11-30,VA,264200,122.0
27076,2021-08-31,VT,326800,180.0


In [56]:
#view column types
state_df.dtypes

period_end            object
state_code            object
median_sale_price      int64
median_ppsf          float64
dtype: object

In [57]:
#cast columns to strings and date/time
state_df['period_end'] = pd.to_datetime(state_df['period_end'], format='%Y-%m-%d')
state_df['state_code'] = state_df['state_code'].astype('string')
state_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df['period_end'] = pd.to_datetime(state_df['period_end'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df['state_code'] = state_df['state_code'].astype('string')


period_end           datetime64[ns]
state_code                   string
median_sale_price             int64
median_ppsf                 float64
dtype: object

In [58]:
#drop years 2018 and older
state_df = state_df[state_df.period_end.dt.year > 2018]
state_df

Unnamed: 0,period_end,state_code,median_sale_price,median_ppsf
0,2019-10-31,OK,162200,77.0
1,2021-07-31,VT,317900,177.0
4,2019-12-31,MO,152000,135.0
5,2019-07-31,NM,385500,218.0
9,2020-03-31,OH,156100,111.0
...,...,...,...,...
27068,2020-04-30,NE,206700,103.0
27070,2019-12-31,OK,169500,109.0
27071,2019-01-31,AZ,262600,151.0
27075,2020-11-30,VA,264200,122.0


In [39]:
#grab mean sale price by state
state_df_mean = state_df.groupby(['state_code']).mean()
state_df_mean.sort_values(by='median_sale_price').head()

Unnamed: 0_level_0,median_sale_price,median_ppsf
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1
OK,162056.111111,96.811111
OH,169773.888889,103.511111
MS,173332.022472,104.073034
IN,179582.876712,101.0
MO,181113.333333,114.316667


In [59]:
#grab median sale price by state
state_df_median = state_df.groupby(['state_code']).median()
state_df_median_sale = state_df_median.sort_values(by='median_sale_price')
state_df_median_sale.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1
OK,171750.0,97.5
OH,173250.0,110.0
MO,179300.0,119.0
MS,179500.0,105.0
KY,183650.0,114.0


In [64]:
#grab median price per square foot by state
state_df_median_ppsf = state_df_median.sort_values(by='median_ppsf')
state_df_median_ppsf.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1
OK,171750.0,97.5
IN,190350.0,102.0
MS,179500.0,105.0
AR,184000.0,106.0
WV,206900.0,109.0


In [65]:
#grab mean sale price by state
state_df_mean = state_df.groupby(['state_code']).mean()
state_df_mean_sale = state_df_mean.sort_values(by='median_sale_price')
state_df_mean_sale.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1
OK,162056.111111,96.811111
OH,169773.888889,103.511111
MS,173332.022472,104.073034
IN,179582.876712,101.0
MO,181113.333333,114.316667


In [66]:
#grab mean price per square foot by state
state_df_mean_ppsf = state_df_mean.sort_values(by='median_ppsf')
state_df_mean_ppsf.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1
OK,162056.111111,96.811111
IN,179582.876712,101.0
OH,169773.888889,103.511111
MS,173332.022472,104.073034
AR,184565.555556,107.466667


In [67]:
#import data scientist salary information
raw_salary_df = pd.read_csv("Resources/raw_salary_data_scientist.csv")
raw_salary_df

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,tensor,hadoop,tableau,bi,flink,mongo,google_an,job_title_sim,seniority_by_title,Degree
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 - 1000,1973,...,0,0,1,1,0,0,0,data scientist,na,M
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+,1984,...,0,0,0,0,0,0,0,data scientist,na,M
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 - 1000,2010,...,0,0,0,0,0,0,0,data scientist,na,M
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 - 5000,1965,...,0,0,0,0,0,0,0,data scientist,na,na
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 - 200,1998,...,0,0,0,0,0,0,0,data scientist,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,950,"Sr Scientist, Immuno-Oncology - Oncology",$58K-$111K (Glassdoor est.),Site Name: USA - Massachusetts - Cambridge\nPo...,3.9,GSK\n3.9,"Cambridge, MA","Brentford, United Kingdom",10000+,1830,...,0,0,0,0,0,0,0,other scientist,sr,M
738,951,Senior Data Engineer,$72K-$133K (Glassdoor est.),THE CHALLENGE\nEventbrite has a world-class da...,4.4,Eventbrite\n4.4,"Nashville, TN","San Francisco, CA",1001 - 5000,2006,...,0,1,0,0,0,0,0,data engineer,sr,na
739,952,"Project Scientist - Auton Lab, Robotics Institute",$56K-$91K (Glassdoor est.),The Auton Lab at Carnegie Mellon University is...,2.6,Software Engineering Institute\n2.6,"Pittsburgh, PA","Pittsburgh, PA",501 - 1000,1984,...,0,0,0,0,0,0,0,other scientist,na,P
740,953,Data Science Manager,$95K-$160K (Glassdoor est.),Data Science ManagerResponsibilities:\n\nOvers...,3.2,"Numeric, LLC\n3.2","Allentown, PA","Chadds Ford, PA",1 - 50,-1,...,0,0,0,0,0,0,0,data scientist,na,na


In [99]:
#determine total number of states included in the original data
raw_salary_df['Job Location'].nunique()

37

In [68]:
#drop unneccesary columns
salary_df = raw_salary_df[['Rating', 'Lower Salary', 'Upper Salary', 'Avg Salary(K)', 'Job Location', 'job_title_sim', 'seniority_by_title']]
salary_df

Unnamed: 0,Rating,Lower Salary,Upper Salary,Avg Salary(K),Job Location,job_title_sim,seniority_by_title
0,3.8,53,91,72.0,NM,data scientist,na
1,3.4,63,112,87.5,MD,data scientist,na
2,4.8,80,90,85.0,FL,data scientist,na
3,3.8,56,97,76.5,WA,data scientist,na
4,2.9,86,143,114.5,NY,data scientist,na
...,...,...,...,...,...,...,...
737,3.9,58,111,84.5,MA,other scientist,sr
738,4.4,72,133,102.5,TN,data engineer,sr
739,2.6,56,91,73.5,PA,other scientist,na
740,3.2,95,160,127.5,PA,data scientist,na


In [69]:
#get column types
salary_df.dtypes

Rating                float64
Lower Salary            int64
Upper Salary            int64
Avg Salary(K)         float64
Job Location           object
job_title_sim          object
seniority_by_title     object
dtype: object

In [70]:
#convery types to strings
salary_df['Job Location'] = salary_df['Job Location'].astype('string')
salary_df['job_title_sim'] = salary_df['job_title_sim'].astype('string')
salary_df['seniority_by_title'] = salary_df['seniority_by_title'].astype('string')
salary_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary_df['Job Location'] = salary_df['Job Location'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary_df['job_title_sim'] = salary_df['job_title_sim'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary_df['seniority_by_title'] = salary_df['seniority_by_tit

Rating                float64
Lower Salary            int64
Upper Salary            int64
Avg Salary(K)         float64
Job Location           string
job_title_sim          string
seniority_by_title     string
dtype: object

In [71]:
#determine values in seniority by title
salary_df['seniority_by_title'].value_counts()

na    519
sr    220
jr      3
Name: seniority_by_title, dtype: Int64

In [72]:
#drop senior and junior positions
salary_df = salary_df[salary_df.seniority_by_title != "sr"]
salary_df = salary_df[salary_df.seniority_by_title != "jr"]
salary_df['seniority_by_title'].value_counts()

na    519
Name: seniority_by_title, dtype: Int64

In [92]:
#re-name columns job location to state_code to match other data frame
salary_df = salary_df.rename(columns = {'Job Location': 'state_code'})
salary_df

Unnamed: 0,Rating,Lower Salary,Upper Salary,Avg Salary(K),state_code,job_title_sim,seniority_by_title
0,3.8,53,91,72.0,NM,data scientist,na
1,3.4,63,112,87.5,MD,data scientist,na
2,4.8,80,90,85.0,FL,data scientist,na
3,3.8,56,97,76.5,WA,data scientist,na
4,2.9,86,143,114.5,NY,data scientist,na
...,...,...,...,...,...,...,...
734,3.6,37,100,68.5,MA,data scientist,na
735,3.9,62,113,87.5,CA,data engineer,na
739,2.6,56,91,73.5,PA,other scientist,na
740,3.2,95,160,127.5,PA,data scientist,na


In [100]:
#number of states after data frame has been manipulated
salary_df['state_code'].nunique()

36

In [102]:
#grab median salary by state
salary_df_median = salary_df.groupby(['state_code']).median()
salary_df_median_sale = salary_df_median.sort_values(by='Avg Salary(K)', ascending=False)
salary_df_median_sale.head()

Unnamed: 0_level_0,Rating,Lower Salary,Upper Salary,Avg Salary(K)
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,3.6,80.0,140.5,110.25
NJ,3.6,85.0,134.0,109.5
KY,3.1,68.0,139.0,103.5
NC,3.7,65.0,126.0,99.5
OR,3.5,74.5,121.5,98.0


In [76]:
#grab an salary by state
salary_df_mean = salary_df.groupby(['state_code']).mean()
salary_df_mean_sale = salary_df_mean.sort_values(by='Avg Salary(K)', ascending=False)
salary_df_mean_sale.head()

Unnamed: 0_level_0,Rating,Lower Salary,Upper Salary,Avg Salary(K)
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IL,3.566667,84.333333,138.814815,111.574074
CA,3.606731,80.625,139.490385,110.057692
NJ,3.691667,77.666667,131.083333,104.375
DC,3.777778,82.0,123.444444,102.722222
KY,3.5,66.333333,127.666667,97.0


In [77]:
#merge median dfs
merged_median_df = pd.merge(state_df_median, salary_df_median_sale, how = 'inner', on = 'state_code')
merged_median_df.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K)
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,192000.0,111.0,4.25,42.0,76.0,60.75
AZ,292350.0,185.5,3.45,57.0,84.0,69.0
CA,637500.0,417.5,3.6,80.0,140.5,110.25
CO,447000.0,192.0,4.0,64.0,111.0,87.5
CT,278800.0,155.0,3.0,54.0,71.0,62.5


In [103]:
#determining differential between median salary and price
merged_median_df['cost_vs_salary'] = merged_median_df['median_sale_price'] - (merged_median_df['Avg Salary(K)'] * 1000)
merged_median_df.sort_values(by='cost_vs_salary')

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K),cost_vs_salary
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KY,183650.0,114.0,3.1,68.0,139.0,103.5,80150.0
OH,173250.0,110.0,3.5,56.0,117.0,86.5,86750.0
MI,189850.0,127.0,3.95,65.5,118.0,91.75,98100.0
MO,179300.0,119.0,4.1,44.0,101.0,70.5,108800.0
IN,190350.0,102.0,3.95,56.0,91.0,73.5,116850.0
AL,192000.0,111.0,4.25,42.0,76.0,60.75,131250.0
PA,221250.0,142.5,3.2,61.0,109.0,85.0,136250.0
IL,227500.0,147.5,3.7,64.0,105.0,87.5,140000.0
KS,234150.0,125.0,3.7,61.0,113.0,87.0,147150.0
WI,205850.0,129.0,3.3,40.0,73.0,56.5,149350.0


In [105]:
#determining highest rated state for data science by median rating
merged_median_df.sort_values(by='Rating', ascending=False)

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K),cost_vs_salary
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GA,255800.0,141.5,4.7,60.0,99.0,79.5,176300.0
IA,200600.0,142.0,4.6,31.0,55.0,43.0,157600.0
MN,268750.0,151.5,4.45,60.0,111.0,85.5,183250.0
SC,250700.0,144.5,4.4,39.0,82.0,60.5,190200.0
AL,192000.0,111.0,4.25,42.0,76.0,60.75,131250.0
UT,374700.0,182.0,4.2,53.0,91.0,72.0,302700.0
MO,179300.0,119.0,4.1,44.0,101.0,70.5,108800.0
ID,345250.0,200.0,4.1,39.5,73.0,56.25,289000.0
VA,355350.0,196.0,4.0,65.0,113.0,88.0,267350.0
CO,447000.0,192.0,4.0,64.0,111.0,87.5,359500.0


In [106]:
#merge mean dfs
merged_mean_df = pd.merge(state_df_mean, salary_df_mean_sale, how = 'inner', on = 'state_code')
merged_mean_df.head()

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K)
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,189148.888889,109.494382,4.2125,40.875,75.0,57.9375
AZ,294833.333333,197.627778,3.516667,62.166667,109.833333,86.0
CA,670961.111111,420.272222,3.606731,80.625,139.490385,110.057692
CO,458560.0,207.455556,3.833333,58.333333,99.333333,78.833333
CT,359550.299401,184.581818,3.4,57.6,84.6,71.1


In [107]:
#determining differential between mean salary and price
merged_mean_df['cost_vs_salary'] = merged_mean_df['median_sale_price'] - (merged_mean_df['Avg Salary(K)'] * 1000)
merged_mean_df.sort_values(by='cost_vs_salary')

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K),cost_vs_salary
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KY,186930.555556,111.733333,3.5,66.333333,127.666667,97.0,89930.555556
OH,169773.888889,103.511111,3.622222,53.222222,100.333333,76.777778,92996.111111
MO,181113.333333,114.316667,4.111111,59.666667,109.777778,84.722222,96391.111111
MI,191853.797468,116.960784,3.95,65.5,118.0,91.75,100103.797468
IN,179582.876712,101.0,3.916667,52.333333,90.666667,71.5,108082.876712
IL,232036.666667,147.055556,3.566667,84.333333,138.814815,111.574074,120462.592593
AL,189148.888889,109.494382,4.2125,40.875,75.0,57.9375,131211.388889
PA,220665.555556,146.25,3.184211,64.947368,113.526316,89.236842,131428.71345
WI,206947.777778,126.877778,3.255556,52.0,88.555556,70.277778,136670.0
KS,232371.666667,126.261111,3.7,61.0,113.0,87.0,145371.666667


In [108]:
#determining highest rated state for data science by mean rating
merged_mean_df.sort_values(by='Rating', ascending=False)

Unnamed: 0_level_0,median_sale_price,median_ppsf,Rating,Lower Salary,Upper Salary,Avg Salary(K),cost_vs_salary
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MN,266106.111111,158.616667,4.45,60.0,111.0,85.5,180606.111111
GA,264038.333333,150.194444,4.433333,61.333333,101.333333,81.333333,182705.0
SC,260442.777778,149.544444,4.4,39.0,82.0,60.5,199942.777778
IA,202222.44898,138.30137,4.366667,38.666667,70.333333,54.5,147722.44898
AL,189148.888889,109.494382,4.2125,40.875,75.0,57.9375,131211.388889
MO,181113.333333,114.316667,4.111111,59.666667,109.777778,84.722222,96391.111111
ID,356761.666667,213.622222,4.1,39.5,73.0,56.25,300511.666667
UT,392304.444444,203.638889,4.042857,51.857143,96.714286,74.285714,318018.730159
FL,276862.777778,174.522222,3.966667,57.666667,88.333333,73.0,203862.777778
MI,191853.797468,116.960784,3.95,65.5,118.0,91.75,100103.797468


In [109]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [110]:
#define the data science housing data base in Mongo
db = client.ds_housing_db

In [110]:
#declare the median and mean collections
median = db.median
mean = db.mean

In [111]:
#insert median data frame in to median collection
median.insert_many(merged_median_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x7f90556eaf70>

In [112]:
#insert mean data frame in to mean collection
mean.insert_many(merged_mean_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x7f90556c2dc0>