In [1]:
# using data from: https://www.payscale.com/college-salary-report

# methodology
# https://www.payscale.com/college-salary-report/methodology

"Best Schools by Major: Since major choice plays an instrumental role in your potential salary, we rank schools based on the earnings of graduates within a major group for Bachelor’s only graduates. Inclusion in this list required that a school-major combination met the same inclusion criteria described above. For this reason, the schools included for each major are a subset of the bachelor’s degree lists."

and

"Additionally, similarly to the overall school rankings, these schools are ranked based on the median mid-career earnings of graduates within a major group. The pay values do not control for job choice, only major and school selection. This is important to keep in mind as graduates for one school may select very different jobs than graduates from another school, even if they share the same major.

Also important to keep in mind is that salaries are grouped into "early" and "mid", where early is 0-5 years of experience and "mid" is 10+ years (this informationis available by clicking the info icon next to early and mid in the report tables). 

also

"Salary: Combines base annual salary or hourly wage, bonuses, profit sharing, tips, commissions, overtime, and other forms of cash earnings, as applicable.

Salary does not include equity (stock) compensation, which can be a significant portion of pay for some executive and high-tech jobs. In addition, salary does not include cash value of retirement benefits, or value of other non-cash benefits (e.g. healthcare). Salaries were inflation-adjusted to current year dollars to ensure apples-to-apples comparison over the data collection period."

In [5]:
import pandas as pd

In [6]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [7]:
pd.set_option('display.max_rows', None)

In [8]:
df = pd.read_csv('payscale-all-majors-colleges.csv')

In [9]:
df = df.reset_index()

In [10]:
# by query
# early salary as a percentage of the highest early salary
# late salary as a percentage of highest late salary

avg_early = pysqldf("""
SELECT major, AVG(early), 
    AVG(
    early / (SELECT MAX(avg_early) FROM (SELECT AVG(early) avg_early FROM df GROUP BY major))
    ) AS as_pct_max
FROM df
GROUP BY major
""")

#avg_early

avg_late = pysqldf("""
SELECT major, AVG(late), 
    AVG(
    late / (SELECT MAX(avg_late) FROM (SELECT AVG(late) avg_late FROM df GROUP BY major))
    ) AS as_pct_max
FROM df
GROUP BY major
""")

#avg_late

pysqldf("""
SELECT
    ae.major, 
    ae.as_pct_max as pct_max_early, 
    al.as_pct_max as pct_max_late
FROM
    avg_early ae
JOIN
    avg_late al
ON
    ae.major = al.major

""")

Unnamed: 0,major,pct_max_early,pct_max_late
0,art,0.675873,0.651709
1,communication,0.693436,0.748773
2,computer-science,0.96908,0.95502
3,engineering,0.981709,0.952594
4,health-science,0.791484,0.670402
5,humanities,0.698443,0.719255
6,math,1.0,1.0
7,physical-and-life-sciences,0.745503,0.783604
8,social-sciences,0.700739,0.708938


In [13]:
# by query
# early salary as a percentage of the highest early salary
# late salary as a percentage of highest late salary

avg_early = pysqldf("""

WITH avg_early AS
(
SELECT major, AVG(early), 
    AVG(
    early / (SELECT MAX(avg_early) FROM (SELECT AVG(early) avg_early FROM df GROUP BY major))
    ) AS as_pct_max
FROM df
GROUP BY major
),

avg_mid AS
(
SELECT major, AVG(late), 
    AVG(
    late / (SELECT MAX(avg_late) FROM (SELECT AVG(late) avg_late FROM df GROUP BY major))
    ) AS as_pct_max
FROM df
GROUP BY major
) 

SELECT
    ae.major, 
    ae.as_pct_max as pct_max_early, 
    am.as_pct_max as pct_max_late
FROM
    avg_early ae
JOIN
    avg_mid am
ON
    ae.major = am.major

""")

In [14]:
avg_early

Unnamed: 0,major,pct_max_early,pct_max_late
0,art,0.675873,0.651709
1,communication,0.693436,0.748773
2,computer-science,0.96908,0.95502
3,engineering,0.981709,0.952594
4,health-science,0.791484,0.670402
5,humanities,0.698443,0.719255
6,math,1.0,1.0
7,physical-and-life-sciences,0.745503,0.783604
8,social-sciences,0.700739,0.708938


In [7]:
#df.groupby("major").describe()['late']

In [8]:
#df.groupby("major").describe()['early']

In [9]:
# let's see if there's a big difference if you only consider the top programs from each field. 
# I'll do this two ways. First, I'll just consider the top 10 for each program.
# then, I'll consider the top 10% for each program (since the number of programs by major varies
# i.e, there are a lot more computer science departments than math departments represented)

In [10]:
df_top_10_by_major = pysqldf("""
SELECT r.*
FROM
(
    SELECT
        r.*,
        ROW_NUMBER() OVER(PARTITION BY r.major
                          ORDER BY r.late DESC) rn
    FROM df r
) r
WHERE r.rn <= 10
ORDER BY late DESC
""")

In [11]:
df_top_10_by_major

Unnamed: 0.1,level_0,Unnamed: 0,index,school,early,late,major,rn
0,2750,2750,0,Harvard University,96600,205600,math,1
1,525,525,0,Harvey Mudd College,117700,189200,computer-science,1
2,526,526,1,Stanford University,119600,183100,computer-science,2
3,527,527,2,University of California-Berkeley,115700,181100,computer-science,3
4,528,528,3,Harvard University,103900,179400,computer-science,4
5,529,529,4,Carnegie Mellon University,109200,178300,computer-science,5
6,530,530,5,Massachusetts Institute of Technology,109300,177200,computer-science,6
7,531,531,6,Princeton University,115100,177100,computer-science,7
8,532,532,7,Columbia University in the City of New York,104300,175400,computer-science,8
9,533,533,8,Dartmouth College,113300,174900,computer-science,9


In [12]:
# by query
# early salary as a percentage of the highest early salary
# late salary as a percentage of highest late salary

avg_early = pysqldf("""
SELECT major, AVG(early), 
    AVG(
    early / (SELECT MAX(avg_early) FROM (SELECT AVG(early) avg_early FROM df_top_10_by_major GROUP BY major))
    ) AS as_pct_max
FROM df_top_10_by_major
GROUP BY major
""")

#avg_early

avg_late = pysqldf("""
SELECT major, AVG(late), 
    AVG(
    late / (SELECT MAX(avg_late) FROM (SELECT AVG(late) avg_late FROM df_top_10_by_major GROUP BY major))
    ) AS as_pct_max
FROM df_top_10_by_major
GROUP BY major
""")

#avg_late

pysqldf("""
SELECT
    ae.major, 
    ae.as_pct_max as pct_max_early, 
    al.as_pct_max as pct_max_late
FROM
    avg_early ae
JOIN
    avg_late al
ON
    ae.major = al.major

""")

Unnamed: 0,major,pct_max_early,pct_max_late
0,art,0.583979,0.66352
1,communication,0.577214,0.768827
2,computer-science,1.0,1.0
3,engineering,0.804628,0.89933
4,health-science,0.692123,0.69648
5,humanities,0.618514,0.742179
6,math,0.785047,0.893687
7,physical-and-life-sciences,0.692835,0.812514
8,social-sciences,0.703605,0.916257


In [13]:
# by query
# early salary as a percentage of the highest early salary
# late salary as a percentage of highest late salary
# now doing this by percent rather than by number

In [14]:
# easier to do this with pandas

In [15]:
a = 0.1

df_early_t10p = (df.groupby('major',group_keys=False)
        .apply(lambda x: x.nlargest(int(len(x) * a), 'early')))

df_late_t10p = (df.groupby('major',group_keys=False)
        .apply(lambda x: x.nlargest(int(len(x) * a), 'late')))

In [20]:
df_early_t10p.groupby('major').describe()

Unnamed: 0_level_0,level_0,level_0,level_0,level_0,level_0,level_0,level_0,level_0,Unnamed: 0,Unnamed: 0,...,early,early,late,late,late,late,late,late,late,late
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
major,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
art,40.0,1983.175,30.394743,1950.0,1959.75,1974.0,1993.25,2056.0,40.0,1983.175,...,63100.0,85200.0,40.0,106390.0,9746.999538,91900.0,100525.0,105800.0,111725.0,138700.0
communication,42.0,2878.833333,25.888991,2850.0,2861.25,2872.5,2890.0,2959.0,42.0,2878.833333,...,63300.0,73100.0,42.0,122435.714286,10990.102452,104300.0,114300.0,119200.0,129700.0,148300.0
computer-science,60.0,565.083333,33.550514,525.0,539.75,559.5,578.25,672.0,60.0,565.083333,...,99500.0,119600.0,60.0,157435.0,12884.68839,133900.0,149000.0,154700.0,165750.0,189200.0
engineering,40.0,2378.9,24.459071,2350.0,2359.75,2373.5,2392.25,2447.0,40.0,2378.9,...,88125.0,100800.0,40.0,150420.0,8219.52647,134100.0,144475.0,149050.0,157775.0,163800.0
health-science,65.0,3827.907692,47.491355,3775.0,3792.0,3811.0,3857.0,3973.0,65.0,3827.907692,...,74700.0,115200.0,65.0,104398.461538,11166.824363,90500.0,97200.0,101900.0,109700.0,164000.0
humanities,52.0,34.288462,32.468589,0.0,12.75,26.5,45.5,162.0,52.0,34.288462,...,67300.0,78900.0,52.0,118994.230769,9658.632499,97600.0,113025.0,118150.0,125800.0,145800.0
math,10.0,2756.8,6.425643,2750.0,2752.25,2755.5,2759.25,2771.0,10.0,2756.8,...,95200.0,101300.0,10.0,158030.0,17802.374998,141900.0,149275.0,154000.0,158800.0,205600.0
physical-and-life-sciences,50.0,3303.94,21.548626,3275.0,3287.25,3299.5,3316.75,3360.0,50.0,3303.94,...,72250.0,92100.0,50.0,127802.0,11427.723004,111700.0,118100.0,126200.0,135075.0,163100.0
social-sciences,82.0,1177.292683,51.83342,1125.0,1145.25,1166.5,1189.75,1470.0,82.0,1177.292683,...,73250.0,88900.0,82.0,135480.487805,16102.73682,90700.0,125625.0,132850.0,145325.0,174500.0


In [16]:
avg_early = pysqldf("""
SELECT major, AVG(early), 
    AVG(
    early / (SELECT MAX(avg_early) FROM (SELECT AVG(early) avg_early FROM df_early_t10p GROUP BY major))
    ) AS as_pct_max
FROM df_early_t10p
GROUP BY major
""")

#avg_early

avg_late = pysqldf("""
SELECT major, AVG(late), 
    AVG(
    late / (SELECT MAX(avg_late) FROM (SELECT AVG(late) avg_late FROM df_late_t10p GROUP BY major))
    ) AS as_pct_max
FROM df_late_t10p
GROUP BY major
""")

#avg_late

pysqldf("""
SELECT
    ae.major, 
    ae.as_pct_max as pct_max_early, 
    al.as_pct_max as pct_max_late
FROM
    avg_early ae
JOIN
    avg_late al
ON
    ae.major = al.major

""")

Unnamed: 0,major,pct_max_early,pct_max_late
0,art,0.641649,0.681925
1,communication,0.640847,0.77705
2,computer-science,1.0,0.999031
3,engineering,0.895911,0.954101
4,health-science,0.757244,0.665692
5,humanities,0.676717,0.753675
6,math,0.951914,1.0
7,physical-and-life-sciences,0.713982,0.803726
8,social-sciences,0.722089,0.860466
