In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [36]:
df = pd.read_csv('cleaned_major_data.csv')

In [37]:
# remove rows where headcount == '<10'

df = df[df['headcount'] != '<10']

# convert 'headcount' to int

df['headcount'] = df['headcount'].astype(int)

In [38]:
df

Unnamed: 0,entry_term_cd,entry_year_trailing_summer_7,admit_level,start_major,end_major,end_status,section,headcount
26,18F,2018-19,Freshman,MATHEMATICS/ECONOMICS ...,MATHEMATICS/ECONOMICS ...,Enrolled,Fall 1 to Fall 2,39
40,18F,2018-19,Freshman,BIOCHEMISTRY ...,Not Enrolled,Not Enrolled,Fall 1 to Fall 2,13
49,18F,2018-19,Freshman,CIVIL ENGINEERING ...,CIVIL ENGINEERING ...,Enrolled,Fall 1 to Fall 2,81
52,18F,2018-19,Freshman,DESIGN / MEDIA ARTS ...,DESIGN / MEDIA ARTS ...,Enrolled,Fall 1 to Fall 2,24
58,18F,2018-19,Freshman,FINANCIAL ACTUARIAL MATHEMATICS ...,FINANCIAL ACTUARIAL MATHEMATICS ...,Enrolled,Fall 1 to Fall 2,44
...,...,...,...,...,...,...,...,...
12793,18F,2018-19,Freshman,ECONOMICS ...,ECONOMICS ...,Graduated,Fall 5 to Fall 6,16
12797,18F,2018-19,Freshman,ELECTRICAL ENGINEERING ...,ELECTRICAL ENGINEERING ...,Graduated,Fall 5 to Fall 6,12
12841,18F,2018-19,Freshman,MECHANICAL ENGINEERING ...,MECHANICAL ENGINEERING ...,Graduated,Fall 5 to Fall 6,25
12867,18F,2018-19,Freshman,POLITICAL SCIENCE ...,POLITICAL SCIENCE ...,Graduated,Fall 5 to Fall 6,12


In [39]:
# Remove trailing whitespace in 'start_major' and 'end_major'
df['start_major'] = df['start_major'].str.strip()
df['end_major'] = df['end_major'].str.strip()

In [40]:
majors1 = df['start_major'].tolist()
majors2 = df['end_major'].tolist()
sum_majors = majors1 + majors2
all_majors = list(set(sum_majors))
all_majors

['CHEMICAL ENGINEERING',
 'UNDECLARED',
 'ELECTRICAL ENGINEERING',
 'COMPUTER ENGINEERING',
 'Not Enrolled',
 'CHEMISTRY',
 'MATHEMATICS/ECONOMICS',
 'BIOPHYSICS',
 'ASTROPHYSICS',
 'FILM AND TELEVISION',
 'PUBLIC AFFAIRS',
 'APPLIED MATHEMATICS',
 'SOCIOLOGY',
 'MATERIALS ENGINEERING',
 'JAPANESE',
 'LINGUISTICS AND COMPUTER SCIENCE',
 'DANCE',
 'HUMAN BIOLOGY AND SOCIETY - BA',
 'NEUROSCIENCE',
 'LINGUISTICS',
 'MUSIC EDUCATION',
 'BUSINESS ECONOMICS',
 'ASIAN STUDIES',
 'MATHEMATICS OF COMPUTATION',
 'DESIGN / MEDIA ARTS',
 'PSYCHOLOGY',
 'LABOR STUDIES',
 'MATHEMATICS',
 'MUSIC',
 'MOLECULAR, CELL, AND DEVELOPMENTAL BIOLOGY',
 'MUSIC HISTORY AND INDUSTRY',
 'AEROSPACE ENGINEERING',
 'CHICANA AND CHICANO STUDIES',
 'PHILOSOPHY',
 'COMPUTER SCIENCE',
 'MATHEMATICS/APPLIED SCIENCE',
 'ARCHITECTURAL STUDIES',
 'GLOBAL JAZZ STUDIES',
 'AFRICAN AMERICAN STUDIES',
 'ENGLISH',
 'MUSIC INDUSTRY',
 'NURSING-PRELICENSURE',
 'LINGUISTICS AND PSYCHOLOGY',
 'PHYSICS',
 'EDUCATION AND SOCIAL TRAN

In [41]:
# Convert the DataFrame to a list of dictionaries
students = df.to_dict('records')

students

[{'entry_term_cd': '18F',
  'entry_year_trailing_summer_7': '2018-19',
  'admit_level': 'Freshman',
  'start_major': 'MATHEMATICS/ECONOMICS',
  'end_major': 'MATHEMATICS/ECONOMICS',
  'end_status': 'Enrolled',
  'section': 'Fall 1 to Fall 2',
  'headcount': 39},
 {'entry_term_cd': '18F',
  'entry_year_trailing_summer_7': '2018-19',
  'admit_level': 'Freshman',
  'start_major': 'BIOCHEMISTRY',
  'end_major': 'Not Enrolled',
  'end_status': 'Not Enrolled',
  'section': 'Fall 1 to Fall 2',
  'headcount': 13},
 {'entry_term_cd': '18F',
  'entry_year_trailing_summer_7': '2018-19',
  'admit_level': 'Freshman',
  'start_major': 'CIVIL ENGINEERING',
  'end_major': 'CIVIL ENGINEERING',
  'end_status': 'Enrolled',
  'section': 'Fall 1 to Fall 2',
  'headcount': 81},
 {'entry_term_cd': '18F',
  'entry_year_trailing_summer_7': '2018-19',
  'admit_level': 'Freshman',
  'start_major': 'DESIGN / MEDIA ARTS',
  'end_major': 'DESIGN / MEDIA ARTS',
  'end_status': 'Enrolled',
  'section': 'Fall 1 to Fal

In [42]:
# Initialize a dictionary to store counts for each major transition
major_transitions = {}

# Initialize counters for each major transition
for start_major in all_majors:
    for end_major in all_majors:
        major_transitions[(start_major, end_major)] = 0

In [43]:
major_transitions

{('CHEMICAL ENGINEERING', 'CHEMICAL ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'UNDECLARED'): 0,
 ('CHEMICAL ENGINEERING', 'ELECTRICAL ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'COMPUTER ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'Not Enrolled'): 0,
 ('CHEMICAL ENGINEERING', 'CHEMISTRY'): 0,
 ('CHEMICAL ENGINEERING', 'MATHEMATICS/ECONOMICS'): 0,
 ('CHEMICAL ENGINEERING', 'BIOPHYSICS'): 0,
 ('CHEMICAL ENGINEERING', 'ASTROPHYSICS'): 0,
 ('CHEMICAL ENGINEERING', 'FILM AND TELEVISION'): 0,
 ('CHEMICAL ENGINEERING', 'PUBLIC AFFAIRS'): 0,
 ('CHEMICAL ENGINEERING', 'APPLIED MATHEMATICS'): 0,
 ('CHEMICAL ENGINEERING', 'SOCIOLOGY'): 0,
 ('CHEMICAL ENGINEERING', 'MATERIALS ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'JAPANESE'): 0,
 ('CHEMICAL ENGINEERING', 'LINGUISTICS AND COMPUTER SCIENCE'): 0,
 ('CHEMICAL ENGINEERING', 'DANCE'): 0,
 ('CHEMICAL ENGINEERING', 'HUMAN BIOLOGY AND SOCIETY - BA'): 0,
 ('CHEMICAL ENGINEERING', 'NEUROSCIENCE'): 0,
 ('CHEMICAL ENGINEERING', 'LINGUISTICS'): 0,
 ('C

In [45]:
for student in students:
    
    start_major = student['start_major']
    end_major = student['end_major']
    headcount = student['headcount']
    major_transitions[(start_major, end_major)] += headcount

In [46]:
major_transitions

{('CHEMICAL ENGINEERING', 'CHEMICAL ENGINEERING'): 895,
 ('CHEMICAL ENGINEERING', 'UNDECLARED'): 0,
 ('CHEMICAL ENGINEERING', 'ELECTRICAL ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'COMPUTER ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'Not Enrolled'): 0,
 ('CHEMICAL ENGINEERING', 'CHEMISTRY'): 0,
 ('CHEMICAL ENGINEERING', 'MATHEMATICS/ECONOMICS'): 0,
 ('CHEMICAL ENGINEERING', 'BIOPHYSICS'): 0,
 ('CHEMICAL ENGINEERING', 'ASTROPHYSICS'): 0,
 ('CHEMICAL ENGINEERING', 'FILM AND TELEVISION'): 0,
 ('CHEMICAL ENGINEERING', 'PUBLIC AFFAIRS'): 0,
 ('CHEMICAL ENGINEERING', 'APPLIED MATHEMATICS'): 0,
 ('CHEMICAL ENGINEERING', 'SOCIOLOGY'): 0,
 ('CHEMICAL ENGINEERING', 'MATERIALS ENGINEERING'): 0,
 ('CHEMICAL ENGINEERING', 'JAPANESE'): 0,
 ('CHEMICAL ENGINEERING', 'LINGUISTICS AND COMPUTER SCIENCE'): 0,
 ('CHEMICAL ENGINEERING', 'DANCE'): 0,
 ('CHEMICAL ENGINEERING', 'HUMAN BIOLOGY AND SOCIETY - BA'): 0,
 ('CHEMICAL ENGINEERING', 'NEUROSCIENCE'): 0,
 ('CHEMICAL ENGINEERING', 'LINGUISTICS'): 0,
 (

In [47]:
# print all major transitions with headcount > 0

for key, value in major_transitions.items():
    if value > 0:
        print(key, value)

('CHEMICAL ENGINEERING', 'CHEMICAL ENGINEERING') 895
('ELECTRICAL ENGINEERING', 'ELECTRICAL ENGINEERING') 1708
('ELECTRICAL ENGINEERING', 'COMPUTER ENGINEERING') 40
('ELECTRICAL ENGINEERING', 'COMPUTER SCIENCE') 24
('COMPUTER ENGINEERING', 'COMPUTER ENGINEERING') 271
('CHEMISTRY', 'CHEMISTRY') 743
('CHEMISTRY', 'BIOCHEMISTRY') 45
('MATHEMATICS/ECONOMICS', 'MATHEMATICS/ECONOMICS') 884
('BIOPHYSICS', 'BIOPHYSICS') 22
('ASTROPHYSICS', 'ASTROPHYSICS') 393
('FILM AND TELEVISION', 'FILM AND TELEVISION') 292
('PUBLIC AFFAIRS', 'PUBLIC AFFAIRS') 1005
('APPLIED MATHEMATICS', 'APPLIED MATHEMATICS') 1394
('APPLIED MATHEMATICS', 'MATHEMATICS OF COMPUTATION') 13
('SOCIOLOGY', 'Not Enrolled') 195
('SOCIOLOGY', 'SOCIOLOGY') 4510
('MATERIALS ENGINEERING', 'MATERIALS ENGINEERING') 382
('JAPANESE', 'JAPANESE') 96
('LINGUISTICS AND COMPUTER SCIENCE', 'LINGUISTICS AND COMPUTER SCIENCE') 322
('DANCE', 'DANCE') 272
('HUMAN BIOLOGY AND SOCIETY - BA', 'HUMAN BIOLOGY AND SOCIETY - BA') 192
('NEUROSCIENCE', 'No