In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Columns kept from the raw file, in the order the rest of the notebook relies on
# (they become spreadsheet columns A, B and C in the exported CSV).
SOURCE_COLUMNS = ['Major', 'Gender', 'Admission']

# `usecols` only filters: read_csv returns the columns in file order, not in the
# order listed. Selecting SOURCE_COLUMNS afterwards pins the order explicitly.
df = pd.read_csv('berkeley.csv', usecols=SOURCE_COLUMNS, nrows=2000)[SOURCE_COLUMNS]
df.head()

Unnamed: 0,Major,Gender,Admission
0,C,F,Rejected
1,B,M,Accepted
2,Other,F,Accepted
3,Other,M,Accepted
4,Other,M,Rejected


In [3]:
# Relabel the source columns for the exported sheet. Renaming by name (rather than
# assigning a positional list to df.columns) cannot attach a label to the wrong
# column if the column order ever differs, and re-running the cell is a no-op.
df = df.rename(columns={
    'Gender': 'Enrolled',
    'Admission': 'Graduated in 4 years',
})
df.head()

Unnamed: 0,Major,Enrolled,Graduated in 4 years
0,C,F,Rejected
1,B,M,Accepted
2,Other,F,Accepted
3,Other,M,Accepted
4,Other,M,Rejected


In [4]:
# Labels written into the 'Enrolled' column by this cell.
ENROLLMENT_COHORTS = ("2006-2010", "2011-2015")


def _to_enrollment_cohort(value):
    """Map a raw 'Enrolled' value to its cohort label.

    "M" becomes "2006-2010"; every other raw value becomes "2011-2015".
    Values that already are cohort labels are returned unchanged, so running
    the cell a second time no longer rewrites every row to "2011-2015".
    """
    if value in ENROLLMENT_COHORTS:
        return value
    return "2006-2010" if value == "M" else "2011-2015"


df['Enrolled'] = df['Enrolled'].map(_to_enrollment_cohort)
df.head()

Unnamed: 0,Major,Enrolled,Graduated in 4 years
0,C,2011-2015,Rejected
1,B,2006-2010,Accepted
2,Other,2011-2015,Accepted
3,Other,2006-2010,Accepted
4,Other,2006-2010,Rejected


In [5]:
# Labels written into the 'Graduated in 4 years' column by this cell.
GRADUATION_LABELS = ("Yes", "No")


def _to_graduation_label(value):
    """Map a raw 'Graduated in 4 years' value to "Yes" or "No".

    "Rejected" becomes "No"; every other raw value becomes "Yes".
    Values that already are "Yes"/"No" are returned unchanged, so running the
    cell a second time no longer turns every "No" into "Yes".
    """
    if value in GRADUATION_LABELS:
        return value
    return "No" if value == "Rejected" else "Yes"


df['Graduated in 4 years'] = df['Graduated in 4 years'].map(_to_graduation_label)
df.head()

Unnamed: 0,Major,Enrolled,Graduated in 4 years
0,C,2011-2015,No
1,B,2006-2010,Yes
2,Other,2011-2015,Yes
3,Other,2006-2010,Yes
4,Other,2006-2010,No


In [6]:
# Names of the eleven extra columns that will hold the summary table.
field_headers = [
    'I', 'II', 'III', 'IV', 'V', 'VI',
    'VII', 'VIII', 'IX', 'X', 'XI',
]

# Append every summary column in one step, each filled with empty strings.
df = df.assign(**dict.fromkeys(field_headers, ''))

df.head()

Unnamed: 0,Major,Enrolled,Graduated in 4 years,I,II,III,IV,V,VI,VII,VIII,IX,X,XI
0,C,2011-2015,No,,,,,,,,,,,
1,B,2006-2010,Yes,,,,,,,,,,,
2,Other,2011-2015,Yes,,,,,,,,,,,
3,Other,2006-2010,Yes,,,,,,,,,,,
4,Other,2006-2010,No,,,,,,,,,,,


In [7]:
# Pandas is making it very hard to do this
def load_eleven_values(df, row_number, values, headers=None):
    """Write one value into each summary column of a single row, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    row_number : index label
        Row to fill; it is looked up with ``df.loc``, i.e. as a label.
    values : sequence
        One entry per header, in the same order as ``headers``.
    headers : sequence of str, optional
        Target columns. When omitted, the notebook-level ``field_headers``
        list (eleven columns) is used; it is looked up at call time.

    Raises
    ------
    ValueError
        If ``values`` and ``headers`` differ in length. The check runs before
        any cell is written, so a bad list cannot leave the row half filled.
    """
    if headers is None:
        headers = field_headers
    if len(values) != len(headers):
        raise ValueError(
            "expected %d values, got %d" % (len(headers), len(values))
        )
    # Cell-by-cell assignment, pairing each column with its value.
    for header, value in zip(headers, values):
        df.loc[row_number, header] = value

# Layout of the exported sheet, by spreadsheet column:
#   A "Major", B "Enrolled", C "Graduated in 4 years",
#   D '' (spacer), E 'Major',
#   F 'BOTH-Total', G 'BOTH-Graduated', H 'BOTH-Percent',
#   I '2006-2010-Total', J '2006-2010-Graduated', K '2006-2010-Percent',
#   L '2011-2015-Total', M '2011-2015-Graduated', N '2011-2015-Percent'
#
# The CSV is written with a header row, so DataFrame row k lands on sheet row k + 2.
# The summary table is stored as formula strings in the first rows of columns D..N.

HEADER_ROWS = 1
first_data_row = HEADER_ROWS + 1
# Derived from the frame instead of hard-coding 2001, so the ranges (and the
# ROWS() total) stay correct if a different number of records is loaded.
last_data_row = HEADER_ROWS + len(df)

MAJOR_RANGE = f'A{first_data_row}:A{last_data_row}'
COHORT_RANGE = f'B{first_data_row}:B{last_data_row}'
GRADUATED_RANGE = f'C{first_data_row}:C{last_data_row}'

# Majors that get their own summary line.
SUMMARY_MAJORS = ['A', 'B', 'C', 'D', 'E', 'F']
# (cohort label, sheet column of its total, sheet column of its graduate count)
COHORT_COLUMNS = [('2006-2010', 'I', 'J'), ('2011-2015', 'L', 'M')]


def _percent(graduated_col, total_col, sheet_row):
    """Formula dividing the graduate count by the total on the same sheet row."""
    return f'={graduated_col}{sheet_row}/{total_col}{sheet_row}'


def _major_row(major, sheet_row):
    """Eleven summary cells for one major; its label sits in column E of sheet_row."""
    label = f'E{sheet_row}'
    cells = [
        '', major,
        f'=COUNTIF({MAJOR_RANGE}, {label})',
        f'=COUNTIFS({MAJOR_RANGE},{label},{GRADUATED_RANGE},"Yes")',
        _percent('G', 'F', sheet_row),
    ]
    for cohort, total_col, graduated_col in COHORT_COLUMNS:
        cells += [
            f'=COUNTIFS({MAJOR_RANGE}, {label}, {COHORT_RANGE},"{cohort}")',
            f'=COUNTIFS({MAJOR_RANGE},{label},{GRADUATED_RANGE},"Yes",  {COHORT_RANGE},"{cohort}")',
            _percent(graduated_col, total_col, sheet_row),
        ]
    return cells


def _total_row(sheet_row):
    """Eleven summary cells counting every record, whatever its major."""
    cells = [
        '', 'Total',
        f'=ROWS({MAJOR_RANGE})',
        f'=COUNTIFS({GRADUATED_RANGE},"Yes")',
        _percent('G', 'F', sheet_row),
    ]
    for cohort, total_col, graduated_col in COHORT_COLUMNS:
        cells += [
            f'=COUNTIFS({COHORT_RANGE},"{cohort}")',
            f'=COUNTIFS({GRADUATED_RANGE},"Yes",  {COHORT_RANGE},"{cohort}")',
            _percent(graduated_col, total_col, sheet_row),
        ]
    return cells


# Then, fill in calculated fields
summary_rows = [[
    '', 'Major',
    'BOTH-Total', 'BOTH-Graduated', 'BOTH-Percent',
    '2006-2010-Total', '2006-2010-Graduated', '2006-2010-Percent',
    '2011-2015-Total', '2011-2015-Graduated', '2011-2015-Percent',
]]
# Each new line goes into the next DataFrame row; len(summary_rows) is that row's
# position, and adding first_data_row turns it into the sheet row the formulas see.
for major in SUMMARY_MAJORS:
    summary_rows.append(_major_row(major, first_data_row + len(summary_rows)))
# NOTE: the Total line counts all records, including majors without their own
# line (the data shown above contains 'Other'), so it can exceed the sum of A..F.
summary_rows.append(_total_row(first_data_row + len(summary_rows)))

# Keep the original per-row names bound in case later cells refer to them.
(extra0, extra1, extra2, extra3,
 extra4, extra5, extra6, extra7) = summary_rows

for row_number, values in enumerate(summary_rows):
    load_eleven_values(df, row_number, values)

df.head()

Unnamed: 0,Major,Enrolled,Graduated in 4 years,I,II,III,IV,V,VI,VII,VIII,IX,X,XI
0,C,2011-2015,No,,Major,BOTH-Total,BOTH-Graduated,BOTH-Percent,2006-2010-Total,2006-2010-Graduated,2006-2010-Percent,2011-2015-Total,2011-2015-Graduated,2011-2015-Percent
1,B,2006-2010,Yes,,A,"=COUNTIF(A2:A2001, E3)","=COUNTIFS(A2:A2001,E3,C2:C2001,""Yes"")",=G3/F3,"=COUNTIFS(A2:A2001, E3, B2:B2001,""2006-2010"")","=COUNTIFS(A2:A2001,E3,C2:C2001,""Yes"", B2:B200...",=J3/I3,"=COUNTIFS(A2:A2001, E3, B2:B2001,""2011-2015"")","=COUNTIFS(A2:A2001,E3,C2:C2001,""Yes"", B2:B200...",=M3/L3
2,Other,2011-2015,Yes,,B,"=COUNTIF(A2:A2001, E4)","=COUNTIFS(A2:A2001,E4,C2:C2001,""Yes"")",=G4/F4,"=COUNTIFS(A2:A2001, E4, B2:B2001,""2006-2010"")","=COUNTIFS(A2:A2001,E4,C2:C2001,""Yes"", B2:B200...",=J4/I4,"=COUNTIFS(A2:A2001, E4, B2:B2001,""2011-2015"")","=COUNTIFS(A2:A2001,E4,C2:C2001,""Yes"", B2:B200...",=M4/L4
3,Other,2006-2010,Yes,,C,"=COUNTIF(A2:A2001, E5)","=COUNTIFS(A2:A2001,E5,C2:C2001,""Yes"")",=G5/F5,"=COUNTIFS(A2:A2001, E5, B2:B2001,""2006-2010"")","=COUNTIFS(A2:A2001,E5,C2:C2001,""Yes"", B2:B200...",=J5/I5,"=COUNTIFS(A2:A2001, E5, B2:B2001,""2011-2015"")","=COUNTIFS(A2:A2001,E5,C2:C2001,""Yes"", B2:B200...",=M5/L5
4,Other,2006-2010,No,,D,"=COUNTIF(A2:A2001, E6)","=COUNTIFS(A2:A2001,E6,C2:C2001,""Yes"")",=G6/F6,"=COUNTIFS(A2:A2001, E6, B2:B2001,""2006-2010"")","=COUNTIFS(A2:A2001,E6,C2:C2001,""Yes"", B2:B200...",=J6/I6,"=COUNTIFS(A2:A2001, E6, B2:B2001,""2011-2015"")","=COUNTIFS(A2:A2001,E6,C2:C2001,""Yes"", B2:B200...",=M6/L6


In [8]:
# Export for use in a spreadsheet. The header row is kept and the index is
# dropped: the formula strings reference cells on the basis that 'Major' is
# column A and the first record is on sheet row 2.
output_path = "berkeley_with_calculations.csv"
df.to_csv(output_path, index=False, header=True)

In [9]:
# Dimensions of the frame (rows, columns) after the summary columns were added.
df.shape

(2000, 14)

In [10]:
# NOTE(review): bare literal with no explanation; the cell only echoes the number.
# Presumably a scratch value (a row count?) -- confirm its meaning or delete the cell.
12764

12764