In [2]:
import pandas as pd

# Keep printed frames compact: narrow console width, at most 5 columns and
# 20 rows shown, and floats rendered with thousands separators and 2 decimals.
pd.options.display.width = 75
pd.options.display.max_columns = 5
pd.options.display.max_rows = 20
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:


# Load the NLS97 extract and index it by person id (no in-place mutation, so
# re-running this cell always rebuilds the frame from the file).
nls97 = pd.read_csv("data/nls97.csv").set_index("personid")

# Convert every object (string) column to the category dtype.
# Assigning the columns by name replaces them outright.  The earlier
# `nls97.loc[:, mask] = ...` form writes the values into the existing object
# columns under pandas >= 2.0, so the dtype silently stayed `object` and the
# later select_dtypes(include=["category"]) calls found nothing.
objcols = nls97.select_dtypes(include=["object"]).columns
nls97[objcols] = nls97[objcols].astype("category")



In [4]:
# list the category-typed columns and count the missing values in each one
catcols = nls97.select_dtypes(include="category").columns
nls97.loc[:, catcols].isna().sum()



gender                  0
maritalstatus        2312
weeklyhrscomputer    2274
weeklyhrstv          2273
highestdegree          31
                     ... 
colenroct15          1515
colenrfeb16          1948
colenroct16          2251
colenrfeb17          2251
colenroct17          2250
Length: 57, dtype: int64

In [5]:
# frequency of each marital status, most common first (missing values excluded)
nls97["maritalstatus"].value_counts()



Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

In [7]:
# same counts, but without ordering the result by frequency
nls97["maritalstatus"].value_counts(sort=False)



Divorced          663
Married          3066
Never-married    2766
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

In [8]:
# proportions rather than raw counts, still not ordered by frequency
nls97["maritalstatus"].value_counts(normalize=True, sort=False)


Divorced        0.10
Married         0.46
Never-married   0.41
Separated       0.02
Widowed         0.00
Name: maritalstatus, dtype: float64

In [9]:
# proportion of each response for all government responsibility variables
# (columns whose name contains "gov").  Series.value_counts is used because the
# top-level pd.value_counts function is deprecated as of pandas 2.1.
nls97.filter(like="gov").apply(
    lambda responses: responses.value_counts(normalize=True)
)



Unnamed: 0,govprovidejobs,govpricecontrols,...,govdecenthousing,govprotectenvironment
1. Definitely,0.25,0.54,...,0.44,0.67
2. Probably,0.34,0.33,...,0.43,0.29
3. Probably not,0.25,0.09,...,0.1,0.03
4. Definitely not,0.16,0.04,...,0.02,0.02


In [10]:
# proportion of each response for all government responsibility variables,
# restricted to respondents whose marital status is "Married".
# Series.value_counts replaces the deprecated top-level pd.value_counts, and
# .loc makes the row selection explicit.
(
    nls97.loc[nls97["maritalstatus"] == "Married"]
    .filter(like="gov")
    .apply(lambda responses: responses.value_counts(normalize=True))
)



Unnamed: 0,govprovidejobs,govpricecontrols,...,govdecenthousing,govprotectenvironment
1. Definitely,0.17,0.46,...,0.36,0.64
2. Probably,0.33,0.38,...,0.49,0.31
3. Probably not,0.31,0.11,...,0.12,0.03
4. Definitely not,0.18,0.05,...,0.03,0.01


In [19]:
# Write frequencies and percentages for every category variable in the data
# frame to views/frequencies.txt.
# Kept in a single cell with a context manager: the file is closed (and its
# buffer flushed) even if a column raises part-way through, and re-running the
# cell never leaves a stale open handle behind.
with open('views/frequencies.txt', 'w') as freqout:
    for col in nls97.select_dtypes(include=["category"]):
        # one section per column: name, separator, counts, then proportions
        print(col, "----------------------", "frequencies",
              nls97[col].value_counts(sort=False), "percentages",
              nls97[col].value_counts(normalize=True, sort=False),
              sep="\n\n", end="\n\n\n", file=freqout)