- Importing modules
- Importing data from JSON

In [1]:
import pandas as pd
import os, bisect, math

# Locate the grouped age data set and resolve it to an absolute path
# (relative paths are resolved against the current working directory).
relative_data_path = "./data/data_165.json"
data_file_path = os.path.abspath(relative_data_path)

# Load the class limits and the per-class male/female counts into a DataFrame.
df = pd.read_json(data_file_path)

# To re-check the notebook against the book's worked example on page 125,
# swap in the CSV data set instead:
# data_file_path = os.path.abspath("./data/data_125.csv")
# df = pd.read_csv(data_file_path)

- Creating Midpoint column

In [2]:
# Class midpoint x_i: the mean of each class's lower and upper limit.
class_limit_total = df["lower_limit"].add(df["upper_limit"])
df["midpoint"] = class_limit_total.div(2)
df.head()

Unnamed: 0,lower_limit,upper_limit,male_occurance,female_occurance,midpoint
0,0,4,7963,7631,2.0
1,5,9,9482,9213,7.0
2,10,14,7175,6267,12.0
3,15,19,4819,4881,17.0
4,20,24,4356,5009,22.0


- Creating Cumulative Frequency column

In [3]:
# Running totals of the per-class counts, one new column per sex.
# Maps each new cumulative column to the frequency column it accumulates.
cumulative_columns = {
    "cumulative_frequency_male": "male_occurance",
    "cumulative_frequency_female": "female_occurance",
}
for cumulative_name, frequency_name in cumulative_columns.items():
    df[cumulative_name] = df[frequency_name].cumsum()
df.head(10)

Unnamed: 0,lower_limit,upper_limit,male_occurance,female_occurance,midpoint,cumulative_frequency_male,cumulative_frequency_female
0,0,4,7963,7631,2.0,7963,7631
1,5,9,9482,9213,7.0,17445,16844
2,10,14,7175,6267,12.0,24620,23111
3,15,19,4819,4881,17.0,29439,27992
4,20,24,4356,5009,22.0,33795,33001
5,25,29,4537,4934,27.0,38332,37935
6,30,34,3495,3502,32.0,41827,41437
7,35,39,3367,2782,37.0,45194,44219
8,40,44,2519,2215,42.0,47713,46434
9,45,49,1958,1669,47.0,49671,48103


- Creating $f_ix_i$ column


In [4]:
# f_i * x_i per class: class frequency weighted by the class midpoint.
class_midpoints = df["midpoint"]
df["male_fx_i"] = df["male_occurance"].mul(class_midpoints)
df["female_fx_i"] = df["female_occurance"].mul(class_midpoints)

- Calculating $n = \sum f_i$

In [5]:
# n = sum of the class frequencies, computed separately for each sex.
male_counts = df["male_occurance"]
female_counts = df["female_occurance"]
sample_size_of_male = male_counts.sum()
sample_size_of_female = female_counts.sum()

- Calculating $\sum f_ix_i$

In [6]:
# Totals of the f_i * x_i columns (numerators of the grouped-data means).
sum_over_male_fx_i, sum_over_female_fx_i = (
    df[weighted_column].sum() for weighted_column in ("male_fx_i", "female_fx_i")
)

- Calculating the average ages for male and female individually
- Formula $\bar{x} = \frac{\sum f_ix_i }{\sum f_i}$

In [7]:
# Grouped-data mean per sex: the f_i * x_i total divided by the sample size.
# First element is \bar{x}_male, second is \bar{x}_female.
average_age_of_male, average_age_of_female = (
    sum_over_male_fx_i / sample_size_of_male,
    sum_over_female_fx_i / sample_size_of_female,
)


- Calculating combined average
- Formula $\bar{x} = \frac{n_m \bar{x}_m + n_f \bar{x}_f}{n_m + n_f}$

In [8]:
# Combined mean: each group's mean weighted by its sample size,
# divided by the pooled sample size.
weighted_age_total = (
    sample_size_of_male * average_age_of_male
    + sample_size_of_female * average_age_of_female
)
combined_sample_size = sample_size_of_male + sample_size_of_female
combined_average_age = weighted_age_total / combined_sample_size


- Printing results of Question A

In [9]:
# Assemble the Question A report first, then emit it in a single print call.
question_a_report = f"""
average_age_of_male: {average_age_of_male}
average_age_of_female: {average_age_of_female}
combined_average_age: {combined_average_age}
        """
print(question_a_report)


average_age_of_male: 22.752962903196888
average_age_of_female: 22.095991442405136
combined_average_age: 22.431968017311732
        


**Calculating 3rd Quartile**

In [10]:
# Grouped-data quantile by linear interpolation inside the quantile class:
#     Q_r = l + (h / f) * (n * r / denominator - F)
# l = lower limit of the quantile class, h = class width, f = frequency of
# that class, F = cumulative frequency of all classes before it.
r = 3            # which quantile: the 3rd ...
h = 5            # class width (lower limits step 0, 5, 10, ...)
denominator = 4  # ... of 4 equal parts, i.e. a quartile

# Exact (unrounded) position of the quantile in the cumulative distribution.
position_male = sample_size_of_male * r / denominator
position_female = sample_size_of_female * r / denominator

# Rounded-up ranks, kept under their original names for any later cell that
# reads them. Despite the name they describe the requested quantile, not the
# median, and they are no longer used to pick the class.
estimated_median_male = math.ceil(position_male)
estimated_median_female = math.ceil(position_female)

cumalitive_freq_col_male = df["cumulative_frequency_male"].tolist()
cumalitive_freq_col_female = df["cumulative_frequency_female"].tolist()

# Quantile class = first class whose cumulative frequency reaches the position.
# bisect_left on the exact position is used because bisect_right on the
# rounded-up rank skips to the next class when that rank equals a cumulative
# frequency, and runs off the end of the table when the position equals n.
class_idx_male = bisect.bisect_left(cumalitive_freq_col_male, position_male)
class_idx_female = bisect.bisect_left(cumalitive_freq_col_female, position_female)

lr_m = df["lower_limit"].iloc[class_idx_male]
lr_f = df["lower_limit"].iloc[class_idx_female]

fr_m = df["male_occurance"].iloc[class_idx_male]
fr_f = df["female_occurance"].iloc[class_idx_female]

# Cumulative frequency before the quantile class. Nothing precedes the first
# class, so use 0 there; iloc[-1] would silently wrap around to the last row.
Fr_m = df["cumulative_frequency_male"].iloc[class_idx_male - 1] if class_idx_male > 0 else 0
Fr_f = df["cumulative_frequency_female"].iloc[class_idx_female - 1] if class_idx_female > 0 else 0

Qr_m = lr_m + (h / fr_m) * (position_male - Fr_m)
Qr_f = lr_f + (h / fr_f) * (position_female - Fr_f)

print(Qr_m,Qr_f)

35.00408375408375 32.89762992575671


**Calculating 78th Percentile**

In [11]:
# Grouped-data quantile by linear interpolation inside the quantile class:
#     P_r = l + (h / f) * (n * r / denominator - F)
# l = lower limit of the quantile class, h = class width, f = frequency of
# that class, F = cumulative frequency of all classes before it.
r = 78             # which quantile: the 78th ...
h = 5              # class width (lower limits step 0, 5, 10, ...)
denominator = 100  # ... of 100 equal parts, i.e. a percentile

# Exact (unrounded) position of the quantile in the cumulative distribution.
position_male = sample_size_of_male * r / denominator
position_female = sample_size_of_female * r / denominator

# Rounded-up ranks, kept under their original names for any later cell that
# reads them. Despite the name they describe the requested quantile, not the
# median, and they are no longer used to pick the class.
estimated_median_male = math.ceil(position_male)
estimated_median_female = math.ceil(position_female)

cumalitive_freq_col_male = df["cumulative_frequency_male"].tolist()
cumalitive_freq_col_female = df["cumulative_frequency_female"].tolist()

# Quantile class = first class whose cumulative frequency reaches the position.
# bisect_left on the exact position is used because bisect_right on the
# rounded-up rank skips to the next class when that rank equals a cumulative
# frequency, and runs off the end of the table when the position equals n.
class_idx_male = bisect.bisect_left(cumalitive_freq_col_male, position_male)
class_idx_female = bisect.bisect_left(cumalitive_freq_col_female, position_female)

lr_m = df["lower_limit"].iloc[class_idx_male]
lr_f = df["lower_limit"].iloc[class_idx_female]

fr_m = df["male_occurance"].iloc[class_idx_male]
fr_f = df["female_occurance"].iloc[class_idx_female]

# Cumulative frequency before the quantile class. Nothing precedes the first
# class, so use 0 there; iloc[-1] would silently wrap around to the last row.
Fr_m = df["cumulative_frequency_male"].iloc[class_idx_male - 1] if class_idx_male > 0 else 0
Fr_f = df["cumulative_frequency_female"].iloc[class_idx_female - 1] if class_idx_female > 0 else 0

Qr_m = lr_m + (h / fr_m) * (position_male - Fr_m)
Qr_f = lr_f + (h / fr_f) * (position_female - Fr_f)

print(Qr_m,Qr_f)

37.488773388773396 35.226599568655644


**Calculating 6th Decile**

In [12]:
# Grouped-data quantile by linear interpolation inside the quantile class:
#     D_r = l + (h / f) * (n * r / denominator - F)
# l = lower limit of the quantile class, h = class width, f = frequency of
# that class, F = cumulative frequency of all classes before it.
r = 6             # which quantile: the 6th ...
h = 5             # class width (lower limits step 0, 5, 10, ...)
denominator = 10  # ... of 10 equal parts, i.e. a decile

# Exact (unrounded) position of the quantile in the cumulative distribution.
position_male = sample_size_of_male * r / denominator
position_female = sample_size_of_female * r / denominator

# Rounded-up ranks, kept under their original names for any later cell that
# reads them. Despite the name they describe the requested quantile, not the
# median, and they are no longer used to pick the class.
estimated_median_male = math.ceil(position_male)
estimated_median_female = math.ceil(position_female)

cumalitive_freq_col_male = df["cumulative_frequency_male"].tolist()
cumalitive_freq_col_female = df["cumulative_frequency_female"].tolist()

# Quantile class = first class whose cumulative frequency reaches the position.
# bisect_left on the exact position is used because bisect_right on the
# rounded-up rank skips to the next class when that rank equals a cumulative
# frequency, and runs off the end of the table when the position equals n.
class_idx_male = bisect.bisect_left(cumalitive_freq_col_male, position_male)
class_idx_female = bisect.bisect_left(cumalitive_freq_col_female, position_female)

lr_m = df["lower_limit"].iloc[class_idx_male]
lr_f = df["lower_limit"].iloc[class_idx_female]

fr_m = df["male_occurance"].iloc[class_idx_male]
fr_f = df["female_occurance"].iloc[class_idx_female]

# Cumulative frequency before the quantile class. Nothing precedes the first
# class, so use 0 there; iloc[-1] would silently wrap around to the last row.
Fr_m = df["cumulative_frequency_male"].iloc[class_idx_male - 1] if class_idx_male > 0 else 0
Fr_f = df["cumulative_frequency_female"].iloc[class_idx_female - 1] if class_idx_female > 0 else 0

Qr_m = lr_m + (h / fr_m) * (position_male - Fr_m)
Qr_f = lr_f + (h / fr_f) * (position_female - Fr_f)

print(Qr_m,Qr_f)

24.619834710743806 23.972449590736673
