LabLogBook w2

In [2]:
import pandas as pd
import numpy as np

# The reduction amount n comes from the final digit of the student ID (SID).
sid_last_digit = 1  # the SID ends in ...91
# A final digit of 0 is replaced by 10; any other digit is used unchanged.
if sid_last_digit == 0:
    n = 10
else:
    n = sid_last_digit
print(f"Last digit of SID: {sid_last_digit}", f"Value of n: {n}", sep="\n")

# No real dataset was supplied, so a small hand-written sample is used instead.
# It mimics a typical census/relationship table; each tuple below is one record.
_sample_columns = ('relationship', 'hours-per-week', 'age', 'education')
_sample_rows = [
    ('Husband', 40, 35, 'Bachelors'),
    ('Wife', 35, 32, 'Masters'),
    ('Husband', 50, 40, 'HS-grad'),
    ('Unmarried', 45, 28, 'Bachelors'),
    ('Wife', 30, 31, 'HS-grad'),
    ('Husband', 60, 45, 'Masters'),
    ('Unmarried', 40, 26, 'HS-grad'),
    ('Own-child', 20, 18, 'HS-grad'),
    ('Husband', 55, 38, 'Bachelors'),
    ('Wife', 25, 29, 'Masters'),
]
# Transpose the records into the column -> list-of-values mapping the frame is built from.
data = {
    column: list(values)
    for column, values in zip(_sample_columns, zip(*_sample_rows))
}

df = pd.DataFrame(data)
print("\nOriginal DataFrame:", df, sep="\n")
print("\nOriginal DataFrame shape:", df.shape)

# Function to reduce hours-per-week by n (or by an explicitly supplied amount)
def reduce_hours(hours, reduction=None):
    """Return ``hours`` lowered by ``reduction``.

    Parameters
    ----------
    hours
        The value to reduce; anything that supports ``-`` with ``reduction``.
    reduction : optional
        Amount to subtract. When left as ``None`` the notebook-level ``n`` is
        looked up at call time, so ``series.apply(reduce_hours)`` keeps
        subtracting ``n`` exactly as before.

    Returns
    -------
    The result of ``hours - reduction``.

    Raises
    ------
    NameError
        If ``reduction`` is omitted and no global ``n`` has been defined.
    """
    # Compare against None (not truthiness) so an explicit reduction of 0 is honoured.
    if reduction is None:
        reduction = n
    return hours - reduction

# Build the reduced column by passing each "hours-per-week" value through reduce_hours
original_hours = df['hours-per-week']
df['reduced-hours-per-week'] = original_hours.apply(reduce_hours)

print(f"\nDataFrame after reducing 'hours-per-week' by {n}:", df, sep="\n")

# Grouping 1: 'relationship' alone -- report group sizes and mean original hours
group1 = df.groupby('relationship')
print(
    "\nFirst grouping - by 'relationship' only:",
    "Size of each group:",
    group1.size(),
    "\nMean hours-per-week by relationship:",
    group1['hours-per-week'].mean(),
    sep="\n",
)

# Grouping 2a: 'relationship' together with the original "hours-per-week"
group2_original = df.groupby(['relationship', 'hours-per-week'])
print(
    "\nGrouping by 'relationship' and original 'hours-per-week':",
    "Group sizes:",
    group2_original.size(),
    sep="\n",
)

# Grouping 2b: 'relationship' together with the reduced hours column
group2_reduced = df.groupby(['relationship', 'reduced-hours-per-week'])
print(
    f"\nGrouping by 'relationship' and reduced 'hours-per-week' (reduced by {n}):",
    "Group sizes:",
    group2_reduced.size(),
    sep="\n",
)

# Walk through the final grouping and show the member rows of every group
print("\nDetailed analysis - Grouping by 'relationship' and reduced 'hours-per-week':")
detail_columns = ['age', 'education', 'hours-per-week', 'reduced-hours-per-week']
for name, group in group2_reduced:
    print(
        f"\nGroup: {name}",
        f"Number of records: {len(group)}",
        group[detail_columns],
        sep="\n",
    )

Last digit of SID: 1
Value of n: 1

Original DataFrame:
  relationship  hours-per-week  age  education
0      Husband              40   35  Bachelors
1         Wife              35   32    Masters
2      Husband              50   40    HS-grad
3    Unmarried              45   28  Bachelors
4         Wife              30   31    HS-grad
5      Husband              60   45    Masters
6    Unmarried              40   26    HS-grad
7    Own-child              20   18    HS-grad
8      Husband              55   38  Bachelors
9         Wife              25   29    Masters

Original DataFrame shape: (10, 4)

DataFrame after reducing 'hours-per-week' by 1:
  relationship  hours-per-week  age  education  reduced-hours-per-week
0      Husband              40   35  Bachelors                      39
1         Wife              35   32    Masters                      34
2      Husband              50   40    HS-grad                      49
3    Unmarried              45   28  Bachelors             