# *Aufgabe: Einführung in Python*

1. Lesen Sie den Datensatz mit Pandas ein:

In [1]:
# define all imports used in the notebook in the first cell

import pandas as pd
import os

In [2]:
# navigate to the relative path where the data is saved
file_path = os.path.join("..", "..", "..", "data", "census.csv")

# read csv with pandas package
census = pd.read_csv(file_path)

# show all column names in the data frame
print(census.columns)

# show the first 10 rows to investigate
census.head(10)


Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country', 'target'],
      dtype='object')


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [3]:
# Pop the columns:
# define the list of columns to remove
to_pop = ["occupation", "relationship", "capital-gain", "capital-loss", "native-country", "education", "workclass", "marital-status"]

# go through the list and remove them individually
for col in to_pop:
    census.pop(col)

census.head()

Unnamed: 0,age,race,sex,hours-per-week,target
0,39,White,Male,40,<=50K
1,50,White,Male,13,<=50K
2,38,White,Male,40,<=50K
3,53,Black,Male,40,<=50K
4,28,Black,Female,40,<=50K


In [4]:
# Replace categorical values with numerical values

# define a dictionary to replace the values
sex_dict = {" Male": 1, " Female": 0}
# replace the values in the specific columns with the dictionary
census["sex"] = census["sex"].replace(sex_dict)

# do the same for the other categorical columns
race_dict = {
    " White": 3,
    " Black": 2,
    " Asian-Pac-Islander": 4,
    " Amer-Indian-Eskimo": 1,
    " Other": 0,
}
census["race"] = census["race"].replace(race_dict)

# the dictionary can also be given directly to the function:
census["target"] = census["target"].replace({' >50K': 1, ' <=50K': 0})

census.head()

  census["sex"] = census["sex"].replace(sex_dict)
  census["race"] = census["race"].replace(race_dict)
  census["target"] = census["target"].replace({' >50K': 1, ' <=50K': 0})


Unnamed: 0,age,race,sex,hours-per-week,target
0,39,3,1,40,0
1,50,3,1,13,0
2,38,3,1,40,0
3,53,2,1,40,0
4,28,2,0,40,0


2. Geben Sie eine Statistische Zusammenfassung der Daten aus:

In [None]:
# overview of all columns easily available with one call
census.describe()

age               38.581647
race               2.891895
sex                0.669205
hours-per-week    40.437456
target             0.240810
dtype: float64

3. Teilen Sie die Daten mittels Train-Test-Split von Sklearn in Trainings- und Testdaten

In [None]:
from sklearn.model_selection import train_test_split

# split the data frame randomly into train/ test, test size
train, test = train_test_split(census, test_size=0.3)

4. Analysieren Sie mit Numpy, ob die Features in Test- und Trainingsdatensatz sich in den Mittelwerten unterscheiden.

In [None]:
import numpy as np

def calculate_mean(dataframe, feature):
    """Calculate the mean for a feature in a data frame using numpy."""
    return np.mean(dataframe[feature].to_numpy())


print('Train: {}'.format(train.shape))
print('Test: {}'.format(test.shape))

# go through the features to get the mean for both train and test values
for feature in census.columns:
    print("Feature: {}, mean train: {}, mean test: {}".format(feature, calculate_mean(train, feature), calculate_mean(test, feature)))

Train: (22792, 5)
Test: (9769, 5)
Feature: age, mean train: 38.572700947700945, mean test: 38.60251816972055
Feature: race, mean train: 2.891891891891892, mean test: 2.8919029583375986
Feature: sex, mean train: 0.6711126711126711, mean test: 0.6647558603746545
Feature: hours-per-week, mean train: 40.48433660933661, mean test: 40.3280786160303
Feature: target, mean train: 0.2414882414882415, mean test: 0.23922612345173508


In [8]:
# Additional: function to compare the mean for different split rates

def train_test_mean(split_rate: float):
    """Show the mean for all features and train/test data based on split rate.

    Args:
        split_rate (float): Number between 0 and 1 to indicate percentage of data to use for test.
    """

    # split the data frame randomly into train/ test based on the split rate
    train, test = train_test_split(census, test_size=split_rate)

    # Output information
    print("\n---------split rate: {}---------".format(split_rate))

    print("Train: {}".format(train.shape))
    print("Test: {}".format(test.shape))

    # go through the features to get the mean for both train and test values
    for feature in census.columns:
        print(
            "Feature: {}, mean train: {}, mean test: {}".format(
                feature, calculate_mean(train, feature), calculate_mean(test, feature)
            )
        )


In [9]:
train_test_mean(0.2)
train_test_mean(0.3)
train_test_mean(0.4)


---------split rate: 0.2---------
Train: (26048, 5)
Test: (6513, 5)
Feature: age, mean train: 38.55862254299754, mean test: 38.67372946414863
Feature: race, mean train: 2.8898955773955772, mean test: 2.8998925226470136
Feature: sex, mean train: 0.668458230958231, mean test: 0.6721940733916781
Feature: hours-per-week, mean train: 40.43930436117936, mean test: 40.430062951021036
Feature: target, mean train: 0.23913544226044225, mean test: 0.24750499001996007

---------split rate: 0.3---------
Train: (22792, 5)
Test: (9769, 5)
Feature: age, mean train: 38.599640224640225, mean test: 38.539666291329716
Feature: race, mean train: 2.891979641979642, mean test: 2.8916982290920257
Feature: sex, mean train: 0.6694015444015444, mean test: 0.6687480806633228
Feature: hours-per-week, mean train: 40.471130221130224, mean test: 40.358890367488996
Feature: target, mean train: 0.23982098982098982, mean test: 0.24311597911761695

---------split rate: 0.4---------
Train: (19536, 5)
Test: (13025, 5)
Fea