<p align="center">
  <img src="../Images/datasafari-logo-primary.png" width="300">
</p>

---

# **Working with Data in Python (NumPy & Pandas)**
***Complete Code Solutions***

In [None]:
pip install numpy pandas

In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd

## **NumPy Arrays & Operations**

In [2]:
# Example: Monthly rainfall (12 months)
rainfall_1d = np.array([120, 90, 85, 100, 150, 130, 160, 140, 110, 95, 80, 105])
print("Original shape:", rainfall_1d.shape)

Original shape: (12,)


In [3]:
# Reshape to 3 regions × 4 months
rainfall_grid1 = rainfall_1d.reshape(3, 4)
print("Reshaped (3×4):\n", rainfall_grid1)

Reshaped (3×4):
 [[120  90  85 100]
 [150 130 160 140]
 [110  95  80 105]]


In [4]:
# Try to reshape to 4×4 → WILL RAISE ERROR (intentionally)
rainfall_grid2 = rainfall_1d.reshape(4, 4)  # ValueError: cannot reshape array of size 12 into shape (4,4)

ValueError: cannot reshape array of size 12 into shape (4,4)

In [5]:
# Slicing and indexing on rainfall_grid1
print("Value at row 0, column 2:", rainfall_grid1[0, 2])
print("All rows from row 1 onward:\n", rainfall_grid1[1:, :])

Value at row 0, column 2: 85
All rows from row 1 onward:
 [[150 130 160 140]
 [110  95  80 105]]


### Practical task: Create arrays of different shapes

In [6]:
# 1D array using arange
array1d = np.arange(12)
print("1D array:", array1d)

1D array: [ 0  1  2  3  4  5  6  7  8  9 10 11]


In [7]:
# Reshape to 2D (3×4)
array2d = array1d.reshape(3, 4)
print("2D array:\n", array2d)

2D array:
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [8]:
# 3D array example
array1d_3d = np.arange(24)
array3d = array1d_3d.reshape(2, 3, 4)  # (measurement, day, village)
print("3D array shape:", array3d.shape)

3D array shape: (2, 3, 4)


In [9]:
# Indexing: humidity (index 1) on day 2 (index 1) across all villages
humidity_day2 = array3d[1, 1, :]
print("Humidity on day 2 across villages:", humidity_day2)

Humidity on day 2 across villages: [16 17 18 19]


## **Pandas Series & DataFrames**

In [10]:
# Data Frame from dictionary

# Student data dictionary
data = {
    'StudentID': [101, 102, 103, 104],
    'MathScore': [80, 72, 90, 88],
    'ReadingScore': [85, 78, 95, 90],
    'SchoolType': ['Primary', 'Primary', 'Secondary', 'Secondary']
}

In [11]:
# Create DataFrame from dictionary
df_students = pd.DataFrame(data)
print(df_students)

   StudentID  MathScore  ReadingScore SchoolType
0        101         80            85    Primary
1        102         72            78    Primary
2        103         90            95  Secondary
3        104         88            90  Secondary


In [12]:
# Check data type of MathScore column
print("\nType of MathScore column:", type(df_students['MathScore']))


Type of MathScore column: <class 'pandas.core.series.Series'>


### Pandas Series example

In [13]:
# Pandas Series example

#Rainfall data
rain_series = pd.Series([300, 250, 400, 500], index=['Jan', 'Feb', 'Mar', 'Apr'])
print("Rainfall Series:\n", rain_series)
print("Type:", type(rain_series))

Rainfall Series:
 Jan    300
Feb    250
Mar    400
Apr    500
dtype: int64
Type: <class 'pandas.core.series.Series'>


## Loading data with pandas (CSV/Excel)

In [10]:
# Load Financial Inclusion data
df_finscope = pd.read_csv('../Data/FinScope.csv')
print(df_finscope.head())
print("Columns:", df_finscope.columns.tolist())

     SN       reg_name  reg_code  dist_code dist_name  ward_code1 ward_name  \
0  4529         Mwanza        19          7  Misungwi         251     Mondo   
1  4245         Kagera        18          7  Missenyi          11   Kakunyu   
2  8149          Mbeya        12          3     Kyela         283     Nkuyu   
3  6763         Dodoma         1          3    Kongwa         123  Kibaigwa   
4  7805  Dar es Salaam         7          2     Ilala         252    Majohe   

   ea_code clustertype        c1  ...      SOCIAL_GROUPS  OTHER_FORMAL  \
0        4       Rural  Original  ...  Not SOCIAL_GROUPS  OTHER_FORMAL   
1        1       Rural  Original  ...  Not SOCIAL_GROUPS  OTHER_FORMAL   
2      301       Urban  Original  ...  Not SOCIAL_GROUPS  OTHER_FORMAL   
3      301       Urban  Original  ...  Not SOCIAL_GROUPS  OTHER_FORMAL   
4       29       Urban  Original  ...  Not SOCIAL_GROUPS  OTHER_FORMAL   

   OVERALL_FORMAL                                        INFORMAL  \
0  OVERALL_

### Exploring and Summarizing Data

In [4]:
# Using soil_quality as example (replace path if needed)
soil_quality = pd.read_csv('../Data/soil_quality.csv')
soil_quality.head(5)

Unnamed: 0,Location,Year,pH,Nitrogen
0,Lake3,2020,7.265471,
1,Lake3,2022,7.035864,
2,Lake1,2020,7.464235,
3,Lake3,2022,,3.91404
4,Lake4,2021,7.392931,


In [5]:
# Explore soil_quality data with pandas functions

print("\nShape:", soil_quality.shape)
print("\nInfo:")
soil_quality.info()
print("\nDescribe numeric:\n", soil_quality.describe())
print("\nUnique locations:", soil_quality['Location'].unique())
print("Observations per location:\n", soil_quality['Location'].value_counts())



Shape: (50, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Location  50 non-null     object 
 1   Year      50 non-null     int64  
 2   pH        40 non-null     float64
 3   Nitrogen  40 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.7+ KB

Describe numeric:
               Year         pH   Nitrogen
count    50.000000  40.000000  40.000000
mean   2021.160000   7.055830   3.212081
std       0.765586   0.280987   0.447264
min    2020.000000   6.517519   2.514250
25%    2021.000000   6.852653   2.821561
50%    2021.000000   7.061865   3.223255
75%    2022.000000   7.312350   3.624739
max    2022.000000   7.499938   3.914040

Unique locations: ['Lake3' 'Lake1' 'Lake4' 'Lake2' 'Lake5']
Observations per location:
 Location
Lake3    13
Lake4    12
Lake1    10
Lake2     9
Lake5     6
Name: count, dtype: int64


### Feature Selection (Indexing & Slicing)

In [6]:
# Select single column → Series
ph_data = soil_quality['pH']
print(ph_data)
print("Type:", type(ph_data))

0     7.265471
1     7.035864
2     7.464235
3          NaN
4     7.392931
5          NaN
6     7.024666
7     7.137944
8     7.336847
9     6.517519
10    6.927869
11    6.729860
12    7.211337
13    7.388178
14    6.857791
15    7.461954
16         NaN
17         NaN
18    7.335059
19    7.115331
20    6.965044
21    6.837240
22         NaN
23    6.951128
24    7.194639
25    6.765773
26    6.902185
27    6.574971
28    7.126293
29         NaN
30    7.408674
31    7.006642
32    6.753939
33    7.003657
34         NaN
35         NaN
36    7.499938
37    7.229404
38         NaN
39    7.314076
40         NaN
41    6.765056
42    6.602112
43    6.936394
44    7.436273
45    6.568579
46    7.311775
47    7.087866
48    7.099857
49    6.688822
Name: pH, dtype: float64
Type: <class 'pandas.core.series.Series'>


In [7]:
# Select multiple columns → DataFrame
ph_location = soil_quality[['Location', 'pH']]
print(ph_location.head())
print("Type:", type(ph_location))

  Location        pH
0    Lake3  7.265471
1    Lake3  7.035864
2    Lake1  7.464235
3    Lake3       NaN
4    Lake4  7.392931
Type: <class 'pandas.core.frame.DataFrame'>


In [19]:
# Boolean indexing: acidic soils (pH < 7)
acidic_soils = soil_quality[soil_quality['pH'] < 7]
print("\nAcidic soils:\n", acidic_soils)


Acidic soils:
    Location  Year        pH  Nitrogen
9     Lake4  2022  6.517519  2.867883
10    Lake4  2020  6.927869  3.298869
11    Lake2  2022  6.729860  3.438091
14    Lake3  2020  6.857791  2.737204
20    Lake4  2021  6.965044  3.640594
21    Lake3  2021  6.837240  2.702412
23    Lake4  2022  6.951128  3.495090
25    Lake1  2022  6.765773  2.543385
26    Lake4  2021  6.902185  3.886312
27    Lake2  2021  6.574971  2.755632
32    Lake1  2021  6.753939  2.668627
41    Lake3  2021  6.765056  3.480762
42    Lake5  2020  6.602112       NaN
43    Lake2  2021  6.936394  3.557779
45    Lake4  2020  6.568579  3.157998
49    Lake5  2021  6.688822  2.866127


In [8]:
## .iloc examples

# Print first 5 rows (all columns)
print("\nFirst 5 rows (all columns):")
print(soil_quality.iloc[:5])


First 5 rows (all columns):
  Location  Year        pH  Nitrogen
0    Lake3  2020  7.265471       NaN
1    Lake3  2022  7.035864       NaN
2    Lake1  2020  7.464235       NaN
3    Lake3  2022       NaN   3.91404
4    Lake4  2021  7.392931       NaN


In [21]:
# Print first 5 rows, Location & pH columns
print("\nFirst 5 rows, Location & pH columns:")
print(soil_quality.iloc[:5, [0, 2]])  #


First 5 rows, Location & pH columns:
  Location        pH
0    Lake3  7.265471
1    Lake3  7.035864
2    Lake1  7.464235
3    Lake3       NaN
4    Lake4  7.392931


In [22]:
# .loc examples

# Print first 5 rows of all columns
print("\nFirst 5 rows using .loc:")
print(soil_quality.loc[:4])


First 5 rows using .loc:
  Location  Year        pH  Nitrogen
0    Lake3  2020  7.265471       NaN
1    Lake3  2022  7.035864       NaN
2    Lake1  2020  7.464235       NaN
3    Lake3  2022       NaN   3.91404
4    Lake4  2021  7.392931       NaN


In [23]:
# Print first 5 rows, Location & pH
print("\nFirst 5 rows, Location & pH using .loc:")
print(soil_quality.loc[:4, ['Location', 'pH']])


First 5 rows, Location & pH using .loc:
  Location        pH
0    Lake3  7.265471
1    Lake3  7.035864
2    Lake1  7.464235
3    Lake3       NaN
4    Lake4  7.392931


In [40]:
# Fill missing values in numerical columns with the mean
numerical_cols = soil_quality.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    mean_value = soil_quality[col].mean()
    soil_quality[col].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  soil_quality[col].fillna(mean_value, inplace=True)


### Financial Inclusion – Feature Selection & Renaming

In [11]:
# Selected columns
column_names = [
    'IncomeMain', 'e_7_n_3', 'g_2_1_1', 'e_5_2',
    'c8c', 'c9', 'c10', 'c11', 'RU', 'c27__2',
    'c25__1', 'comm3_1'
]


In [12]:
# Filter the FinScop data to keep only selected columns
df_selected = df_finscope[column_names]
print(df_selected.head())

                   IncomeMain e_7_n_3 g_2_1_1  e_5_2  c8c      c9  \
0         Farmers and fishers      No     Yes  False   47  Female   
1         Farmers and fishers     Yes      No  False   63  Female   
2     Piece work/casual labor     Yes     Yes   True   74    Male   
3  Traders - non-agricultural     Yes     Yes  False   29  Female   
4         Farmers and fishers     Yes     Yes  False   53    Male   

                       c10                  c11             RU c27__2 c25__1  \
0  Married/living together         Some primary          Rural     No     No   
1                  Widowed  No formal education          Rural     No          
2                  Widowed         Some primary    Other urban    Yes     No   
3       Divorced/separated         Some primary    Other urban     No     No   
4  Married/living together    Primary completed  Dar es Salaam    Yes     No   

  comm3_1  
0          
1          
2          
3          
4          


In [13]:
# Rename columns the column with more descriptive names

df_renamed = df_selected.rename(columns={
    'IncomeMain': 'Income_source',
    'e_7_n_3': 'Savings_habits',
    'g_2_1_1': 'Borrowing_status',
    'e_5_2': 'Financial_education',
    'c8c': 'Age',
    'c9': 'Gender',
    'c10': 'Marital_status',
    'c11': 'Educational_level',
    'RU': 'Geographic_location',
    'c27__2': 'NIDA_number',
    'c25__1': 'Smartphone_ownership',
    'comm3_1': 'has_bank_account'
})


In [14]:
# Check the renamed data
print("\nAfter renaming:")
print(df_renamed.head())


After renaming:
                Income_source Savings_habits Borrowing_status  \
0         Farmers and fishers             No              Yes   
1         Farmers and fishers            Yes               No   
2     Piece work/casual labor            Yes              Yes   
3  Traders - non-agricultural            Yes              Yes   
4         Farmers and fishers            Yes              Yes   

   Financial_education  Age  Gender           Marital_status  \
0                False   47  Female  Married/living together   
1                False   63  Female                  Widowed   
2                 True   74    Male                  Widowed   
3                False   29  Female       Divorced/separated   
4                False   53    Male  Married/living together   

     Educational_level Geographic_location NIDA_number Smartphone_ownership  \
0         Some primary               Rural          No                   No   
1  No formal education               Rural       

In [28]:
# Explore renamed data
print("\nShape:", df_renamed.shape)
print("\nInfo:")
df_renamed.info()
print("\nDescribe all:")
print(df_renamed.describe(include='all'))


Shape: (9915, 12)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9915 entries, 0 to 9914
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Income_source         9915 non-null   object
 1   Savings_habits        9915 non-null   object
 2   Borrowing_status      9915 non-null   object
 3   Financial_education   9915 non-null   bool  
 4   Age                   9915 non-null   int64 
 5   Gender                9915 non-null   object
 6   Marital_status        9915 non-null   object
 7   Educational_level     9915 non-null   object
 8   Geographic_location   9915 non-null   object
 9   NIDA_number           9915 non-null   object
 10  Smartphone_ownership  9915 non-null   object
 11  has_bank_account      9915 non-null   object
dtypes: bool(1), int64(1), object(10)
memory usage: 861.9+ KB

Describe all:
              Income_source Savings_habits Borrowing_status  \
count                  9915

### Detecting Missing Data

In [15]:
# Check for missing values
print("Missing values before cleaning:")
print(df_renamed.isna().sum())

Missing values before cleaning:
Income_source           0
Savings_habits          0
Borrowing_status        0
Financial_education     0
Age                     0
Gender                  0
Marital_status          0
Educational_level       0
Geographic_location     0
NIDA_number             0
Smartphone_ownership    0
has_bank_account        0
dtype: int64


In [16]:
print("\nValue counts for target (note empty strings):")
print(df_renamed['has_bank_account'].value_counts(dropna=False).head())



Value counts for target (note empty strings):
has_bank_account
                            8559
One account                 1112
Two or three accounts        234
More than three accounts      10
Name: count, dtype: int64


- Remind the learner about function
- Link with the below example to remind the about importance of function in Data task

### Text Cleaning Function

In [17]:
# Task automation with function
# White space removal, special character removal, and case normalization

def clean_text_columns(df, columns=None, case="lower"):
    if columns is None:
        columns = df.select_dtypes(include=['object', 'string']).columns
    
    for col in columns:
        series = df[col].astype(str).str.strip()
        series = series.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        
        if case == "lower":
            series = series.str.lower()
        elif case == "upper":
            series = series.str.upper()
        elif case == "title":
            series = series.str.title()
            
        df[col] = series.replace({'': np.nan, 'nan': np.nan})
    
    return df

In [None]:
# Apply the function to all text columns
df_cleaned_all = clean_text_columns(df_renamed.copy(), case="lower")
cleaned_soildata = clean_text_columns(soil_quality.copy(), case="lower")


In [41]:
# Fill missing values in categorical columns with the mode
# FinScope data
categorical_cols_F = df_cleaned_all.select_dtypes(include=['object', 'string']).columns
for col in categorical_cols_F:
    mode_value = df_cleaned_all[col].mode()[0]
    df_cleaned_all[col].fillna(mode_value, inplace=True)

# Soil quality data
categorical_cols_S = cleaned_soildata.select_dtypes(include=['object', 'string']).columns
for col in categorical_cols_S:
    mode_value = cleaned_soildata[col].mode()[0]
    cleaned_soildata[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned_all[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_soildata[col].fillna(mode_value, inplace=True)


In [42]:
# Fill missing values in numerical columns with the mean
numerical_cols_F = df_cleaned_all.select_dtypes(include=[np.number]).columns
for col in numerical_cols_F:
    mean_value = df_cleaned_all[col].mean()
    df_cleaned_all[col].fillna(mean_value, inplace=True)

numerical_cols_S = cleaned_soildata.select_dtypes(include=[np.number]).columns
for col in numerical_cols_S:
    mean_value = cleaned_soildata[col].mean()
    cleaned_soildata[col].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned_all[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_soildata[col].fillna(mean_value, inplace=True)


In [43]:


# Save cleaned data to new CSV
cleaned_soildata.to_csv('../Data/soil_quality_cleaned.csv', index=False)
df_cleaned_all.to_csv('../Data/FinScope_cleaned.csv', index=False)

In [33]:
print("Missing values after cleaning:")
print(df_cleaned_all.isna().sum())

Missing values after cleaning:
Income_source              0
Savings_habits             0
Borrowing_status           0
Financial_education        0
Age                        0
Gender                     0
Marital_status             0
Educational_level          0
Geographic_location        0
NIDA_number                0
Smartphone_ownership    2456
has_bank_account        8559
dtype: int64


In [34]:
# Chech the value counts of columns with mising values

# value counts for Smartphone_ownership
print("\nSmartphone_ownership value counts:")
print(df_cleaned_all['Smartphone_ownership'].value_counts(dropna=False))



Smartphone_ownership value counts:
Smartphone_ownership
no     5839
NaN    2456
yes    1620
Name: count, dtype: int64


In [35]:
# value counts for has_bank_account
print("\nhas_bank_account value counts:")
print(df_cleaned_all['has_bank_account'].value_counts(dropna=False))



has_bank_account value counts:
has_bank_account
NaN                         8559
one account                 1112
two or three accounts        234
more than three accounts      10
Name: count, dtype: int64


### Handling Missing Values

In [36]:
# Create a copy of cleaned DataFrame
# This is useful for further processing without altering the original cleaned data
df_cleaned_all_copy = df_cleaned_all.copy()

In [37]:
# Fill Smartphone_ownership with mode
smartphone_mode = df_cleaned_all_copy['Smartphone_ownership'].mode()[0]
df_cleaned_all_copy['Smartphone_ownership'] = df_cleaned_all_copy['Smartphone_ownership'].fillna(smartphone_mode)

In [38]:
# Fill has_bank_account missing with 'no account'
df_cleaned_all_copy['has_bank_account'] = df_cleaned_all_copy['has_bank_account'].fillna('no account')


In [39]:
# Check missing values after filling
print("After filling missing values:")
print(df_cleaned_all_copy.isna().sum())

After filling missing values:
Income_source           0
Savings_habits          0
Borrowing_status        0
Financial_education     0
Age                     0
Gender                  0
Marital_status          0
Educational_level       0
Geographic_location     0
NIDA_number             0
Smartphone_ownership    0
has_bank_account        0
dtype: int64


### Removing Duplicates

In [40]:
# Check for duplicates
print("Number of duplicate rows:", df_cleaned_all_copy.duplicated().sum())

Number of duplicate rows: 1494


In [41]:
# Drop duplicates
df_final = df_cleaned_all_copy.drop_duplicates().reset_index(drop=True)
print("After removing duplicates:", df_final.duplicated().sum())
print("Final shape:", df_final.shape)

After removing duplicates: 0
Final shape: (8421, 12)


In [42]:
# Summary
print(df_final.describe(include='all'))

              Income_source Savings_habits Borrowing_status  \
count                  8421           8421             8421   
unique                   14              2                2   
top     farmers and fishers            yes               no   
freq                   2574           4605             4969   
mean                    NaN            NaN              NaN   
std                     NaN            NaN              NaN   
min                     NaN            NaN              NaN   
25%                     NaN            NaN              NaN   
50%                     NaN            NaN              NaN   
75%                     NaN            NaN              NaN   
max                     NaN            NaN              NaN   

       Financial_education          Age  Gender          Marital_status  \
count                 8421  8421.000000    8421                    8421   
unique                   2          NaN       2                       4   
top               

### Final Categorical Mapping (has_bank_account) - Target feature

In [43]:
# Final Categorical Mapping (has_bank_account)
print("Unique values before mapping:", df_final['has_bank_account'].unique())


Unique values before mapping: ['no account' 'one account' 'two or three accounts'
 'more than three accounts']


In [44]:
# Map multiple categories to 'yes' or 'no'
df_final['has_bank_account'] = df_final['has_bank_account'].replace({
    'no account': 'no',
    'one account': 'yes',
    'two or three accounts': 'yes',
    'more than three accounts': 'yes'
})

In [45]:
# Check mapping results
print("After mapping:")
print(df_final['has_bank_account'].value_counts())
print("Unique values:", df_final['has_bank_account'].unique())


After mapping:
has_bank_account
no     7082
yes    1339
Name: count, dtype: int64
Unique values: ['no' 'yes']


## **Capstone Project: Adult Census Income Dataset**

### Part 1: Load & Explore

In [20]:
# Columns for Adult Census Income Dataset
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

In [21]:
# Load Adult data
df_adult = pd.read_csv('../Data/adult.csv', header=None, names=columns, na_values=' ?', skipinitialspace=True)

# BONUS: You can also Load Adult Census Income Dataset from UCI repository
# df_adult = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, names=columns, na_values=' ?', skipinitialspace=True)

In [22]:
# Show shape and head of the dataset
print("Shape:", df_adult.shape)
df_adult.head(10)

Shape: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [23]:
# Summary info
print("\nInfo:")
df_adult.info()


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [24]:
# Summary statistics
print("\nDescribe all:")
print(df_adult.describe(include="all"))


Describe all:
                 age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

            marital_status      occupation relationship   race    sex  \
count                32561         

### Part 2: Investigate Data Quality

In [25]:
# Check for missing values
print("Missing values:")
print(df_adult.isna().sum())

Missing values:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [26]:
# Explore unique values in categorical columns with missing data
print("\nUnique workclass values:", df_adult['workclass'].unique())
print("Unique occupation values:", df_adult['occupation'].unique())
print("Unique native_country values:", df_adult['native_country'].unique())



Unique workclass values: ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
Unique occupation values: ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
Unique native_country values: ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']


In [27]:
# Check for duplicates
print("\nDuplicate rows:", df_adult.duplicated().sum())


Duplicate rows: 24


### Part 3: Handle Missing Values

In [28]:
# Handle Missing Values
# Replace '?' with NaN
df_adult.replace('?', np.nan, inplace=True)
print("Missing % per column:")
print((df_adult.isna().mean() * 100).round(2))

Missing % per column:
age               0.00
workclass         5.64
fnlwgt            0.00
education         0.00
education_num     0.00
marital_status    0.00
occupation        5.66
relationship      0.00
race              0.00
sex               0.00
capital_gain      0.00
capital_loss      0.00
hours_per_week    0.00
native_country    1.79
income            0.00
dtype: float64


In [29]:
# Fill categorical missing with "Unknown", drop rows only if necessary
df_adult['workclass'].fillna('Unknown', inplace=True)
df_adult['occupation'].fillna('Unknown', inplace=True)
df_adult['native_country'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_adult['workclass'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_adult['occupation'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are

In [30]:
# Check missing values after filling
print("\nAfter filling:")
print(df_adult.isna().sum())


After filling:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


### Part 4: Clean Categorical Data

In [31]:
# Clean Categorical Data
cat_cols = ['workclass', 'education', 'marital_status', 'occupation',
            'relationship', 'race', 'sex', 'native_country', 'income']
for col in cat_cols:
    df_adult[col] = df_adult[col].str.strip().str.lower()

In [32]:
# Check unique income values after cleaning
print("Unique income values after cleaning:", df_adult['income'].unique())


Unique income values after cleaning: ['<=50k' '>50k']


In [33]:
# Map income to binary (0 and 1) for modeling
df_adult['income'] = df_adult['income'].map({'<=50k': 0, '>50k': 1})
df_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,0
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,0
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,0
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,0
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,0


### Part 5: Handle Duplicates

In [34]:
# Check for duplicates
print("Duplicates before:", df_adult.duplicated().sum())

# Drop duplicates
df_adult.drop_duplicates(inplace=True)

# Check for duplicates again
print("Duplicates after:", df_adult.duplicated().sum())
print("Final shape:", df_adult.shape)

Duplicates before: 24
Duplicates after: 0
Final shape: (32537, 15)


### Part 6: Feature Selection

In [35]:
# Drop fnlwgt (sampling weight, not predictive)
df_adult.drop('fnlwgt', axis=1, inplace=True)

In [36]:
# Optional: drop education (education_num is numeric version)
df_adult.drop('education', axis=1, inplace=True)


In [37]:
# Final columns after feature selection
print("Final columns:", df_adult.columns.tolist())
df_adult.head()

Final columns: ['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']


Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,state-gov,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,0
1,50,self-emp-not-inc,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,0
2,38,private,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,0
3,53,private,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,0
4,28,private,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,0


In [38]:
# Save cleaned Adult Census Income data
df_adult.to_csv('../Data/Adult_cleaned.csv', index=False)

---
<p align="center">
  <img src="../Images/datasafari-logo-primary.png" width="300">
</p>